In [48]:
import pandas as pd
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn import metrics

In [49]:
df = pd.read_csv('olid-training-v1.0.tsv', sep='\t')

In [50]:
df.head()

Unnamed: 0,id,tweet,subtask_a
0,86426,@USER She should ask a few native Americans wh...,OFF
1,90194,@USER @USER Go home you‚Äôre drunk!!! @USER #M...,OFF
2,16820,Amazon is investigating Chinese employees who ...,NOT
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT


In [51]:
df.isnull().sum()

 id          0
tweet        0
subtask_a    0
dtype: int64

In [52]:
df['subtask_a'].unique()

array(['OFF', 'NOT'], dtype=object)

In [53]:
df['subtask_a'].value_counts()

NOT    8841
OFF    4640
Name: subtask_a, dtype: int64

In [54]:
x_train = df['tweet']
y_train = df['subtask_a']

In [55]:
print(x_train.shape)
print(y_train.shape)

(13481,)
(13481,)


In [56]:
x = pd.read_csv('testset-levela.tsv', sep='\t')
x_test = x['tweet']

In [57]:
y = pd.read_csv('labels-levela.csv', sep='\t')
y_test = y['OFF']

In [58]:
print(x_test.shape)
print(y_test.shape)

(860,)
(860,)


In [59]:
text_clf = Pipeline([('n-gram', TfidfVectorizer(ngram_range=(1,2))), 
                     ('clf', LinearSVC())])

text_clf.fit(x_train, y_train)

Pipeline(steps=[('n-gram', TfidfVectorizer(ngram_range=(1, 2))),
                ('clf', LinearSVC())])

In [60]:
y_pred = text_clf.predict(x_test)

In [61]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         NOT       1.00      0.89      0.94       620
         OFF       0.77      1.00      0.87       240

    accuracy                           0.92       860
   macro avg       0.89      0.94      0.91       860
weighted avg       0.94      0.92      0.92       860



In [62]:
print(metrics.accuracy_score(y_test, y_pred))

0.9174418604651163


In [63]:
offensive = ["This is rude.", "This message is offensive.", "This one is not OK.", "This message is not polite."]

neutral = ["This seems fine.", "This message is not offensive.", "This one is OK.", "This message pass"]

In [64]:
!pip install pysimplegui



In [65]:
import PySimpleGUI as sg
from PIL import Image

In [88]:
sg.theme('BlueMono')     
  
layout = [
    [sg.Text('Hi there!')],
    [sg.Text('My name is Polite!')],
    [sg.Text('And I am here to help you identify an offensive text.')],
    [sg.Image('polite.png')],
    [sg.Text('Please enter a message to check:')],
    [sg.Text(size =(20, 1)), sg.InputText()],
    [sg.Button("Check!"), sg.Cancel()]
]

window = sg.Window('Polite - The Offensive Language Checker', layout, margins=(20, 20), )
event, values = window.read()
window.close()
  
res = text_clf.predict(pd.Series(values))

if res == "OFF":
    sg.Window(title="Polite - The Offensive Language Checker", layout=[
    [sg.Text(random.choice(offensive)+'\n')]
    ], margins=(50, 50)).read()
else:
    sg.Window(title="Polite - The Offensive Language Checker", layout=[
    [sg.Text(random.choice(neutral)+'\n')]
    ], margins=(50, 50)).read()