In [19]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Load the dataset CSV (expected in the notebook's working directory),
# report its (rows, columns) shape, then preview the first five rows.
csv_path = '2020-12-31-DynamicallyGeneratedHateDataset-entries-v0.1.csv'
df = pd.read_csv(csv_path)
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head()

(40623, 11)


Unnamed: 0.1,Unnamed: 0,id,text,label,type,model_wrong,db.model_preds,status,round,split,annotator
0,1,10000,It should be legal for a husband to rape his wife,hate,notgiven,True,0.97457,dynabench entry,1,train,gnzuctwed3
1,2,10001,dalits are lowlife scum,hate,notgiven,False,0.08233,dynabench entry,1,test,cagnluiznm
2,3,10002,dalits are lowlives,hate,notgiven,True,0.92319,dynabench entry,1,train,cagnluiznm
3,4,10003,It was a better world when women dare not ques...,hate,notgiven,True,0.99006,dynabench entry,1,test,gnzuctwed3
4,5,10004,I don't work this hard so that those immigrant...,hate,notgiven,True,0.98836,dynabench entry,1,train,cagnluiznm


In [4]:
# Discard every row that has a missing value in at least one column
# (axis=0 / how='any' are the pandas defaults, spelled out for the reader).
df = df.dropna(axis=0, how='any')

In [5]:
# Boolean masks for the two situations the author counts as
# "hate speech detected". They are combined in the next cell.
label_col = df['label']
model_wrong_col = df['model_wrong']

# scenario 1: label == hate AND model_wrong == False
cond1a = label_col == 'hate'
cond1b = model_wrong_col == False

# scenario 2: label == nothate AND model_wrong == True
cond2a = label_col == 'nothate'
cond2b = model_wrong_col == True

In [6]:
# Collapse `label` and `model_wrong` into one boolean column, 'AL_detected'.
# A row is True when either
#   label == 'hate'    AND model_wrong == False, or
#   label == 'nothate' AND model_wrong == True
# and False in every other case.
scenario_1 = cond1a & cond1b
scenario_2 = cond2a & cond2b

df['AL_detected'] = np.where(scenario_1 | scenario_2, True, False)

In [8]:
# Remove the columns that are not needed once 'AL_detected' has been derived.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
# (The former bare `df.columns` line was removed: it was not the last
# expression of the cell, so it displayed nothing.)
df = df.drop(
    columns=['Unnamed: 0', 'id', 'type', 'model_wrong', 'db.model_preds',
             'status', 'round', 'split', 'annotator', 'label'],
    errors='ignore',
)

In [9]:
# lowercase all words:
def make_lower(a_string: str) -> str:
    """Return a lower-cased copy of ``a_string``.

    Only letter case changes; punctuation, digits and whitespace are
    returned exactly as they were passed in.
    """
    lowered = a_string.lower()
    return lowered

In [12]:
# Text normalisation is deliberately minimal: lower-case each entry with
# make_lower, keep punctuation, and do not stem.
df['text_clean'] = df['text'].map(make_lower)

In [14]:
# Modelling inputs as plain arrays:
#   y - the boolean 'AL_detected' flag (target)
#   X - the lower-cased text (features, still raw strings at this point)

y = df['AL_detected'].values

X = df['text_clean'].values

In [15]:
# using n_grams

vectorizer_ng = TfidfVectorizer(ngram_range=(1, 2))
X = vectorizer_ng.fit_transform(X)

In [16]:
from sklearn import svm

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42)

model = svm.SVC()

model.fit(X_train, y_train)

SVC()

In [21]:
# manual prediction: pick one hand-written example and run it through the
# same lower-casing step that was applied to the training text.

sample_text = "Truly weird Senator Rand Paul of Kentucky reminds me of a spoiled brat without a properly functioning brain. He was terrible at DEBATE!"
sample_text_clean = make_lower(sample_text)

# Show the text before and after cleaning, each followed by a blank line.
for caption, shown in (('Original:', sample_text), ('Cleaned: ', sample_text_clean)):
    print(caption, shown, '\n')

Original: Truly weird Senator Rand Paul of Kentucky reminds me of a spoiled brat without a properly functioning brain. He was terrible at DEBATE! 

Cleaned:  truly weird senator rand paul of kentucky reminds me of a spoiled brat without a properly functioning brain. he was terrible at debate! 



In [23]:
# Vectorise the cleaned sample with the already-fitted TF-IDF vectorizer
# (transform expects an iterable of documents, hence the one-element list),
# then ask the trained model for its prediction.
sample_batch = [sample_text_clean]
X_new = vectorizer_ng.transform(sample_batch)
y_pred_new = model.predict(X_new)

# Show the predicted label for the sample
print('Predicted Category:', y_pred_new)

Predicted Category: [False]


In [20]:
# NAME YOUR MODEL
filename = 'svm_model.pkl'
# The SVC was trained on TF-IDF features, so it can only score new text
# together with the fitted vectorizer; save that alongside the model.
vectorizer_filename = 'tfidf_vectorizer.pkl'

# EXPORT AND SAVE YOUR MODEL USING YOUR FILENAME
# `with` guarantees each file is flushed and closed, even if dump() raises.
# NOTE: only unpickle these files from a trusted source - pickle.load can
# execute arbitrary code.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

with open(vectorizer_filename, 'wb') as vectorizer_file:
    pickle.dump(vectorizer_ng, vectorizer_file)