In [109]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import svm
import string
import re
import nltk

In [107]:
df=pd.read_csv("data/labeled_data.csv")
df.drop(df.columns[[0]], axis=1,inplace=True)
print(df.iloc[:5,:])
print(df.shape)

   count  hate_speech  offensive_language  neither  class  \
0      3            0                   0        3      2   
1      3            0                   3        0      1   
2      3            0                   3        0      1   
3      3            0                   2        1      1   
4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  
(24783, 6)


In [112]:
stopwords = nltk.corpus.stopwords.words("english")
txt=df.iloc[:,5]
tweet=txt.copy(deep=True)
#tweet=pd.DataFrame(tweet)
#transform to lower case
tweet=tweet.str.lower()
#remove punctuation
remove = str.maketrans('','',string.punctuation) 
tweet = tweet.str.translate(remove)
#word tokenize and remove stopwords
#remove digit and excessive whitespace
for i in range(tweet.shape[0]):
    text=tweet[i]
    text1=''.join([ch+" " for ch in text.split() if ch not in ' 123456789'])
    text2=''.join([word+" " for word in text1.split() if word not in stopwords])
    tweet[i]=text2
info=re.compile('[0-9|rt]')
tweet_liwc=tweet.apply(lambda x: info.sub('',x))
print(tweet[:5])

0    rt mayasolovely woman shouldnt complain cleani...
1    rt mleew17 boy dats coldtyga dwn bad cuffin da...
2    rt urkindofbrand dawg rt 80sbaby4life ever fuc...
3            rt cganderson vivabased look like tranny 
4    rt shenikaroberts shit hear might true might f...
Name: tweet, dtype: object


In [113]:
#LIWC
#run LIWC on the whole df
from liwc import LIWC
LIWC_list=[]
for i in range(len(tweet)):
    L = LIWC()
    length, matched, labels = L.classify(tweet_liwc[i].split())
    LIWC_list.append(L.readable(labels))
LIWC_list
#update preprocessed text to original df
df.tweet=tweet_liwc
hateSpeech=df[df['hate_speech'] ==3]["tweet"].reset_index(drop=True)
#it is the list of hate speech
LIWC_list1=[]
for i in range(len(hateSpeech)):
    L = LIWC()
    length, matched, labels = L.classify(hateSpeech[i].split())
    LIWC_list1.append(L.readable(labels))
LIWC_list1
#function that add dictionary
def mergeDict(dict1, dict2):
   ''' Merge dictionaries and keep values of common keys in list'''
   dict3 = {**dict1, **dict2}
   for key, value in dict3.items():
       if key in dict1 and key in dict2:
               dict3[key] = value +dict1[key]
   return dict3
#add those dic together
hate_dic=LIWC_list1[0]
for i in range(len(LIWC_list1)):
    hate_dic=mergeDict(hate_dic, LIWC_list1[i])
hate_dic.keys()
#find hate_relevent key by analyzing hate speech
relevent_key=[k for k, v in hate_dic.items() if v > 50]
#build dataframe with relevent columns
LIWC_df=pd.DataFrame(data=LIWC_list, index=None, columns=None, dtype=None, copy=False)[relevent_key]
LIWC_df.shape

(24783, 17)

In [89]:
from sklearn.metrics import f1_score, precision_recall_fscore_support, classification_report

def evaluation(real_labels, pred_labels):
    f1_micro = f1_score(real_labels, pred_labels, average='micro')
    f1_macro = f1_score(real_labels, pred_labels, average='macro')
    f1_weighted = f1_score(real_labels, pred_labels, average='weighted')
    #f1_binary = f1_score(real_labels, pred_labels, average='binary')
    #f1_samples = f1_score(real_labels, pred_labels, average='samples')

    micro_p, micro_r, micro_f1, _ = precision_recall_fscore_support(real_labels, pred_labels, average='micro')
    macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(real_labels, pred_labels, average='macro')
    

    report = classification_report(real_labels, pred_labels)

    print('f1 micro: ',f1_micro)
    print('f1 macro: ',f1_macro)
    print('f1 weighted: ',f1_weighted)
    #print('f1 binary: ',f1_binary)
    #print('f1 samples: ',f1_samples)
    print('micro p, micro r, micro f1:', micro_p, micro_r, micro_f1)
    print('macro p, macro r, macro f1:', macro_p, macro_r, macro_f1)
    print(report)

## Load Data

In [31]:
X_ski = pd.read_csv('data/word2vec_skip2.csv',header=None)
X_lda = pd.read_csv('data/lda_infer.csv',header=None)
y = pd.read_csv('data/labeled_data.csv')['class']

In [139]:
temp = X_lda.copy()

In [140]:
temp = temp / temp.sum(axis=1)

In [144]:
temp = temp.iloc[:,0:3]

## Split Data 

In [40]:
X_all = pd.concat([X_ski, X_lda], axis=1)

In [41]:
X_ski_train, X_ski_test, y_train, y_test = train_test_split(X_ski, y, random_state=11, test_size=0.1)

In [42]:
X_all_train, X_all_test, y_train, y_test = train_test_split(X_all, y, random_state=11, test_size=0.1)

### Correlation

In [29]:
clf = LogisticRegression().fit(X_train, y_train)
y_hat = clf.predict(X_test)
accuracy = clf.score(X_test, y_test)
accuracy

0.7882210568777733

In [30]:
clf = svm.SVC().fit(X_train, y_train)
y_hat = clf.predict(X_test)
accuracy = clf.score(X_test, y_test)
accuracy

0.7894312222670432

# SVM with different features

Feature selection process based on Logistic Regression with L1-regularization as the estimator on the training data.

In [45]:
model = svm.LinearSVC(class_weight='balanced',C=0.01, penalty='l2', loss='squared_hinge',multi_class='ovr').fit(X_ski_train, y_train)

In [46]:
y_preds_temp = model.predict(X_ski_train)

In [47]:
report = classification_report( y_train, y_preds_temp )

In [48]:
print(report)

              precision    recall  f1-score   support

           0       0.27      0.10      0.15      1283
           1       0.93      0.90      0.91     17233
           2       0.63      0.85      0.72      3788

    accuracy                           0.84     22304
   macro avg       0.61      0.62      0.60     22304
weighted avg       0.84      0.84      0.84     22304



In [49]:
y_preds = model.predict(X_ski_test)
report = classification_report( y_test, y_preds)

In [50]:
print(report)

              precision    recall  f1-score   support

           0       0.28      0.11      0.16       147
           1       0.93      0.89      0.91      1957
           2       0.57      0.85      0.68       375

    accuracy                           0.83      2479
   macro avg       0.59      0.62      0.58      2479
weighted avg       0.84      0.83      0.83      2479



Implement the feature slection for ski-gram

Feature selection process based on Logistic Regression with L1-regularization as the estimator on the training data.

In [91]:
select = SelectFromModel(LogisticRegression(class_weight='balanced',penalty="l1",C=0.01))
X_ski_ = select.fit_transform(X_ski,y)

In [92]:
X_ski_train_, X_ski_test_, y_train, y_test = train_test_split(X_ski_, y, random_state=11, test_size=0.1)

In [93]:
model = svm.LinearSVC(class_weight='balanced',C=0.01, penalty='l2', loss='squared_hinge',multi_class='ovr').fit(X_ski_train_, y_train)

In [94]:
y_preds = model.predict(X_ski_test_)
report = classification_report( y_test, y_preds)
print(report)

              precision    recall  f1-score   support

           0       0.28      0.10      0.14       147
           1       0.93      0.89      0.91      1957
           2       0.57      0.85      0.68       375

    accuracy                           0.84      2479
   macro avg       0.59      0.61      0.58      2479
weighted avg       0.84      0.84      0.83      2479



In [95]:
evaluation(y_preds, y_test)

f1 micro:  0.8350141185962081
f1 macro:  0.576629933999476
f1 weighted:  0.8413336163243282
micro p, micro r, micro f1: 0.8350141185962081 0.8350141185962081 0.8350141185962081
macro p, macro r, macro f1: 0.6104440388998386 0.5922471648110625 0.576629933999476
              precision    recall  f1-score   support

           0       0.10      0.28      0.14        50
           1       0.89      0.93      0.91      1867
           2       0.85      0.57      0.68       562

    accuracy                           0.84      2479
   macro avg       0.61      0.59      0.58      2479
weighted avg       0.86      0.84      0.84      2479



Using the whole Dataset to implement SVM

In [98]:
X_all_train, X_all_test, y_train, y_test = train_test_split(X_all, y, random_state=11, test_size=0.1)

In [99]:
model = svm.LinearSVC(class_weight='balanced',C=0.01, penalty='l2', loss='squared_hinge',multi_class='ovr').fit(X_all_train, y_train)

Should I use balanced?

In [100]:
y_preds = model.predict(X_all_test)
report = classification_report( y_test, y_preds,labels=[0, 1, 2])
print(report)

              precision    recall  f1-score   support

           0       0.30      0.14      0.19       147
           1       0.94      0.89      0.91      1957
           2       0.57      0.86      0.69       375

    accuracy                           0.84      2479
   macro avg       0.60      0.63      0.60      2479
weighted avg       0.85      0.84      0.84      2479



In [101]:
evaluation(y_preds, y_test)

f1 micro:  0.8386446147640177
f1 macro:  0.5963232281871792
f1 weighted:  0.8418861289694988
micro p, micro r, micro f1: 0.8386446147640177 0.8386446147640177 0.8386446147640177
macro p, macro r, macro f1: 0.6288718282066701 0.6039427872152546 0.5963232281871792
              precision    recall  f1-score   support

           0       0.14      0.30      0.19        67
           1       0.89      0.94      0.91      1848
           2       0.86      0.57      0.69       564

    accuracy                           0.84      2479
   macro avg       0.63      0.60      0.60      2479
weighted avg       0.86      0.84      0.84      2479



In [147]:
X_temp = pd.concat([X_ski, temp], axis=1)

In [None]:
X_temp_ = select.fit_transform(X_temp,y)

In [154]:
X_temp_.shape

(24783, 47)

In [151]:
X_temp_train, X_temp_test, y_train, y_test = train_test_split(X_temp_, y, random_state=11, test_size=0.1)

In [152]:
model = svm.LinearSVC(class_weight='balanced',C=0.01, penalty='l2', loss='squared_hinge',multi_class='ovr').fit(X_temp_train, y_train)

In [155]:
y_preds = model.predict(X_temp_test)
report = classification_report( y_test, y_preds)
print(report)

              precision    recall  f1-score   support

           0       0.30      0.10      0.15       147
           1       0.93      0.89      0.91      1957
           2       0.57      0.85      0.68       375

    accuracy                           0.84      2479
   macro avg       0.60      0.61      0.58      2479
weighted avg       0.84      0.84      0.83      2479



In [156]:
evaluation(y_preds, y_test)

f1 micro:  0.836224283985478
f1 macro:  0.5803120595868753
f1 weighted:  0.8424709039790532
micro p, micro r, micro f1: 0.836224283985478 0.836224283985478 0.836224283985478
macro p, macro r, macro f1: 0.6123337099104673 0.5993667071642448 0.5803120595868753
              precision    recall  f1-score   support

           0       0.10      0.30      0.15        50
           1       0.89      0.93      0.91      1870
           2       0.85      0.57      0.68       559

    accuracy                           0.84      2479
   macro avg       0.61      0.60      0.58      2479
weighted avg       0.86      0.84      0.84      2479



## Logistic Regression

### All Features

In [5]:
# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)

In [6]:
y_preds = grid_result.predict(X_test)

In [7]:
report = classification_report(y_test, y_preds)
print(report)

              precision    recall  f1-score   support

           0       0.83      0.03      0.07       147
           1       0.80      0.97      0.88      1957
           2       0.36      0.09      0.15       375

    accuracy                           0.78      2479
   macro avg       0.66      0.37      0.36      2479
weighted avg       0.74      0.78      0.72      2479



### Correlation

In [25]:
# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train_f, y_train)

In [26]:
y_preds = grid_result.predict(X_test_f)
report = classification_report( y_test, y_preds )
print(report)

              precision    recall  f1-score   support

           0       0.80      0.03      0.05       147
           1       0.80      0.98      0.88      1957
           2       0.36      0.09      0.14       375

    accuracy                           0.79      2479
   macro avg       0.65      0.36      0.36      2479
weighted avg       0.74      0.79      0.72      2479



## SVM

In [27]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']}  
# define models and parameters
model = svm.SVC()

# define grid search
#grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
y_preds = grid_result.predict(X_test_f)
report = classification_report( y_test, y_preds )
print(report)

KeyboardInterrupt: 


## Imbalanced: Class Weight/resample

In [6]:
weight = []
for i in np.unique(y):
    weight.append(len(y[y==i])/len(y))

In [35]:
weights = {0:1/weight[0], 1:1/weight[1], 2:1/weight[2]}

{0: 17.33076923076923, 1: 1.2914538822303283, 2: 5.9531587797261585}

In [23]:
# Divide by class
class_0 = X_train[y_train == 0]
class_1 = X_train[y_train == 1]
class_2 = X_train[y_train == 2]

class_1_over = class_1.sample(len(y_train[y_train==0]),replace=True)
class_2_over = class_2.sample(len(y_train[y_train==0]),replace=True)
#class_0_over.shape, class_1.shape, class_2_over.shape
X_over = pd.concat([class_0, class_1_over, class_2_over], axis=0)
class_0.shape

q = np.array([0,1,2])
y_over = np.repeat(q, 1283)

In [24]:
clf = LogisticRegression().fit(X_over, y_over)
y_hat = clf.predict(X_test)
accuracy = clf.score(X_test, y_test)
accuracy

0.5292456635740218

In [18]:
x = np.array([0,1,2])
y_over = np.repeat(x, 17233)

array([0, 0, 0, ..., 2, 2, 2])