In [1]:
%load_ext autoreload
%autoreload 2
from os import path

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm


In [2]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, precision_score, roc_auc_score, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold

In [10]:
def pandas_classification_report(y_true, y_pred):
    """Build a per-class classification report as a pandas DataFrame.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_true``.

    Returns
    -------
    pandas.DataFrame
        One row per class label plus a final ``'avg / total'`` row, with the
        columns ``precision``, ``recall``, ``f1-score``, ``support`` and
        ``accuracy``.  For a class row, ``accuracy`` is the fraction of that
        class's true samples that were predicted correctly (diagonal of the
        row-normalised confusion matrix).  In the ``'avg / total'`` row the
        precision/recall/f1 entries are macro averages, ``support`` is the
        total number of samples and ``accuracy`` is the overall accuracy.
    """
    # Sorted union of the labels seen in either vector.  Passing it explicitly
    # keeps both scikit-learn calls aligned and lets the report rows carry the
    # real class labels instead of positional indices.
    labels = np.unique(np.concatenate([np.ravel(y_true), np.ravel(y_pred)]))

    precision, recall, f1, support = precision_recall_fscore_support(
        y_true=y_true,
        y_pred=y_pred,
        labels=labels)

    cm = confusion_matrix(y_true, y_pred, labels=labels).astype('float')
    row_totals = cm.sum(axis=1)
    # A label that occurs only in y_pred has no true samples (row total 0);
    # report 0 for it rather than the NaN a plain 0/0 division would produce.
    per_class_accuracy = np.divide(
        cm.diagonal(),
        row_totals,
        out=np.zeros_like(row_totals),
        where=row_totals > 0)

    class_report_df = pd.DataFrame(
        [precision, recall, f1, support, per_class_accuracy],
        index=['precision', 'recall', 'f1-score', 'support', 'accuracy'],
        columns=labels)

    # With average='macro' scikit-learn returns None for support, so that
    # value is discarded and the total sample count is used in its place.
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        y_true=y_true,
        y_pred=y_pred,
        labels=labels,
        average='macro')

    class_report_df['avg / total'] = [
        macro_precision,
        macro_recall,
        macro_f1,
        class_report_df.loc['support'].sum(),
        accuracy_score(y_true, y_pred, normalize=True),
    ]

    return class_report_df.T

In [4]:
import json

# Directory (relative to the notebook's working directory) holding the data.
parent_path = 'Data/'

# Parse the dataset file into memory; the file is read as UTF-8.
with open(parent_path + 'fear_speech_data.json', encoding='utf-8') as dataset_file:
    fear_speech_data = json.load(dataset_file)

In [5]:
# Display the record stored under key '0' to inspect the fields of an entry.
fear_speech_data['0']

{'message_text': '*‡§™‡•ç‡§∞‡§∂‡§æ‡§∏‡§ï ‡§∏‡§Æ‡§ø‡§§‡§ø*‚úäüö©  ‚óè‚óè‚óè‚óè‚óè‚óè‚óè‚óè‚óè‚óè‚óè ‚óè ‚óè ‚óè üòéüö© *‡§Ü‡§Ç‡§§‡§ï‡§µ‡§æ‡§¶‡•Ä ‡§∏‡§Ç‡§ó‡§†‡§®‡•ã‡§Ç ‡§ï‡§æüí£üî™ ‡§á‡§∏‡•ç‡§≤‡§æ‡§Æ‡§ø‡§ï ‡§®‡§æ‡§Æ ‡§î‡§∞ ‡§â‡§®‡§ï‡§æ ‡§á‡§∏‡•ç‡§≤‡§æ‡§Æ.....* *‡§ß‡§∞‡•ç‡§Æ ‡§∏‡•á ‡§ú‡•Å‡•ú‡§æ ‡§π‡•Å‡§Ü ‡§Ö‡§∞‡•ç‡§•...* üêñüêñüêñ *1.‡§≤‡§∂‡•ç‡§ï‡§∞‡•á ‡§§‡•à‡§Ø‡§¨‡§æ-‡§´‡§∞‡§ø‡§∂‡•ç‡§§‡•ã ‡§ï‡•Ä ‡§∏‡•á‡§®‡§æ* *2.‡§Ö‡§≤ ‡§ï‡§æ‡§Ø‡§¶‡§æ-‡§Ö‡§≤‡•ç‡§≤‡§æ‡§π ‡§ï‡§æ ‡§ï‡§æ‡§Ø‡§¶‡§æ...* *3.‡§ú‡•á‡§∂ ‡§è ‡§Æ‡•ã‡§π‡§Æ‡•ç‡§¶-‡§Æ‡•ã‡§π‡§Æ‡•ç‡§Æ‡§¶ ‡§∏‡§æ‡§π‡•á‡§¨ ‡§ï‡§æ ‡§¶‡§≤...* *4.‡§§‡§π‡§∞‡§ø‡§ï ‡§è ‡§§‡§æ‡§≤‡§ø‡§¨‡§æ‡§®-‡§™‡§µ‡§ø‡§§‡•ç‡§∞ ‡§Ø‡•ã‡§¶‡•ç‡§ß‡§æ‡§ì ‡§ï‡§æ ‡§¶‡§≤...* *5.‡§π‡§ø‡§ú‡§¨‡•Å‡§≤ ‡§Æ‡•Å‡§ú‡§æ‡§¶‡§ø‡§®-‡§á‡§∏‡•ç‡§≤‡§æ‡§Æ‡•Ä ‡§¨‡§≤‡§ø‡§¶‡§æ‡§®‡§ø‡§Ø‡•ã ‡§ï‡§æ ‡§∏‡§Æ‡•Ç‡§π...* *6.‡§¨‡•ã‡§ï‡•ã ‡§π‡§∞‡§æ‡§Æ -‡§™‡•à‡§ó‡§Æ‡•ç‡§¨‡§∞ ‡§Æ‡•Å‡§π‡§Æ‡•ç‡§Æ‡§¶ ‡§ï‡•Ä ‡§∂‡§ø‡§ï‡•ç‡§∑‡§æ ‡§ï‡•ã ‡§´‡•à‡§≤‡§æ‡§®‡•á ‡§ï‡•á ‡§≤‡§ø‡§è ‡§™‡•ç‡§∞‡§§‡§ø‡§¨‡§¶‡•ç‡§ß..* *‡§∏‡§≠‡•Ä ‡§Æ‡•Å‡§∏‡

### Doc2Vec


In [6]:
from utils.preprocess import *
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Parallel lists: the preprocessed text of every message and its binary label.
list_sents = []
list_labels = []

# `tqdm` is tqdm.notebook.tqdm, the replacement the library recommends for the
# deprecated `tqdm_notebook` helper.
for key, element in tqdm(fear_speech_data.items(), total=len(fear_speech_data)):
    annotations = element['annotation_list']
    count_fearspeech = annotations.count('Fear speech')
    count_normal = annotations.count('Normal')

    # Majority vote over the annotations: 1 only when 'Fear speech' strictly
    # outnumbers 'Normal'; ties (and any other outcome) are labelled 0.
    one_fear_speech = int(count_fearspeech > count_normal)

    text = preprocess_sent(
        element['message_text'],
        params={'remove_numbers': True, 'remove_emoji': True, 'remove_stop_words': False, 'tokenize': True})
    list_sents.append(text)
    list_labels.append(one_fear_speech)

['‡¶Ü‡¶Æ‡¶∞‡¶æ', 'poorly', 'an', 'ws', 'till', '‡¶π‡¶ì‡¶Ø‡¶º‡¶æ‡¶∞', 'specified', '‡¶ï‡¶§', '‡¶ï‡¶∞‡¶æ‡¶Ø‡¶º', '‡¶ï‡ßÄ', '‡¶Ö‡¶®‡ßá‡¶ï', '‡¶Ö‡¶®‡ßç‡¶Ø', '‡§®', 'second', 'underneath', '‡¶ì‡¶∞', '‡¶è', '‡¶π‡¶Ø‡¶º‡ßá', 'besides', '‡§∏‡•ã', 'ht', '‡¶Ø‡¶æ‡¶Ø‡¶º', 'ie', 'kind', 'turns', 'someone', 'anyway', '‡¶®‡¶Ø‡¶º', 'and', 'needing', "who'll", '‡§ï‡§ø‡§∞', '‡§ï‡•ã‡§®‡§∏‡§æ', '‡¶ú‡¶®‡¶ï‡ßá', 'io', '‡§ï‡§∞‡§®‡•á', "it'll", 'primarily', 'hence', '‡¶è‡¶∞‡¶æ', 'click', '‡¶Æ‡¶æ‡¶§‡ßç‡¶∞', 'free', 'ask', '‡§®‡•á', '‡§™‡•Å‡§∞‡§æ', 'seem', '‡§¶‡§µ‡§æ‡§∞‡§æ', 'myself', 'points', '‡§¨‡§π‡•Å‡§§', 'especially', 'therefore', 'instead', 'viz', '‡¶∏‡ßá‡¶ñ‡¶æ‡¶®‡ßá', '‡§ú‡§¨', 'opening', 'thirty', '‡¶®‡ßá‡¶á', 'gov', 'given', "you've", '‡§ï‡•ã‡§à', '‡¶π‡¶≤', 'end', '‡¶∏‡¶π‡¶ø‡¶§', 'associated', '‡¶ï‡¶∞‡¶ø', '‡§¨‡§®‡§ø', 'neednt', '‡¶π‡¶Ø‡¶º‡¶§‡ßã', 'four', '‡¶è‡¶ñ‡¶®', 'rather', 'via', 'relatively', 'fill', "who'd", 'means', 'these', '‡§§‡§ø‡§®', '‡¶ú‡ßç‡¶®‡¶ú‡¶®', 'she', '‡§Ö‡§¶‡§ø', '‡§ú‡§π‡§æ‡§Å', '‡

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for key in tqdm_notebook(fear_speech_data.keys(),total=len(fear_speech_data)):


  0%|          | 0/4782 [00:00<?, ?it/s]

In [7]:
# Object dtype lets each element hold one preprocessed message as-is
# (presumably a variable-length token list -- TODO confirm against preprocess_sent).
X_0 = np.asarray(list_sents, dtype='object')
# Labels as a plain NumPy array so they can be indexed with fold indices.
y_0 = np.asarray(list_labels)

In [8]:
def model_run(model_name='lr'):
    """Cross-validate a Doc2Vec + classifier pipeline on the module-level data.

    Reads the module-level arrays ``X_0`` (documents; presumably token lists,
    since they are passed straight to ``TaggedDocument``) and ``y_0``
    (non-negative integer labels).  For each of 5 stratified folds a Doc2Vec
    model is trained on the training documents only, vectors are inferred for
    both splits, and the chosen classifier is fitted and scored.

    Parameters
    ----------
    model_name : str, default 'lr'
        ``'lr'`` for logistic regression or ``'svc'`` for an RBF-kernel SVC.
        Both use ``class_weight='balanced'``.

    Returns
    -------
    tuple
        ``(acc, macro_f1, prec, prob, auc_roc, list_total_preds,
        list_total_truth, prec, recall)``.  ``acc``, ``macro_f1``, ``prec``,
        ``auc_roc`` and ``recall`` are per-fold score lists (precision and
        recall use scikit-learn's default positive label); ``prob`` holds the
        per-fold ``predict_proba`` arrays; the two ``list_total_*`` lists
        concatenate predictions and ground truth over all folds.  ``prec``
        appears twice for backward compatibility with existing callers.

    Raises
    ------
    ValueError
        If ``model_name`` is neither ``'lr'`` nor ``'svc'``.
    """
    # Fail fast: previously an unknown name only surfaced as an unbound
    # `classifier` after the first Doc2Vec model had already been trained.
    if model_name not in ('lr', 'svc'):
        raise ValueError(
            "Unknown model_name %r; expected 'lr' or 'svc'." % (model_name,))

    acc = []
    macro_f1 = []
    prec = []
    recall = []
    prob = []
    auc_roc = []
    list_total_preds = []
    list_total_truth = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)

    for train_index, test_index in skf.split(X_0, y_0):
        print("TRAIN:", train_index[0:5], "TEST:", test_index[0:5])
        X_train, X_test = X_0[train_index], X_0[test_index]
        y_train, y_test = y_0[train_index], y_0[test_index]

        # Informational only: n_samples / (n_classes * class_count) per class.
        # The classifiers below are given class_weight='balanced' rather than
        # this dict.
        class_weights = dict(zip(np.unique(y_train), (np.sum(y_train.shape) / (len(np.unique(y_train)) * np.bincount(y_train)))))
        print(class_weights)

        ### Generate doc2vec vectors
        # The embedding model is fitted on the training fold only, so no test
        # document influences it.
        # NOTE(review): workers=10 makes training multi-threaded; embeddings
        # (and therefore scores) may vary between runs -- confirm with gensim docs.
        documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_train)]
        doc2vec = Doc2Vec(documents, vector_size=300, window=5, min_count=1, workers=10)
        X_train_embed = np.array([doc2vec.infer_vector(doc) for doc in X_train])
        X_test_embed = np.array([doc2vec.infer_vector(doc) for doc in X_test])

        if model_name == 'lr':
            classifier = LogisticRegression(class_weight='balanced', max_iter=500)
        else:  # 'svc' -- validated at the top of the function
            classifier = SVC(class_weight='balanced', kernel='rbf', probability=True)

        classifier.fit(X_train_embed, y_train)
        y_pred = classifier.predict(X_test_embed)
        # Computed once and reused for both the ROC-AUC score and `prob`.
        y_pred_proba = classifier.predict_proba(X_test_embed)

        acc.append(accuracy_score(y_test, y_pred))
        macro_f1.append(f1_score(y_test, y_pred, average='macro'))
        auc_roc.append(roc_auc_score(y_test, y_pred_proba[:, 1], average='macro'))
        prec.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        prob.append(y_pred_proba)
        list_total_preds += list(y_pred)
        list_total_truth += list(y_test)

    return acc, macro_f1, prec, prob, auc_roc, list_total_preds, list_total_truth, prec, recall

### SVC


In [11]:
# Run the 5-fold evaluation with the SVC classifier.
acc, macro_f1, prec, prob, auc_roc, list_total_preds, list_total_truth, prec, recall = model_run(model_name='svc')

# Each line reports the mean over folds with a +/- band of two standard deviations.
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(acc), np.std(acc) * 2))
print("Macro F1: %0.2f (+/- %0.2f)" % (np.mean(macro_f1), np.std(macro_f1) * 2))
# Label corrected: this value is the ROC AUC, not an F1 score.
print("Auc Roc: %0.2f (+/- %0.2f)" % (np.mean(auc_roc), np.std(auc_roc) * 2))
print("Precision for +ve class: %0.2f (+/- %0.2f)" % (np.mean(prec), np.std(prec) * 2))
print("Recall for +ve class: %0.2f (+/- %0.2f)" % (np.mean(recall), np.std(recall) * 2))

# Per-class report computed on the predictions pooled across all folds.
print(pandas_classification_report(list_total_truth, list_total_preds))


TRAIN: [0 1 2 3 4] TEST: [ 6 12 13 17 21]
{0: 0.6567651098901099, 1: 2.0947426067907995}
TRAIN: [0 1 2 3 4] TEST: [11 14 15 20 26]
{0: 0.6567651098901099, 1: 2.0947426067907995}
TRAIN: [0 2 3 4 5] TEST: [ 1 18 19 24 28]
{0: 0.6569368131868132, 1: 2.0929978118161925}
TRAIN: [0 1 2 3 4] TEST: [ 7  8 23 29 32]
{0: 0.6569368131868132, 1: 2.0929978118161925}
TRAIN: [ 1  6  7  8 11] TEST: [0 2 3 4 5]
{0: 0.6569368131868132, 1: 2.0929978118161925}
             precision    recall  f1-score  support  accuracy
0             0.863002  0.849725  0.856312   3640.0  0.849725
1             0.543406  0.570053  0.556410   1142.0  0.570053
avg / total   0.703204  0.709889  0.706361   4782.0  0.782936



### Logistic Regression

In [None]:
# Run the 5-fold evaluation with the logistic-regression classifier.
(acc, macro_f1, prec, prob, auc_roc,
 list_total_preds, list_total_truth, prec, recall) = model_run(model_name='lr')

# Mean over folds with a +/- band of two standard deviations, one line per metric.
for template, scores in [
    ("Accuracy: %0.2f (+/- %0.2f)", acc),
    ("Macro F1: %0.2f (+/- %0.2f)", macro_f1),
    ("Auc Roc: %0.2f (+/- %0.2f)", auc_roc),
    ("Precision for +ve class: %0.2f (+/- %0.2f)", prec),
    ("Recall for +ve class: %0.2f (+/- %0.2f)", recall),
]:
    print(template % (np.mean(scores), np.std(scores) * 2))

# Per-class report computed on the predictions pooled across all folds.
print(pandas_classification_report(list_total_truth, list_total_preds))


TRAIN: [0 1 2 3 4] TEST: [ 6 12 13 17 21]
{0: 0.6567651098901099, 1: 2.0947426067907995}
TRAIN: [0 1 2 3 4] TEST: [11 14 15 20 26]
{0: 0.6567651098901099, 1: 2.0947426067907995}
TRAIN: [0 2 3 4 5] TEST: [ 1 18 19 24 28]
{0: 0.6569368131868132, 1: 2.0929978118161925}
TRAIN: [0 1 2 3 4] TEST: [ 7  8 23 29 32]
{0: 0.6569368131868132, 1: 2.0929978118161925}
TRAIN: [ 1  6  7  8 11] TEST: [0 2 3 4 5]
{0: 0.6569368131868132, 1: 2.0929978118161925}
Accuracy: 0.74 (+/- 0.02)
Macro F1: 0.68 (+/- 0.03)
Auc Roc: 0.77 (+/- 0.02)
Precision for +ve class: 0.47 (+/- 0.04)
Recall for +ve class: 0.64 (+/- 0.08)
             precision    recall  f1-score  support  accuracy
0             0.872406  0.773901  0.820207   3640.0  0.773901
1             0.470058  0.639229  0.541744   1142.0  0.639229
avg / total   0.671232  0.706565  0.680975   4782.0  0.741740
