In [1]:
%load_ext autoreload
%autoreload 2
from os import path
import pandas as pd
from tqdm import tqdm_notebook,tqdm
import numpy as np
import demoji

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix,make_scorer, f1_score, accuracy_score, recall_score, precision_score, roc_auc_score,classification_report, precision_recall_fscore_support
from sklearn.model_selection import cross_val_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
import random

In [3]:
def pandas_classification_report(y_true, y_pred):
    """Build a per-class classification report as a pandas DataFrame.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned element-wise with ``y_true``.

    Returns
    -------
    pandas.DataFrame
        One row per class, indexed by the class label, followed by an
        ``'avg / total'`` row. Columns are ``precision``, ``recall``,
        ``f1-score``, ``support`` and ``accuracy``.

        * Per-class ``accuracy`` is the diagonal of the row-normalised
          confusion matrix (share of that class's true samples that were
          predicted correctly).
        * In the ``'avg / total'`` row, precision/recall/f1 are macro
          averages, ``support`` is the total sample count and ``accuracy``
          is the overall accuracy.
    """
    # Fix the label set (sorted) once, so every metric below and the row
    # labels of the report refer to the same classes in the same order.
    labels = np.unique(
        np.concatenate([np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()])
    )

    per_class_metrics = precision_recall_fscore_support(
        y_true=y_true,
        y_pred=y_pred,
        labels=labels)

    cm = confusion_matrix(y_true, y_pred, labels=labels).astype(float)
    row_totals = cm.sum(axis=1, keepdims=True)
    # A class that only appears in y_pred has an all-zero row; report 0.0 for
    # it instead of dividing by zero and propagating NaN.
    cm_normalised = np.divide(
        cm, row_totals, out=np.zeros_like(cm), where=row_totals != 0)

    avg = list(precision_recall_fscore_support(
        y_true=y_true,
        y_pred=y_pred,
        labels=labels,
        average='macro'))
    avg.append(accuracy_score(y_true, y_pred, normalize=True))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support', 'accuracy']
    class_report_df = pd.DataFrame(
        list(per_class_metrics) + [cm_normalised.diagonal()],
        index=metrics_sum_index,
        columns=labels)

    # With average='macro' the support slot is None; replace it with the
    # total number of samples across all classes.
    avg[-2] = class_report_df.loc['support'].sum()

    class_report_df['avg / total'] = avg

    return class_report_df.T

In [4]:
# Directory (relative to the notebook) that holds the dataset files.
parent_path='Data/'

import json
# The messages contain Devanagari text and emoji, so decode the file as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open(parent_path+'fear_speech_data.json', 'r', encoding='utf-8') as fp:
    fear_speech_data=json.load(fp)

In [5]:
# Peek at the record stored under key '0' to see which fields an entry carries.
fear_speech_data['0']

{'message_text': '*प्रशासक समिति*✊🚩  ●●●●●●●●●●● ● ● ● 😎🚩 *आंतकवादी संगठनों का💣🔪 इस्लामिक नाम और उनका इस्लाम.....* *धर्म से जुड़ा हुआ अर्थ...* 🐖🐖🐖 *1.लश्करे तैयबा-फरिश्तो की सेना* *2.अल कायदा-अल्लाह का कायदा...* *3.जेश ए मोहम्द-मोहम्मद साहेब का दल...* *4.तहरिक ए तालिबान-पवित्र योद्धाओ का दल...* *5.हिजबुल मुजादिन-इस्लामी बलिदानियो का समूह...* *6.बोको हराम -पैगम्बर मुहम्मद की शिक्षा को फैलाने के लिए प्रतिबद्ध..* *सभी मुस्लमान अज्ञानि अल्लाह की बताई हुई रूहानी किताब क़ुरआन की बताई राह* *(पूरी दुनिया को इस्लाम बनाना)पर ही चल रहे है कोई ज्यादा बच👨\u200d👨\u200d👦* *पैदा करके तो कोई लव जिहाद👫 करके तो कोई काफ़िर(गेर मुसलमान)को मारकर..* *धरती पर आंतक🔫💣💣💣🔫फेला रहे है\ufeff।।* 😡😡😡😎😡😡😡  *जय सनातन धर्म की*🚩🚩🚩  🙏🚩🇮🇳🔱🏹🐚🕉',
 'translated_text': '* Administrator ✊ 🚩   Committee * ● ●●●●●●●●●● 🚩  😎  ● ● ● नाम 🔪  💣  * Islamic name of terrorist organizations and their meaning 🐖  🐖  🐖  in Islam… .. * Religion… * ... 1. * 1. Army of Lashkar-e-Taiba-Farishto * * 2. Al Qaeda-Qaeda of Allah ... * * 3. Team of Jesh-

### Doc2Vec


In [6]:
from utils.preprocess import *
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# `tqdm.tqdm_notebook` is deprecated; the supported notebook progress bar is
# `tqdm.notebook.tqdm`. Imported under an alias so the plain `tqdm` name
# imported at the top of the notebook is left untouched.
from tqdm.notebook import tqdm as notebook_tqdm

list_sents = []   # preprocessed message per record (tokenize=True is requested)
list_labels = []  # 1 when 'Fear speech' votes outnumber 'Normal' votes, else 0
for key, element in notebook_tqdm(fear_speech_data.items(), total=len(fear_speech_data)):
    # Majority vote over the annotators; a tie is labelled 0 (not fear speech).
    annotations = element['annotation_list']
    count_fearspeech = annotations.count('Fear speech')
    count_normal = annotations.count('Normal')
    one_fear_speech = 1 if count_fearspeech > count_normal else 0

    text = preprocess_sent(
        element['message_text'],
        params={
            'remove_numbers': True,
            'remove_emoji': True,
            'remove_stop_words': False,
            'tokenize': True,
        },
    )
    list_sents.append(text)
    list_labels.append(one_fear_speech)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4782.0), HTML(value='')))




In [7]:
# Labels become a plain numeric array; documents are kept as Python objects in
# an object-dtype array so they can be selected with the fold index arrays.
y_0 = np.asarray(list_labels)
X_0 = np.asarray(list_sents, dtype=object)

In [8]:
def model_run(model_name='lr'):
    """Cross-validate a Doc2Vec + classifier pipeline on the notebook data.

    Uses the notebook-level arrays ``X_0`` (documents) and ``y_0`` (labels).
    For each of 5 stratified folds a Doc2Vec model is trained on the training
    documents only, vectors are inferred for both splits, and the chosen
    classifier is fitted and scored on the held-out split.

    Parameters
    ----------
    model_name : str, default 'lr'
        ``'lr'`` for ``LogisticRegression`` or ``'svc'`` for an RBF-kernel
        ``SVC``; both use ``class_weight='balanced'``.

    Returns
    -------
    tuple
        ``(acc, macro_f1, prec, prob, auc_roc, list_total_preds,
        list_total_truth, prec, recall)``. ``acc``, ``macro_f1``, ``prec``,
        ``recall`` and ``auc_roc`` are per-fold score lists; ``prob`` holds
        the per-fold ``predict_proba`` arrays; ``list_total_preds`` and
        ``list_total_truth`` concatenate predictions and labels over all
        folds. ``prec`` appears twice; that shape is kept so existing callers
        that unpack nine values keep working.

    Raises
    ------
    ValueError
        If ``model_name`` is neither ``'lr'`` nor ``'svc'``.
    """
    # Fail fast with a clear message instead of hitting an unbound
    # `classifier` after the first (expensive) Doc2Vec training.
    if model_name not in ('lr', 'svc'):
        raise ValueError(
            "model_name must be 'lr' or 'svc', got %r" % (model_name,))

    acc=[]
    macro_f1=[]
    prec=[]
    recall=[]
    prob=[]
    auc_roc=[]
    list_total_preds=[]
    list_total_truth=[]
    # No random_state here: it only has an effect together with shuffle=True,
    # and current scikit-learn rejects random_state when shuffle is False.
    # The folds were never shuffled, so the splits are unchanged.
    skf = StratifiedKFold(n_splits=5)

    for train_index, test_index in skf.split(X_0, y_0):
        print("TRAIN:", train_index[0:5], "TEST:", test_index[0:5])
        X_train, X_test = X_0[train_index], X_0[test_index]
        y_train, y_test = y_0[train_index], y_0[test_index]

        # Informational only: the classifiers below compute the same balanced
        # weighting themselves via class_weight='balanced'.
        classes = np.unique(y_train)
        class_weights = dict(zip(
            classes,
            compute_class_weight(class_weight="balanced", classes=classes, y=y_train)))

        print(class_weights)
        ### Generate doc2vec vectors
        # Train the embedding on the training fold only so that no test
        # document influences the representation.
        documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_train)]
        model = Doc2Vec(documents, vector_size=300, window=5, min_count=1, workers=10)
        X_train_embed = np.array([model.infer_vector(ele) for ele in X_train])
        X_test_embed = np.array([model.infer_vector(ele) for ele in X_test])

        if model_name == 'lr':
            classifier = LogisticRegression(class_weight='balanced', max_iter=500)
        else:
            classifier = SVC(class_weight='balanced', kernel='rbf', probability=True)

        classifier.fit(X_train_embed, y_train)
        y_pred = classifier.predict(X_test_embed)
        # Computed once and reused for both the AUC and the returned list.
        y_pred_proba = classifier.predict_proba(X_test_embed)

        acc.append(accuracy_score(y_test, y_pred))
        macro_f1.append(f1_score(y_test, y_pred, average='macro'))
        auc_roc.append(roc_auc_score(y_test, y_pred_proba[:,1],average='macro'))
        prec.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        prob.append(y_pred_proba)
        list_total_preds+=list(y_pred)
        list_total_truth+=list(y_test)
    return acc, macro_f1, prec, prob,auc_roc,list_total_preds,list_total_truth,prec,recall

### SVC


In [9]:
# Cross-validate the SVC variant and summarise each metric as
# mean (+/- two standard deviations) over the folds.
acc, macro_f1, prec, prob,auc_roc,list_total_preds,list_total_truth,prec,recall=model_run(model_name='svc')
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(acc), np.std(acc) * 2))
print("Macro F1: %0.2f (+/- %0.2f)" % (np.mean(macro_f1), np.std(macro_f1) * 2))
# Label corrected: this line reports ROC-AUC, not an F1 score.
print("Auc Roc: %0.2f (+/- %0.2f)" % (np.mean(auc_roc), np.std(auc_roc) * 2))
print("Precision for +ve class: %0.2f (+/- %0.2f)" % (np.mean(prec), np.std(prec) * 2))
print("Recall for +ve class: %0.2f (+/- %0.2f)" % (np.mean(recall), np.std(recall) * 2))
# Report computed over the predictions pooled from all folds.
print(pandas_classification_report(list_total_truth, list_total_preds))


TRAIN: [870 872 873 874 875] TEST: [0 1 2 3 4]
{0: 0.6567651098901099, 1: 2.0947426067907995}




TRAIN: [0 1 2 3 4] TEST: [870 872 873 874 875]
{0: 0.6567651098901099, 1: 2.0947426067907995}
TRAIN: [0 1 2 3 4] TEST: [1771 1772 1773 1774 1775]
{0: 0.6569368131868132, 1: 2.0929978118161925}
TRAIN: [0 1 2 3 4] TEST: [2757 2758 2759 2761 2762]
{0: 0.6569368131868132, 1: 2.0929978118161925}
TRAIN: [0 1 2 3 4] TEST: [3792 3794 3796 3799 3801]
{0: 0.6569368131868132, 1: 2.0929978118161925}
Accuracy: 0.75 (+/- 0.07)
Macro F1: 0.68 (+/- 0.06)
Auc Roc F1: 0.77 (+/- 0.07)
Precision for +ve class: 0.50 (+/- 0.16)
Recall for +ve class: 0.60 (+/- 0.13)
             precision    recall  f1-score  support  accuracy
0             0.864512  0.801099  0.831598   3640.0  0.801099
1             0.486160  0.599825  0.537044   1142.0  0.599825
avg / total   0.675336  0.700462  0.684321   4782.0  0.753032



### Logistic Regression

In [10]:
# Cross-validate the logistic-regression variant.
acc, macro_f1, prec, prob,auc_roc,list_total_preds,list_total_truth,prec,recall=model_run(model_name='lr')

# Each metric is summarised as mean (+/- two standard deviations) over folds.
for template, fold_scores in (
    ("Accuracy: %0.2f (+/- %0.2f)", acc),
    ("Macro F1: %0.2f (+/- %0.2f)", macro_f1),
    ("Auc Roc: %0.2f (+/- %0.2f)", auc_roc),
    ("Precision for +ve class: %0.2f (+/- %0.2f)", prec),
    ("Recall for +ve class: %0.2f (+/- %0.2f)", recall),
):
    print(template % (np.mean(fold_scores), np.std(fold_scores) * 2))

# Report computed over the predictions pooled from all folds.
pooled_report = pandas_classification_report(list_total_truth, list_total_preds)
print(pooled_report)


TRAIN: [870 872 873 874 875] TEST: [0 1 2 3 4]
{0: 0.6567651098901099, 1: 2.0947426067907995}




TRAIN: [0 1 2 3 4] TEST: [870 872 873 874 875]
{0: 0.6567651098901099, 1: 2.0947426067907995}
TRAIN: [0 1 2 3 4] TEST: [1771 1772 1773 1774 1775]
{0: 0.6569368131868132, 1: 2.0929978118161925}
TRAIN: [0 1 2 3 4] TEST: [2757 2758 2759 2761 2762]
{0: 0.6569368131868132, 1: 2.0929978118161925}
TRAIN: [0 1 2 3 4] TEST: [3792 3794 3796 3799 3801]
{0: 0.6569368131868132, 1: 2.0929978118161925}
Accuracy: 0.73 (+/- 0.08)
Macro F1: 0.67 (+/- 0.07)
Auc Roc: 0.76 (+/- 0.08)
Precision for +ve class: 0.47 (+/- 0.15)
Recall for +ve class: 0.60 (+/- 0.14)
             precision    recall  f1-score  support  accuracy
0             0.861543  0.776099  0.816592   3640.0  0.776099
1             0.457751  0.602452  0.520227   1142.0  0.602452
avg / total   0.659647  0.689275  0.668409   4782.0  0.734630
