# Task 1: Classifying Text into Counter and Non-Counter Classes

### This notebook measures the performance of each combination of classifier and feature-engineering technique used in the project

In [4]:
from sklearn import *
from sklearn.model_selection import *
from sklearn.metrics import *

import pandas as pd
import numpy as np
import json
from collections import Counter
from catboost import CatBoostClassifier
from xgboost.sklearn import XGBClassifier
import matplotlib.pyplot as plt
import pandas as pd
import itertools
from string import punctuation
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem.porter import *
ps = PorterStemmer()
from scipy.sparse import vstack, hstack
import pickle
import sys
sys.path.insert(0, '../../')
####features module has the necessary function for feature generation 
from utils.features import *
###tokenize module has the tokenization function
from utils.tokenize import *
###helper prints confusion  matrix and stores results
from utils.helper import *
###common preprocessing imports
from utils.commen_preprocess import *

####gensim load 
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import os
import numpy as np


####ipywidgets 

In [5]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

### Word2vec-format GloVe model loading
1. Change `GLOVE_MODEL_FILE` below to the path of your GloVe model file.


In [6]:
# Path to the pre-trained GloVe vectors file; adjust it to your local copy.
GLOVE_MODEL_FILE="../../../embeddings/glove.840B.300d.txt"
# Prints True/False depending on whether a file exists at that path.
print(os.path.isfile(GLOVE_MODEL_FILE))

## change the embedding dimension according to the model
# NOTE(review): EMBEDDING_DIM is not referenced in the visible cells of this
# notebook; presumably it is kept for reference or used by utils.* — confirm.
EMBEDDING_DIM = 300
def loadGloveModel2(glove_file):
    """Load GloVe vectors through gensim's word2vec text loader.

    The GloVe file is first converted with ``glove2word2vec`` into a
    temporary file obtained from ``get_tmpfile``; that converted file is
    then read with ``KeyedVectors.load_word2vec_format``.

    Args:
        glove_file: path to the GloVe vectors file.

    Returns:
        The model object returned by ``KeyedVectors.load_word2vec_format``.
    """
    # Equivalent CLI conversion:
    #   python -m gensim.scripts.glove2word2vec --input <glove_file> --output <w2v_file>
    w2v_path = get_tmpfile("test_crawl_200.txt")
    glove2word2vec(glove_file, w2v_path)
    return KeyedVectors.load_word2vec_format(w2v_path)

# Convert and load the embeddings once; get_feature() passes this model to the
# embedding-based feature builders.
word2vec_model = loadGloveModel2(GLOVE_MODEL_FILE)

True


## Dataset is loaded here

In [7]:
### change the path where the data is kept
# The parsed JSON is iterated record by record in a later cell to build pd_train.
path='../../Data/Counterspeech_Dataset.json'
with open(path) as fp:
    train_data = json.load(fp)

In [8]:
def convert_class_label(input_text):
    """Map a truthy/falsy counter-speech flag to its class-label string.

    Returns:
        'counter' when ``input_text`` is truthy, otherwise 'noncounter'.
    """
    return 'counter' if input_text else 'noncounter'

In [9]:
# Collect every usable record first and build the DataFrame in a single call:
# assigning pd_train.loc[count] one row at a time enlarges the frame on every
# insert, which gets slow as the dataset grows.
train_columns = ['id','class','community','category','text']
train_rows = []
train_row_labels = []
skipped_records = 0

for count, each in enumerate(train_data):
    try:
        train_rows.append([each['id'], convert_class_label(each['CounterSpeech']), each['Community'],each['Category'],each['commentText']])
    except (KeyError, TypeError):
        # The record lacks one of the expected fields (or is not a mapping).
        # It is skipped, as before, but counted so the loss is visible
        # instead of being swallowed by a bare `except: pass`.
        skipped_records += 1
    else:
        # Keep the record's position in train_data as its index label, which
        # is what the original `.loc[count]` assignment produced.
        train_row_labels.append(count)

pd_train = pd.DataFrame(train_rows, columns=train_columns, index=train_row_labels)
if skipped_records:
    print('Skipped %d malformed record(s)' % skipped_records)
print('Training Data Loading Completed...')

Training Data Loading Completed...


In [10]:
# Treat empty comments as missing values, then drop those rows.
# The results are assigned back instead of using inplace=True: calling
# .replace(..., inplace=True) on the selection pd_train['text'] is a chained
# in-place operation, which does not update pd_train under pandas
# Copy-on-Write. Assigning back also keeps the cell safe to re-run.
pd_train['text'] = pd_train['text'].replace('', np.nan)
pd_train = pd_train.dropna(subset=['text'])

In [11]:
#### converting the data into text and labels dictionary
def get_data(pd_train):
    """Turn the training frame into a list of ``{'text', 'label'}`` dicts.

    Reads the 'text' and 'class' columns of ``pd_train`` and pairs their
    values row by row, preserving row order.

    Returns:
        A list with one dict per row: ``{'text': <text>, 'label': <class>}``.
    """
    return [
        {'text': text_value, 'label': class_value}
        for text_value, class_value in zip(pd_train['text'].values, pd_train['class'].values)
    ]

## Model selection and feature selection 

In [12]:
def get_model(m_type=None):
    """Build an untrained classifier for the given model name.

    Args:
        m_type: one of 'decision_tree_classifier', 'MLPClassifier',
            'KNeighborsClassifier', 'ExtraTreeClassifier',
            'ExtraTreeClassifier_2', 'RandomForestClassifier',
            'Logistic_Regression', 'SVC', 'Catboost', 'XGB_classifier' or
            'Gaussian_NB'. The label 'Gaussian Naive bayes' is accepted as
            an alias for 'Gaussian_NB'.

    Returns:
        A new estimator instance, or None (after printing a message) when
        ``m_type`` is missing or not recognised.
    """
    if not m_type:
        print("ERROR: Please specify a model type!")
        return None

    # NOTE(review): 4048/5335 looks like a hard-coded class-count ratio for
    # this dataset — confirm it still matches the data being loaded.
    # The float literal keeps this a true division on any interpreter.
    pos_weight = 4048 / 5335.0

    if m_type == 'decision_tree_classifier':
        model = tree.DecisionTreeClassifier(class_weight='balanced')
    elif m_type == 'MLPClassifier':
        # Written as a real one-element tuple; the original `(500)` was just
        # the int 500.
        model = neural_network.MLPClassifier(hidden_layer_sizes=(500,))
    elif m_type == 'KNeighborsClassifier':
        model = neighbors.KNeighborsClassifier(n_neighbors = 10)
    elif m_type == 'ExtraTreeClassifier':
        model = tree.ExtraTreeClassifier()
    elif m_type == 'ExtraTreeClassifier_2':
        model = ensemble.ExtraTreesClassifier()
    elif m_type == 'RandomForestClassifier':
        model = ensemble.RandomForestClassifier(class_weight='balanced')
    elif m_type == 'Logistic_Regression':
        model = linear_model.LogisticRegression(class_weight='balanced')
    elif m_type == 'SVC':
        # Reached through the `svm` submodule, like the other sklearn
        # estimators here: no import visible in this notebook binds a bare
        # `SVC` name.
        model = svm.SVC(class_weight='balanced')
    elif m_type == 'Catboost':
        model = CatBoostClassifier(iterations=100,scale_pos_weight=pos_weight)
    elif m_type == 'XGB_classifier':
        model = XGBClassifier(scale_pos_weight=pos_weight,n_estimators=500,nthread=12)
    elif m_type in ('Gaussian_NB', 'Gaussian Naive bayes'):
        # Same reasoning as for SVC: use the `naive_bayes` submodule rather
        # than a bare `GaussianNB` name.
        model = naive_bayes.GaussianNB()
    else:
        # Unknown name: report it and return None (the same "no model" value
        # used above) instead of failing on an unassigned local variable.
        print("give correct model")
        return None

    return model

def get_feature(pd_train,f_type=None):
    """Generate the feature matrix and labels for a feature-combination name.

    Args:
        pd_train: training DataFrame handed unchanged to the feature builder.
        f_type: name of the feature combination, e.g. 'tfidf_preprocess' or
            'google_preprocess_tfidf_preprocess_embed'.

    Returns:
        The ``(X, y)`` pair returned by the selected feature builder, or
        ``(None, None)`` (after printing a message) when ``f_type`` is
        missing or not recognised.
    """
    if not f_type:
        print("ERROR: Please specify a feature type!")
        return None,None
    if f_type == 'google_not_preprocess':
        X,y=gen_data_google2(pd_train)
    elif f_type == 'word_to_vec_embed':
        X,y=gen_data_embed(pd_train,word2vec_model)
    elif f_type == 'google_preprocess':
        X,y=gen_data_google(pd_train)
    elif f_type == 'tfidf_not_preprocess':
        X,y=gen_data_new_tfidf2(pd_train)
    elif f_type == 'tfidf_preprocess':
        X,y=gen_data_new_tfidf(pd_train)
    elif f_type == 'google_preprocess_tfidf_preprocess':
        X,y=combine_tf_google_rem(pd_train)
    elif f_type == 'google_nopreprocess_tfidf_nopreprocess':
        X,y=combine_tf_google_norem(pd_train)
    elif f_type == 'google_preprocess_tfidf_nopreprocess':
        X,y=combine_tf_norem_google_rem(pd_train)
    elif f_type == 'google_nopreprocess_tfidf_preprocess':
        # The original call was spelled `combine_tf_rem_google_norimportem`
        # (a stray "import" inside "norem"). The name used here follows the
        # naming pattern of the sibling combiners.
        # NOTE(review): confirm it matches the definition in utils.features.
        X,y=combine_tf_rem_google_norem(pd_train)
    elif f_type == 'google_preprocess_embed':
        X,y=combine_google_rem_embed(pd_train,word2vec_model)
    elif f_type == 'tfidf_preprocess_embed':
        X,y=combine_tf_rem_embed(pd_train,word2vec_model)
    elif f_type == 'google_preprocess_tfidf_preprocess_embed':
        ###best features####
        X,y=combine_tf_rem_google_rem_embed(pd_train,word2vec_model)
    else:
        # Unknown name: report it and return the same "no features" value as
        # above instead of failing on unassigned X / y.
        print("give correct feature selection")
        return None,None
    return X,y




#### get all the parameters

In [13]:
import json
# Load the stored parameter table; classification_model() looks entries up
# with the key '<classifier_model>+<feature_model>'.
with open('binary_all_parameters.json') as f:
        parameters=json.load(f)


In [14]:
def classification_model(pd_train,classifier_model,feature_model,img_name,report_name):
    """Cross-validate one classifier/feature combination and save the results.

    Features come from ``get_feature`` and the estimator from ``get_model``.
    The labels 'counter' / 'noncounter' are encoded as 0 / 1 and the model is
    evaluated with a 10-fold ``StratifiedKFold``. Predictions from all folds
    are pooled to produce the classification report and confusion matrix.

    Args:
        pd_train: training DataFrame passed to ``get_feature``.
        classifier_model: model name understood by ``get_model``.
        feature_model: feature-combination name understood by ``get_feature``.
        img_name: file name the confusion-matrix figure is saved to.
        report_name: file name the CSV classification report is written to.

    Returns:
        1 when no model or no features could be built; otherwise None.

    Side effects:
        Prints per-fold accuracies and the pooled report, writes ``img_name``
        and ``report_name``, and pickles ``[y_total, y_total_preds]`` to
        'all_preds_binary.pkl'.
    """
    X,y=get_feature(pd_train,f_type=feature_model)
    model=get_model(m_type=classifier_model)
    if model is None or X is None:
        return 1

    # Apply the stored parameters for this combination, if there are any.
    # The original loop iterated an undefined `param_set` and called setattr()
    # on the parameter dict, so the bare `except` hid a NameError and nothing
    # was ever applied to the model.
    # NOTE(review): assumes each entry of `parameters` is a flat
    # {param_name: value} mapping — confirm against binary_all_parameters.json.
    try:
        model_parameter=parameters[classifier_model+'+'+feature_model]
    except (KeyError, TypeError):
        model_parameter=None
    if model_parameter:
        try:
            model.set_params(**model_parameter)
        except Exception as err:
            # Stay best-effort like the original, but say so instead of
            # failing silently.
            print('WARNING: could not apply stored parameters:', err)

    # np.asarray converts without forcing a copy; np.array(X, copy=False)
    # raises under NumPy 2 whenever a copy is unavoidable.
    Classifier_Train_X = np.asarray(X)
    label_map = {
            'counter': 0,
            'noncounter': 1
        }
    Classifier_Train_Y = np.array([label_map[label] for label in y])

    kf = StratifiedKFold(n_splits=10)
    y_total_preds=[]
    y_total=[]

    for fold, (train_index, test_index) in enumerate(kf.split(Classifier_Train_X,Classifier_Train_Y)):
        print('cv_fold',fold)
        X_train, X_test = Classifier_Train_X[train_index], Classifier_Train_X[test_index]
        y_train, y_test = Classifier_Train_Y[train_index], Classifier_Train_Y[test_index]
        # The same estimator object is re-fitted on every fold, as in the
        # original code.
        model.fit(X_train,y_train)
        y_preds = model.predict(X_test)
        y_total.extend(y_test)
        y_total_preds.extend(y_preds)
        y_pred_train = model.predict(X_train)
        print('accuracy_train:',accuracy_score(y_train, y_pred_train),'accuracy_test:',accuracy_score(y_test, y_preds))

    report = classification_report( y_total, y_total_preds )
    cm=confusion_matrix(y_total, y_total_preds)
    # Use a separate local name so the pyplot module alias `plt` is not
    # shadowed inside this function.
    # NOTE(review): assumes plot_confusion_matrix (presumably from
    # utils.helper) returns an object exposing savefig(), as the original did.
    cm_plot=plot_confusion_matrix(cm,normalize= True,target_names = ['counter','non_counter'],title = "Confusion Matrix")
    cm_plot.savefig(img_name)
    print(report)
    df_result=pandas_classification_report(y_total,y_total_preds)
    df_result.to_csv(report_name,  sep=',')
    with open('all_preds_binary.pkl', 'wb') as f:
        pickle.dump([y_total,y_total_preds], f)



## Models available
1. decision_tree_classifier
2. MLPClassifier
3. KNeighborsClassifier
4. ExtraTreeClassifier
5. ExtraTreeClassifier_2
6. RandomForestClassifier
7. SVC
8. Catboost
9. XGB_classifier
10. Logistic_Regression
11. Gaussian_NB

## Feature Models available
1. google_not_preprocess
2. word_to_vec_embed
3. google_preprocess
4. tfidf_not_preprocess
5. tfidf_preprocess
6. google_preprocess_tfidf_preprocess
7. google_nopreprocess_tfidf_nopreprocess
8. google_preprocess_tfidf_nopreprocess
9. google_nopreprocess_tfidf_preprocess
10. google_preprocess_embed
11. tfidf_preprocess_embed
12. google_preprocess_tfidf_preprocess_embed


### Specify the model and the feature selection method

In [15]:
# Feature-combination names offered in the dropdown; each one is a value
# handled by get_feature().
options_ft=[
    'google_not_preprocess',
    'word_to_vec_embed',
    'google_preprocess',
    'tfidf_not_preprocess',
    'tfidf_preprocess',
    'google_preprocess_tfidf_preprocess',
    'google_nopreprocess_tfidf_nopreprocess',
    'google_preprocess_tfidf_nopreprocess',
    'google_nopreprocess_tfidf_preprocess',
    'google_preprocess_embed',
    'tfidf_preprocess_embed',
    'google_preprocess_tfidf_preprocess_embed',
]
# value=None leaves the dropdown without a selection until the user picks one.
ft= widgets.Dropdown(value=None, options=options_ft)
print('select feature combination')
ft

select feature combination


In [16]:
# Classifier names offered in the dropdown. Every entry must be a name that
# get_model() recognises: the Gaussian naive Bayes entry is 'Gaussian_NB'
# (the previous label 'Gaussian Naive bayes' did not match get_model's key).
options_clf=['decision_tree_classifier','MLPClassifier','KNeighborsClassifier','ExtraTreeClassifier','ExtraTreeClassifier_2','RandomForestClassifier','SVC','Catboost','XGB_classifier','Logistic_Regression','Gaussian_NB']
# value=None leaves the dropdown without a selection until the user picks one.
clf= widgets.Dropdown(options=options_clf, value=None)
print('select a model') 
clf

select a model


In [None]:
###specify the model name
clf_model=clf.value
###specify the feature model###
ft_model=ft.value
# Both dropdowns start with value=None. Fail with a clear message instead of
# the TypeError that concatenating None below would raise.
if clf_model is None or ft_model is None:
    raise ValueError('Select both a model and a feature combination in the dropdowns above before running this cell.')
###image_name###
im=clf_model+ft_model+'_cm.png'
###report_name###
# NOTE: `re` holds the report file name; it is passed as report_name to
# classification_model() in the next cell.
re=clf_model+ft_model+'_report.csv'

In [None]:
###actual classifier call
# Runs the cross-validation for the selected model/feature pair; the figure
# and the CSV report are written to the file names held in `im` and `re`.
classification_model(pd_train,classifier_model=clf_model,feature_model=ft_model,img_name=im,report_name=re)

ending


  'stop_words.' % sorted(inconsistent))
