# Imports

In [1]:
from Twitter import TwitterAccess
import pandas as pd
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from collections import Counter
pd.options.display.max_colwidth = None
import nltk
import string
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from spellchecker import SpellChecker
import random
import numpy as np
from multiprocessing import  Pool
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.base import BaseEstimator, ClassifierMixin # ???
from sklearn.utils.validation import check_X_y, check_is_fitted # ???
from scipy import sparse #???
from sklearn.linear_model import LogisticRegression



[nltk_data] Downloading package stopwords to /u/arsaikia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Architecture

In [2]:
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def pr(self, x, y_i, y):
        p = x[y==y_i].sum(0)
        return (p+1) / ((y==y_i).sum()+1)
    
    def fit(self, x, y):
        # Check that X and y have correct shape
#         y = y
        x, y = check_X_y(x, y, accept_sparse=True)
        self._r = sparse.csr_matrix(np.log(self.pr(x,1,y) / self.pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs, solver ='liblinear').fit(x_nb, y)
        self.coef_ = self._clf.coef_
        return self

# Train-Validate

In [9]:

def train_validate(X, y, min_df):
    le = LabelEncoder()
    y = le.fit_transform(y)

    tfv = TfidfVectorizer(tokenizer=nltk.casual_tokenize, min_df=min_df,  max_features=30000, 
                strip_accents='unicode', analyzer='word',ngram_range=(1,1),
                use_idf=1,smooth_idf=1,sublinear_tf=1,
                stop_words = 'english')
    X = tfv.fit_transform(X).tocsr()

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)
    print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

    model = NbSvmClassifier(C=4, dual=True, n_jobs=1)

    model.fit(X_train, y_train)
    preds_valid = model.predict(X_valid)
    proba_valid = model.predict_proba(X_valid)[:,1]
    preds_train = model.predict(X_train)
    proba_train = model.predict_proba(X_train)[:,1]


    print(classification_report(y_train, preds_train))
    try:
        print('AUC: ',  roc_auc_score(y_train, proba_train))
    except:
        print('AUC: ',  roc_auc_score(y_train, model.predict_proba(X_train), multi_class ='ovr'))
        
    print(classification_report(y_valid, preds_valid))
    try:
        print('AUC: ',  roc_auc_score(y_valid, proba_valid))
    except:
        print('AUC: ',  roc_auc_score(y_valid, model.predict_proba(X_valid), multi_class ='ovr'))



# Test

In [10]:
def test(X, y, X_test,y_test,min_df):
    le = LabelEncoder()
    y = le.fit_transform(y)

    tfv = TfidfVectorizer(tokenizer=nltk.casual_tokenize, min_df=min_df,  max_features=30000, 
                strip_accents='unicode', analyzer='word',ngram_range=(1,1),
                use_idf=1,smooth_idf=1,sublinear_tf=1,
                stop_words = 'english')
    X = tfv.fit_transform(X).tocsr()

    model = NbSvmClassifier(C=4, dual=True, n_jobs=1)

    model.fit(X, y)

    y_test = le.transform(y_test)
    X_test = tfv.transform(X_test).tocsr()

    print(X_test.shape, y_test.shape)
    preds_test = model.predict(X_test)
    proba_test = model.predict_proba(X_test)[:,1]


    print(classification_report(y_test, preds_test))
    try:
        print('AUC: ',  roc_auc_score(y_test, proba_test))
    except:
        print('AUC: ',  roc_auc_score(y_test, model.predict_proba(X_test), multi_class ='ovr'))




# Load Data

In [11]:
PREPROCESSED = './preprocessed/'

olid = pd.read_csv(PREPROCESSED + 'olid.csv')
olid_traina = olid[~pd.isna(olid['tweet_cleaned'])]
olid_trainb = olid_traina[~pd.isna(olid_traina['subtask_b'])]
olid_trainc = olid_traina[~pd.isna(olid_traina['subtask_c'])]


olid_testa = pd.read_csv(PREPROCESSED + 'olid-levela.csv')
olid_testa = olid_testa[~pd.isna(olid_testa['tweet_cleaned'])]

olid_testb = pd.read_csv(PREPROCESSED + 'olid-levelb.csv')
olid_testb = olid_testb[~pd.isna(olid_testb['tweet_cleaned'])]

olid_testc = pd.read_csv(PREPROCESSED + 'olid-levelc.csv')
olid_testc = olid_testc[~pd.isna(olid_testc['tweet_cleaned'])]

# Subtask A

In [12]:
train_validate(olid_traina['tweet_cleaned'], olid_traina['subtask_a'], 20)
test(olid_traina['tweet_cleaned'], olid_traina['subtask_a'], olid_testa['tweet_cleaned'], olid_testa['subtask_a'], 20)

(8870, 1084) (4369, 1084) (8870,) (4369,)
              precision    recall  f1-score   support

           0       0.79      0.94      0.86      5922
           1       0.80      0.51      0.63      2948

    accuracy                           0.80      8870
   macro avg       0.80      0.73      0.74      8870
weighted avg       0.80      0.80      0.78      8870

AUC:  0.8442547096881807
              precision    recall  f1-score   support

           0       0.77      0.92      0.84      2917
           1       0.73      0.45      0.56      1452

    accuracy                           0.76      4369
   macro avg       0.75      0.68      0.70      4369
weighted avg       0.76      0.76      0.74      4369

AUC:  0.7621498511150084
(859, 1084) (859,)
              precision    recall  f1-score   support

           0       0.82      0.95      0.88       619
           1       0.78      0.47      0.58       240

    accuracy                           0.81       859
   macro avg     

# Subtask B

In [13]:
train_validate(olid_trainb['tweet_cleaned'], olid_trainb['subtask_b'],5)
test(olid_trainb['tweet_cleaned'], olid_trainb['subtask_b'], olid_testb['tweet_cleaned'], olid_testb['subtask_b'],5)

(2948, 1718) (1452, 1718) (2948,) (1452,)
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      2599
           1       0.89      0.21      0.33       349

    accuracy                           0.90      2948
   macro avg       0.90      0.60      0.64      2948
weighted avg       0.90      0.90      0.88      2948

AUC:  0.9274379279665642
              precision    recall  f1-score   support

           0       0.88      0.98      0.93      1277
           1       0.34      0.07      0.11       175

    accuracy                           0.87      1452
   macro avg       0.61      0.53      0.52      1452
weighted avg       0.82      0.87      0.83      1452

AUC:  0.6219510012305627
(240, 1718) (240,)
              precision    recall  f1-score   support

           0       0.90      0.99      0.94       213
           1       0.50      0.11      0.18        27

    accuracy                           0.89       240
   macro avg     

# Subtask C

In [14]:
train_validate(olid_trainc['tweet_cleaned'], olid_trainc['subtask_c'],5)
test(olid_trainc['tweet_cleaned'], olid_trainc['subtask_c'], olid_testc['tweet_cleaned'], olid_testc['subtask_c'],5)

(2596, 1580) (1280, 1580) (2596,) (1280,)
              precision    recall  f1-score   support

           0       0.79      0.71      0.75       735
           1       0.80      0.95      0.87      1600
           2       0.88      0.13      0.23       261

    accuracy                           0.80      2596
   macro avg       0.82      0.60      0.62      2596
weighted avg       0.81      0.80      0.77      2596

AUC:  0.9219482342654777
              precision    recall  f1-score   support

           0       0.53      0.47      0.50       339
           1       0.72      0.87      0.79       807
           2       0.20      0.01      0.03       134

    accuracy                           0.67      1280
   macro avg       0.48      0.45      0.44      1280
weighted avg       0.62      0.67      0.63      1280

AUC:  0.7012538330907798
(213, 1580) (213,)
              precision    recall  f1-score   support

           0       0.67      0.46      0.55        78
           1      

In [None]:
def top_lr_feats(features,row, top_n=25, pos = True):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    if pos == True:
        topn_ids = np.argsort(row)[::-1][:top_n]
    else:
        topn_ids = np.argsort(row)[:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    return df.iloc[:,0].values, df.iloc[:,1].values


fig,(ax1,ax2)=plt.subplots(2,1,figsize=(15,7))

features = np.array(tfv.get_feature_names())
x, y = top_lr_feats(features,  model.coef_[0],top_n = 20, pos = True)
sns.barplot(x, y, ax = ax1)
ax1.set_title('Class : Offensive tweets (OFF)')
x, y = top_lr_feats(features, model.coef_[0], top_n = 20, pos = False)
sns.barplot(x, y, ax = ax2)
ax2.set_title('Class : Not offensive tweets  (NOT)')

fig.text(0.5, 0.04, 'unigrams', ha='center')
fig.text(0.05, 0.5, 'coeff', va='center',  rotation='vertical')
fig.suptitle('Feature Importances', fontsize=16)
# sns.barplot(features,model.coef_[0])

In [37]:
olid_trainc

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c,tweet_cleaned
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAGA #Trump2020 👊🇺🇸👊 URL,OFF,TIN,IND,go home you’re drunk
5,97670,@USER Liberals are all Kookoo !!!,OFF,TIN,OTH,liberals are all kookoo
7,52415,@USER was literally just talking about this lol all mass shootings like that have been set ups. it’s propaganda used to divide us on major issues like gun control and terrorism,OFF,TIN,GRP,was literally just talking about this lol all mass shootings like that have been set ups it’s propaganda used to divide us on major issues like gun control and terrorism
9,13384,@USER Canada doesn’t need another CUCK! We already have enough #LooneyLeft #Liberals f**king up our great country! #Qproofs #TrudeauMustGo,OFF,TIN,IND,canada doesn’t need another cuck we already have enough fking up our great country
12,28414,@USER you are a lying corrupt traitor!!! Nobody wants to hear anymore of your lies!!! #DeepStateCorruption URL,OFF,TIN,IND,you are a lying corrupt traitor nobody wants to hear anymore of your lies
...,...,...,...,...,...,...
13212,93164,"@USER @USER Everything else was ten years ago. "" YOU ARE A LIAR, this is total BS propaganda from team Trump his crimes were throughout the past decade, including up to 2016""",OFF,TIN,IND,everything else was ten years ago you are a liar this is total bs propaganda from team trump his crimes were throughout the past decade including up to 2016
13223,63482,@USER is advocating for conduct within bounds of Human Rights but can the terrorists can be categorized as Human? They kill people mostly innocent just like berserk wild beasts. Even wild beasts kill only when hungry. So I feel that the Indian Army who are doing greatly. URL,OFF,TIN,GRP,is advocating for conduct within bounds of human rights but can the terrorists can be categorized as human they kill people mostly innocent just like berserk wild beasts even wild beasts kill only when hungry so i feel that the indian army who are doing greatly
13227,87416,@USER @USER @USER @USER Liars like the Antifa twins you vigorously defend?,OFF,TIN,GRP,liars like the antifa twins you vigorously defend
13235,95338,@USER Sometimes I get strong vibes from people and this man’s vibe is tens of millions of murders - he is more dangerous than DT.,OFF,TIN,IND,sometimes i get strong vibes from people and this man’s vibe is tens of millions of murders he is more dangerous than dt
