Preprocessing has changed in this notebook: we now remove all numbers from the tweets, which helped in training more robust word2vec embeddings.

Analysis on scraped dataset

### Import

In [None]:
import pandas as pd
import numpy as np

In [None]:
import pickle
import sys
import nltk
from nltk.stem.porter import *

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

In [None]:
# Colab-only: mount Google Drive so that the /content/drive/... paths used by this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Root folder on the mounted Drive that holds the training corpora read in this cell.
DATA_ROOT = '/content/drive/My Drive/Hate_Speech_Detection_git'

# Tweet text for the two data_1 corpora; their labels live in separate files (read below).
cf_data_1 = pd.read_csv(f'{DATA_ROOT}/data_1/hatespeech_NAACL_SRW.csv', encoding="ISO-8859-1")
cf_data_2 = pd.read_csv(f'{DATA_ROOT}/data_1/hatespeech_NLP+CSS.csv')

cf_data_3 = pd.read_csv(f'{DATA_ROOT}/data_2/labeled_data.csv', encoding="ISO-8859-1")
## this is the scraped data

# Give the third corpus the same 'ID' / 'Tweets' column names as the other two.
cf_data_3 = cf_data_3.rename({'Unnamed: 0': 'ID', 'tweet': 'Tweets'}, axis=1)

# The first label file has no header row, so the column names are supplied here.
labels_1 = pd.read_csv(f'{DATA_ROOT}/data_1/NAACL_SRW_2016.csv', header=None, names=['ID', 'class'])
# Raw string avoids the invalid-escape warning for '\s'; a regex separator is only supported
# by the python engine, so request it explicitly instead of relying on the fallback warning.
labels_2 = pd.read_csv(f'{DATA_ROOT}/data_1/NLP+CSS_2016.csv', sep=r'\s', engine='python')

labels_2 = labels_2.rename({'TweetID': 'ID', 'Expert': 'class'}, axis=1)

# The unnamed leading column of the tweet files is kept under an explicit name.
cf_data_1 = cf_data_1.rename({'Unnamed: 0': 'index_col'}, axis=1)
cf_data_2 = cf_data_2.rename({'Unnamed: 0': 'index_col'}, axis=1)

In [None]:
# Tweets from realDonaldTrump_tweets_extended.csv, kept apart from the training data as a real-world check.
real_world_test = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/real_world_test/realDonaldTrump_tweets_extended.csv')

In [None]:
# Second real-world set; the two columns are named explicitly via `names`.
# NOTE(review): `header=None` is not passed — confirm the file really has no header row.
real_world_test_racist = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/real_world_test/racist_scarped_extended.csv', names=['timestamp','text'])

In [None]:
# Third real-world set (pok.csv), read with the same explicit 'timestamp' / 'text' column names.
pok = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/real_world_test/pok.csv', names=['timestamp','text'])

In [None]:
# Keep a single row per distinct raw tweet text (the original index labels are preserved).
pok = pok.drop_duplicates(subset='text')

In [None]:
# Jigsaw toxic-comment training file, used here only for evaluation.
jigsaw = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/jigsaw-toxic-comment-classification-challenge/train.csv')
# Rename the text column to 'text' so this frame has the same layout as the other real-world frames.
jigsaw.rename(columns = {'comment_text':'text'},inplace=True)

### Function for merging

In [None]:
def label_merging(data, labels):
    """Attach class labels to tweets by inner-joining the two frames on ``ID``.

    Both inputs are modified in place: ``labels['ID']`` is cast to ``int`` and
    ``data['ID']`` has its missing values replaced by 0 before being cast to
    ``int``. A few diagnostics are printed along the way.

    Args:
        data: DataFrame with an ``ID`` column (may contain NaN).
        labels: DataFrame with an ``ID`` column that is convertible to ``int``.

    Returns:
        The inner join of ``data`` and ``labels`` on ``ID``.
    """
    labels['ID'] = labels['ID'].astype(int)
    print(labels['ID'].nunique())
    print('Null IDs in data 1 = ' ,data['ID'].isna().sum())

    # Assign the filled column back instead of calling fillna(inplace=True) on the
    # column selection: under pandas Copy-on-Write the chained in-place form works on
    # a temporary, leaves the NaNs in `data`, and the int cast then fails.
    data['ID'] = data['ID'].fillna(0).astype(int)

    print('data shape ='  ,data.shape)
    print('IDs common in data and labels =', data['ID'].isin(labels['ID']).sum())

    train = data.merge(labels, on='ID', how='inner')
    return train

In [None]:
# Join tweets with their labels for the two corpora whose labels are stored in separate files.
train_1 = label_merging(cf_data_1,labels_1)

train_2 = label_merging(cf_data_2, labels_2)

# The third corpus is taken as a plain copy, without a merge step.
train_3 = cf_data_3.copy()

In [None]:
# Keep only the three shared columns of each corpus ...
t1 = train_1[['ID','Tweets','class']]
t2 = train_2[['ID','Tweets','class']]
t3 = train_3[['ID','Tweets','class']]
# ... and stack them row-wise into one frame with a fresh 0..n-1 index.
merged = pd.concat([t1,t2,t3],axis=0).reset_index(drop=True)

### Basic Preprocessing

In [None]:
# Work on a copy so that `merged` keeps the raw, unprocessed text.
train = merged.copy()

# Use the 'tweet' column name that the preprocessing functions operate on, and force
# every entry to str (missing values become the literal string 'nan').
train.rename(columns={'Tweets':'tweet'},inplace=True)
train['tweet'] = train['tweet'].astype(str)

In [None]:
# train = train.drop_duplicates(subset='ID')
# train.to_csv('train.csv',index=False, header=True)

In [None]:
def clean_remove_b(data):
  """Normalise an evaluation frame in place (returns ``None``).

  Renames a ``text`` column to ``tweet``, casts it to ``str`` and strips the
  leading ``b'`` / ``b"`` left behind when a bytes literal was written out as
  text. Tweets without that prefix are left untouched.

  Args:
      data: DataFrame with a ``text`` (or already ``tweet``) column.
  """
  data.rename(columns={'text':'tweet'},inplace=True)
  data['tweet'] = data['tweet'].astype(str)

  # The previous condition `x[0:2]=="b'" or 'b"'` was always truthy (a non-empty
  # string literal), so the first two characters of EVERY tweet were dropped.
  data['tweet'] = data['tweet'].apply(
      lambda text: text[2:] if text.startswith(("b'", 'b"')) else text
  )

In [None]:
# Normalise the four evaluation frames in place; afterwards each one has a string 'tweet' column.
clean_remove_b(real_world_test)
clean_remove_b(real_world_test_racist)
clean_remove_b(pok)
clean_remove_b(jigsaw)

Every word that starts with @ is the Twitter handle of a user, which shouldn't be considered in our analysis, so let's do the cleaning, where we remove the @ along with the word that follows it.

In [None]:
def preprocess(data):
  """Clean the ``tweet`` column of ``data`` in place (returns ``None``).

  Steps, in order:
    1. drop every whitespace-separated token that contains ``@``, ``http`` or ``#``;
    2. delete all digit characters;
    3. lower-case the tokens and collapse runs of whitespace;
    4. delete every character that is neither a word character nor whitespace;
    5. drop NLTK English stopwords plus the extra tokens ``rt``, ``mkr`` and ``im``.

  Args:
      data: DataFrame with a string ``tweet`` column.
  """
  import nltk
  from nltk.corpus import stopwords

  ## we are removing hashtags now, but while doing transfer learning, to learn the embeddings we didnt remove these,
  ## just to include such words in our vocabulary
  markers = ('@', 'http', '#')

  def drop_marked_tokens(text):
    # Removes the whole token (handle, link or hashtag), not just the marker itself.
    return ' '.join(tok for tok in text.split() if not any(m in tok for m in markers))

  tweets = data['tweet'].apply(drop_marked_tokens)
  tweets = tweets.apply(lambda text: ''.join(ch for ch in text if not ch.isdigit()))
  tweets = tweets.apply(lambda text: ' '.join(tok.lower() for tok in text.split()))

  # regex=True is required: since pandas 2.0 `str.replace` treats the pattern as a
  # literal string by default, which silently left all punctuation in place.
  tweets = tweets.str.replace(r'[^\w\s]', '', regex=True)

  nltk.download('stopwords')
  # A set gives constant-time membership tests; the original scanned a list per token.
  unwanted = set(stopwords.words('english')) | {'rt', 'mkr', 'im'}
  data['tweet'] = tweets.apply(
      lambda text: ' '.join(tok for tok in text.split() if tok not in unwanted)
  )

In [None]:
# Apply the same in-place cleaning to the training frame and to every evaluation frame.
preprocess(train)
preprocess(real_world_test)
preprocess(real_world_test_racist)
preprocess(pok)
preprocess(jigsaw)

It doesn't really make sense to remove rare words, i.e. the words with count 1, because we might lose hateful words this way.

In [None]:
def preprocess_2(data):
  """Lemmatize every whitespace-separated token of ``data['tweet']`` in place.

  Downloads the NLTK ``punkt`` and ``wordnet`` resources first, then rewrites
  each tweet as its lemmatized tokens joined by single spaces. Returns ``None``.
  """
  from textblob import Word

  for resource in ('punkt', 'wordnet'):
    nltk.download(resource)

  def lemmatize_text(text):
    lemmas = [Word(token).lemmatize() for token in text.split()]
    return " ".join(lemmas)

  data['tweet'] = data['tweet'].apply(lemmatize_text)

In [None]:
# Lemmatize the 'tweet' column of the training frame and of every evaluation frame, in place.
preprocess_2(train)
preprocess_2(real_world_test)
preprocess_2(real_world_test_racist)
preprocess_2(pok)
preprocess_2(jigsaw)

In [None]:
# real_world_test = real_world_test[real_world_test['tweet'].apply(lambda x:len(x)>1)]

### Target creation

In [None]:
# Distinct raw labels across the merged corpora, shown before they are mapped to a binary target.
train['class'].unique()#.isna().sum()

In [None]:
# Collapse the mixed string / integer labels of the merged corpora into two string classes.
# The result is assigned back instead of using `inplace=True` on the column selection:
# under pandas Copy-on-Write the chained in-place form modifies a temporary and leaves
# `train` unchanged.
# NOTE: not idempotent — once the next cell has produced 0/1 targets, re-running this
# cell would map both of them to 'hate'.
train['class'] = train['class'].replace(
    ['racism', 'sexism', 0, 1, 'both', 'none', 'neither', 2],
    ['hate', 'hate', 'hate', 'hate', 'hate', 'null', 'null', 'null'],
)
train['class'].value_counts()

In [None]:
# Encode the target as 0 = 'null', 1 = 'hate'. Assigned back (no chained `inplace=True`)
# so the change reaches `train` under pandas Copy-on-Write; `infer_objects()` restores an
# integer dtype explicitly rather than relying on the deprecated silent downcast in `replace`.
train['class'] = train['class'].replace(['null', 'hate'], [0, 1]).infer_objects()

Dropping the duplicates

In [None]:
# hate_text = train[train['class']==1]['tweet']
# null_text = train[train['class']==0]['tweet']

In [None]:
# hate_text.to_csv(r'hate_speech.txt', header=None, index=None, sep=' ')

In [None]:
# null_text.to_csv(r'null_speech.txt', header=None, index=None, sep=' ')

In [None]:
# Number of distinct IDs that occur more than once in `train`.
sum(train['ID'].value_counts()>1)

In [None]:
# (rows, columns) before the duplicate IDs are dropped in the next cell.
train.shape

In [None]:
# Keep one row per tweet ID in the training data.
train = train.drop_duplicates(subset='ID')

# Same for the Trump tweets, whose identifier column is named 'id'. The surviving rows keep
# their original index labels, so positions and labels no longer coincide afterwards.
real_world_test = real_world_test.drop_duplicates(subset='id')

### Basic Model on whole dataset

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split of the cleaned tweets and their binary labels; the fixed random_state makes it repeatable.
x_train,x_valid,y_t,y_v = train_test_split(train['tweet'],train['class'],test_size=0.2,random_state=234)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Unigram TF-IDF restricted to the 100 most frequent terms.
tfidf = TfidfVectorizer(max_features=100, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))

# Learn the vocabulary and IDF weights from the training split only. Fitting on all of
# `train['tweet']` (as before) let the validation tweets influence the features and
# leaked information into the validation scores.
tfidf.fit(x_train)

x_t = tfidf.transform(x_train)
x_v = tfidf.transform(x_valid)


In [None]:
def model_training(clf, x_t, y_t, x_v=None , y_v=None ,task='binary:logistic'):
    """Fit ``clf`` and print training and (optionally) validation metrics.

    Args:
        clf: Estimator exposing ``fit``, ``score`` and ``predict`` (and
            ``predict_proba`` for the binary task).
        x_t, y_t: Training features and targets.
        x_v, y_v: Optional validation features and targets. If either is
            ``None`` only the training score is printed.
        task: ``'binary:logistic'`` prints accuracy, F1, ROC-AUC and the
            confusion matrix; ``'reg:linear'`` prints ``clf.score`` (labelled
            r2_score) and the mean squared error.

    Returns:
        The fitted ``clf``.
    """
    clf.fit(x_t, y_t)
    print('training accuracy', clf.score(x_t, y_t))

    # Identity test, not `!= None`: comparing an array or sparse matrix with `!=` is
    # element-wise and cannot be used as a condition. This also stops the binary
    # branch from crashing when validation data is left at its default.
    if x_v is None or y_v is None:
        return clf

    if task == 'binary:logistic':
        y_pred = clf.predict(x_v)  # predicted once, reused for F1 and the confusion matrix
        print('validation accuracy', clf.score(x_v, y_v))
        # scikit-learn metrics take (y_true, y_pred); binary F1 is the same either way.
        print('validation f1_score', f1_score(y_v, y_pred))
        print('validation roc_auc score', roc_auc_score(y_v, clf.predict_proba(x_v)[:, -1]))
        print('confusion matrix \n', confusion_matrix(y_v, y_pred))
    elif task == 'reg:linear':
        # Imported here because the notebook never imports it at cell level.
        from sklearn.metrics import mean_squared_error
        print('validation r2_score', clf.score(x_v, y_v))
        print('validation MSE', mean_squared_error(y_v, clf.predict(x_v)))

    return clf

In [None]:
# Peek at one cleaned training tweet.
x_train.iloc[0]

In [None]:
# y_t.value_counts()

# Evaluates to ~1.676, the scale_pos_weight value in the commented-out XGBClassifier of the next cell.
# NOTE(review): presumably the two class counts from y_t.value_counts() — confirm.
19794/11810

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix, accuracy_score
from xgboost import XGBClassifier

# lgr =  LogisticRegression(n_jobs=1, C=1e5)
# Gradient-boosted trees on the TF-IDF features; scale_pos_weight re-weights the positive (hate = 1) class.
# NOTE(review): the origin of 1.4266790777602751 is not shown in this notebook — presumably a class-count ratio; confirm.
xgb = XGBClassifier(n_estimators=500, max_depth=5,learning_rate=0.1,scale_pos_weight=1.4266790777602751)
# xgb = XGBClassifier(n_estimators=500, max_depth=5,learning_rate=0.1,scale_pos_weight=1.6760372565622355)
model_training(xgb,x_t,y_t,x_v,y_v)


In [None]:
# training accuracy 0.8697316795342361
# validation accuracy 0.8521893191597064
# validation f1_score 0.8771819137749738
# validation roc_auc score 0.915704695718503
# confusion matrix 
#  [[2563  334]
#  [ 834 4171]]

In [None]:
# training accuracy 0.8751740286039742
# validation accuracy 0.8567451278157429
# validation f1_score 0.8815403934700711
# validation roc_auc score 0.9185450724629186
# [[2558  339]
#  [ 793 4212]]

In [None]:
# import joblib
# #save model
# joblib.dump(xgb, 'xgb_model_tfidf.pkl') 

### Training on new word2vec model

In [None]:
import gensim
import logging
from gensim.models import Word2Vec

# Load word vectors stored in the plain-text word2vec format (binary=False).
# The file name suggests they come from the transfer-learning step — confirm.
wv = gensim.models.KeyedVectors.load_word2vec_format("/content/drive/My Drive/Hate_Speech_Detection_git/model_transfer_learning.txt", binary=False)
# wv = gensim.models.KeyedVectors.load_word2vec_format("/content/drive/My Drive/Hate_Speech_Detection_git/model_transfer_learning_including_stopwords.txt", binary=False)
# Precompute the L2-normalised vectors that word_averaging reads via `syn0norm`;
# replace=True overwrites the raw vectors with them (gensim < 4 API).
wv.init_sims(replace=True)


In [None]:
def word_averaging(wv, words):
    """Return the unit-length mean embedding of ``words``.

    Each entry of ``words`` is either a ready-made ``np.ndarray`` (used as is)
    or a token that is looked up in ``wv.vocab``; tokens missing from the
    vocabulary are skipped. If nothing contributes a vector, a warning is
    logged and a zero vector of length ``wv.vector_size`` is returned.
    """
    vectors = []
    for token in words:
        if isinstance(token, np.ndarray):
            vectors.append(token)
            continue
        if token in wv.vocab:
            # Row of the pre-normalised embedding matrix that belongs to this token.
            vectors.append(wv.syn0norm[wv.vocab[token].index])

    if len(vectors) == 0:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    averaged = np.array(vectors).mean(axis=0)
    return gensim.matutils.unitvec(averaged).astype(np.float32)

  
def  word_averaging_list(wv, text_list):
    """Stack the averaged embedding of every token list in ``text_list`` into a 2-D array (one row per text)."""
    rows = []
    for post in text_list:
        rows.append(word_averaging(wv, post))
    return np.vstack(rows)

In [None]:
from sklearn.model_selection import train_test_split

def w2v_tokenize_text(text):
    """Split ``text`` into NLTK word tokens, discarding tokens shorter than two characters.

    The text is first split into sentences and each sentence into words (both
    with the English models); the tokens are returned as one flat list.
    """
    return [
        word
        for sent in nltk.sent_tokenize(text, language='english')
        for word in nltk.word_tokenize(sent, language='english')
        if len(word) >= 2
    ]
    
# Split whole rows (not just the text) so the labels stay attached as train_w2v['class'] / test_w2v['class'].
# Note the seed (42) differs from the TF-IDF split above (234), so the two validation sets are different rows.
train_w2v, test_w2v = train_test_split(train, test_size=0.2, random_state = 42)
# x_t,x_v,y_t,y_v = train_test_split(train['tweet'],train['class'],test_size=0.2,random_state=234)

# One token list per row, returned as a NumPy object array.
test_tokenized = test_w2v.apply(lambda r: w2v_tokenize_text(r['tweet']), axis=1).values
train_tokenized = train_w2v.apply(lambda r: w2v_tokenize_text(r['tweet']), axis=1).values

# One averaged embedding per tweet: a 2-D feature matrix for each split.
X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)

In [None]:
### out of time #1 (Trump tweets)

# Rows at positions 400-499 of the cleaned, de-duplicated frame; tokenised and averaged like the training data.
real_world_test_sample = real_world_test.iloc[400:500]
real_world_tokenized = real_world_test_sample.apply(lambda r: w2v_tokenize_text(r['tweet']), axis=1).values

out_of_time_test = word_averaging_list(wv,real_world_tokenized)

### out of time #2 (scraped "racist" tweets)

real_world_test_sample_racist = real_world_test_racist.iloc[200:400]
real_world_tokenized_racist = real_world_test_sample_racist.apply(lambda r: w2v_tokenize_text(r['tweet']), axis=1).values

out_of_time_test_racist = word_averaging_list(wv,real_world_tokenized_racist)


### out of time #3 (POK)

pok_sample = pok.iloc[200:400]
pok_tokenized = pok_sample.apply(lambda r: w2v_tokenize_text(r['tweet']), axis=1).values

pok_out_of_time = word_averaging_list(wv,pok_tokenized)

### out of time #4 (Jigsaw, all rows)

# NOTE: `jigsaw_testing` is re-bound three times: frame copy -> array of token lists -> feature matrix.
jigsaw_testing = jigsaw.copy()
jigsaw_testing = jigsaw_testing.apply(lambda r: w2v_tokenize_text(r['tweet']), axis=1).values

jigsaw_testing = word_averaging_list(wv,jigsaw_testing)


### Model Training

In [None]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix, accuracy_score

In [None]:
def model_training(clf, x_t, y_t, x_v=None , y_v=None ,task='binary:logistic'):
    """Fit ``clf`` and print training and (optionally) validation metrics.

    NOTE: this re-defines (and shadows) the function of the same name from the
    TF-IDF section of the notebook.

    Args:
        clf: Estimator exposing ``fit``, ``score`` and ``predict`` (and
            ``predict_proba`` for the binary task).
        x_t, y_t: Training features and targets.
        x_v, y_v: Optional validation features and targets. If either is
            ``None`` only the training score is printed.
        task: ``'binary:logistic'`` prints accuracy, F1, ROC-AUC and the
            confusion matrix; ``'reg:linear'`` prints ``clf.score`` (labelled
            r2_score) and the mean squared error.

    Returns:
        The fitted ``clf``.
    """
    clf.fit(x_t, y_t)
    print('training accuracy', clf.score(x_t, y_t))

    # Both validation branches used to run even when x_v / y_v were left at their
    # `None` defaults and crashed inside the estimator; skip them instead.
    if x_v is None or y_v is None:
        return clf

    if task == 'binary:logistic':
        y_pred = clf.predict(x_v)  # predicted once, reused for F1 and the confusion matrix
        print('validation accuracy', clf.score(x_v, y_v))
        # scikit-learn metrics take (y_true, y_pred); binary F1 is the same either way.
        print('validation f1_score', f1_score(y_v, y_pred))
        print('validation roc_auc score', roc_auc_score(y_v, clf.predict_proba(x_v)[:, -1]))
        print('confusion matrix \n', confusion_matrix(y_v, y_pred))
    elif task == 'reg:linear':
        # Imported here because the notebook never imports it at cell level.
        from sklearn.metrics import mean_squared_error
        print('validation r2_score', clf.score(x_v, y_v))
        print('validation MSE', mean_squared_error(y_v, clf.predict(x_v)))

    return clf

In [None]:
%%time
# Same XGBoost configuration as the TF-IDF model, now trained on the averaged word2vec features.
xgb_w2v = XGBClassifier(n_estimators=500, max_depth=5, learning_rate=0.1, scale_pos_weight=1.4266790777602751)
model_training(xgb_w2v,X_train_word_average,train_w2v['class'],X_test_word_average,test_w2v['class'])

In [None]:
# Re-print the headline validation metrics of the word2vec model.
print('validation accuracy', xgb_w2v.score(X_test_word_average,test_w2v['class']))
# Arguments in scikit-learn's (y_true, y_pred) order; the binary F1 value is unchanged by the swap.
print('validation f1_score',f1_score(test_w2v['class'],xgb_w2v.predict(X_test_word_average)))

In [None]:
# import joblib
# #save model
# joblib.dump(xgb_w2v, 'xgb_final_model.pkl') 

# # #load saved model
# # xgb = joblib.load(filename)

In [None]:
# Confusion matrix at the default decision threshold (rows = true class, columns = predicted class).
confusion_matrix(test_w2v['class'],xgb_w2v.predict(X_test_word_average))

In [None]:
# Flag a tweet as hate when the probability of the last class column exceeds 0.3, then recompute the confusion matrix.
tuned_pred = (xgb_w2v.predict_proba(X_test_word_average)[::,-1]>0.3).astype(int)
confusion_matrix(test_w2v['class'],tuned_pred)

In [None]:
# F1 of the 0.3-threshold predictions on the validation split.
f1_score(test_w2v['class'],tuned_pred)

In [None]:
# Class balance of the validation split, for reading the confusion matrices above.
test_w2v['class'].value_counts()

In [None]:
# Show full tweet text in DataFrame output. `None` means "no truncation"; the old `-1`
# spelling has been deprecated since pandas 1.0 and is not accepted by recent releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# Displays the repr of the TF-IDF vectorizer from the earlier section.
# NOTE(review): looks like a leftover — the word2vec model in this section does not use it.
tfidf

#### Out of time #1 -  Trump Tweets

In [None]:
# Re-read the raw file so the untouched tweet text can be shown next to the predictions.
real_world_test_original = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/real_world_test/realDonaldTrump_tweets_extended.csv')

# The predictions were computed on rows 400-499 (by position) of the DE-DUPLICATED frame,
# whose index labels are no longer 400..499 once duplicates have been dropped. Select the
# raw rows by those index labels so text, processed text and predictions refer to the same
# tweets; `.copy()` makes the column assignments write to an independent frame instead of
# a slice of the original.
scored_rows = real_world_test.iloc[400:500]
check_df = real_world_test_original.loc[scored_rows.index].copy()
check_df['processed_tweet'] = scored_rows['tweet']
check_df['preds'] = xgb_w2v.predict(out_of_time_test)
check_df['hate_probab'] = xgb_w2v.predict_proba(out_of_time_test)[::,-1]

# false_index = check_df[check_df['class']==xgb_w2v.predict(X_test_word_average)]['ID'].unique()

In [None]:
# Trump tweets whose predicted hate probability exceeds 0.5.
check_df[check_df['hate_probab']>0.5]

#### Out of Time #2 -  Racist Tweets
P.S. Not all of them are actually racist; we need to check them manually.

In [None]:
# Re-read the raw file so the untouched tweet text can be shown next to the predictions.
real_world_test_original_racist = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/real_world_test/racist_scarped_extended.csv', names=['timestamp','text'])

# `.copy()` so the new columns are written to an independent frame rather than to an
# `iloc` slice of the original (avoids SettingWithCopyWarning / lost assignments).
check_df_2 = real_world_test_original_racist.iloc[200:400].copy()
check_df_2['processed_tweet'] = real_world_test_racist.iloc[200:400]['tweet']
check_df_2['preds'] = xgb_w2v.predict(out_of_time_test_racist)
check_df_2['hate_probab'] = xgb_w2v.predict_proba(out_of_time_test_racist)[::,-1]

# false_index = check_df[check_df['class']==xgb_w2v.predict(X_test_word_average)]['ID'].unique()

In [None]:
# Tweets from the second real-world set whose predicted hate probability exceeds 0.4.
check_df_2[check_df_2['hate_probab']>0.4]

#### Out of time #3 - POK

In [None]:
# Re-read the raw file and drop duplicate texts exactly as was done for `pok`, so both
# frames keep the same rows and index labels.
pok_original = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/real_world_test/pok.csv', names=['timestamp','text'])
pok_original = pok_original.drop_duplicates(subset='text')

# `.copy()` so the new columns are written to an independent frame rather than to an
# `iloc` slice of the original (avoids SettingWithCopyWarning / lost assignments).
check_df_3 = pok_original.iloc[200:400].copy()
check_df_3['processed_tweet'] = pok.iloc[200:400]['tweet']
check_df_3['preds'] = xgb_w2v.predict(pok_out_of_time)
check_df_3['hate_probab'] = xgb_w2v.predict_proba(pok_out_of_time)[::,-1]

# false_index = check_df[check_df['class']==xgb_w2v.predict(X_test_word_average)]['ID'].unique()

In [None]:
# POK tweets whose predicted hate probability exceeds 0.4.
check_df_3[check_df_3['hate_probab']>0.4]

### Jigsaw

In [None]:

# NOTE: re-binds `check_df_3` (previously the POK frame) to a copy of the Jigsaw frame.
check_df_3 = jigsaw.copy()
check_df_3['processed_tweet'] = jigsaw['tweet']
# Predictions for every Jigsaw row, in row order (jigsaw_testing was built from all rows of `jigsaw`).
check_df_3['preds'] = xgb_w2v.predict(jigsaw_testing)
check_df_3['hate_probab'] = xgb_w2v.predict_proba(jigsaw_testing)[::,-1]

# false_index = check_df[check_df['class']==xgb_w2v.predict(X_test_word_average)]['ID'].unique()

In [None]:
# Row-wise sum over the label columns from 'toxic' through 'identity_hate' (inclusive label slice).
check_df_3['label'] = check_df_3.loc[:,'toxic':'identity_hate'].sum(axis=1)

In [None]:
# Binarise the target: any row with at least one positive flag gets label 1, the rest stay 0.
index = check_df_3[check_df_3['label']>0].index
check_df_3.loc[index,'label']=1

In [None]:
# F1 on Jigsaw at a 0.4 probability threshold, with arguments in scikit-learn's
# (y_true, y_pred) order (the binary F1 value itself is unchanged by the swap).
f1_score(check_df_3['label'], (check_df_3['hate_probab']>0.4).astype(int))

In [None]:
# (y_true, y_pred) order so rows are true classes and columns are predictions, like every
# other confusion matrix in this notebook; the previous argument order showed the transpose.
confusion_matrix(check_df_3['label'], (check_df_3['hate_probab']>0.4).astype(int))

In [None]:
# Accuracy on Jigsaw, with arguments in scikit-learn's (y_true, y_pred) order.
# NOTE(review): this uses a 0.7 threshold while the F1 / confusion matrix above use 0.4 — confirm that is intended.
accuracy_score(check_df_3['label'], (check_df_3['hate_probab']>0.7).astype(int))

In [None]:
# Class balance of the binarised Jigsaw labels.
check_df_3['label'].value_counts()

In [None]:
# Rows where the default-threshold prediction disagrees with the binarised label.
# NOTE(review): this displays the whole filtered frame — consider `.head()` / `.sample()` to keep the output small.
check_df_3[check_df_3['preds']!=check_df_3['label']]#['tweet']