Analysis on scraped dataset

### Import

In [0]:
import pandas as pd
import numpy as np

In [0]:
import pickle
import sys
import nltk
from nltk.stem.porter import *

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

In [80]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
cf_data_1 = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/data_1/hatespeech_NAACL_SRW.csv',encoding = "ISO-8859-1")
cf_data_2 = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/data_1/hatespeech_NLP+CSS.csv')

## this is the scraped data

In [82]:
labels_1 = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/data_1/NAACL_SRW_2016.csv',header=None,names=['ID','class'])
labels_2 = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/data_1/NLP+CSS_2016.csv',sep='\s')

  


In [83]:
labels_2.rename_axis({'TweetID':'ID','Expert':'class'},axis=1,inplace=True)

  """Entry point for launching an IPython kernel.


### First lets explore cf_data_1

In [84]:
cf_data_1.rename_axis({'Unnamed: 0':'index_col'},axis=1,inplace=True)
cf_data_2.rename_axis({'Unnamed: 0':'index_col'},axis=1,inplace=True)

  """Entry point for launching an IPython kernel.
  


#### Merging the cf_data_1 with its labelss

### Function for merging

In [0]:
def label_merging(data, labels):
    labels['ID'] = labels['ID'].astype(int)
    print(labels['ID'].nunique())
    print('Null IDs in data 1 = ' ,data['ID'].isna().sum())
    
    data['ID'].fillna(0,inplace=True)
    data['ID'] = data['ID'].astype(int)
    
    print('data shape ='  ,data.shape)
    print('IDs common in data and labels =',sum(data['ID'].isin(labels['ID'])))
    
    train = data.merge(labels, on='ID',how='inner')#['class'].isna().sum()
    return train

In [86]:
train_1 = label_merging(cf_data_1,labels_1)

16849
Null IDs in data 1 =  2
data shape = (16037, 11)
IDs common in data and labels = 11238


In [87]:
train_2 = label_merging(cf_data_2, labels_2)

6909
Null IDs in data 1 =  0
data shape = (6271, 11)
IDs common in data and labels = 6271


In [88]:
train_1['class'].value_counts()

none      7733
sexism    2258
racism    1319
Name: class, dtype: int64

In [89]:
train_2['class'].value_counts()

neither    5526
sexism      639
racism       80
both         26
Name: class, dtype: int64

In [0]:
t1 = train_1[['ID','Tweets','class']]
t2 = train_2[['ID','Tweets','class']]
merged = pd.concat([t1,t2],axis=0).reset_index(drop=True)

### Target Analysis

In [91]:
merged['class'].value_counts()

none       7733
neither    5526
sexism     2897
racism     1399
both         26
Name: class, dtype: int64

### Very basic exploration

In [0]:
train = merged.copy()

In [0]:
train.rename(columns={'Tweets':'tweet'},inplace=True)

In [0]:
train['tweet'] = train['tweet'].astype(str)

In [95]:
## word and char count, avg_word length and all
train['word_count'] = train['tweet'].apply(lambda x: len(str(x).split(" ")))
train[['tweet','word_count']].head()

Unnamed: 0,tweet,word_count
0,"@ummayman90 Again, your entire concept of god ...",20
1,"@anjemchoudary Your prophet was a rapist, murd...",21
2,RT @DilanaKurdi: Yazidi children who are taken...,17
3,These girls are the equivalent of the irritati...,18
4,@lauracdean I love how the Islamofascists recr...,24


In [96]:
train['char_count'] = train['tweet'].str.len() ## this also includes spaces
train[['tweet','char_count']].head()

Unnamed: 0,tweet,char_count
0,"@ummayman90 Again, your entire concept of god ...",133
1,"@anjemchoudary Your prophet was a rapist, murd...",140
2,RT @DilanaKurdi: Yazidi children who are taken...,135
3,These girls are the equivalent of the irritati...,99
4,@lauracdean I love how the Islamofascists recr...,133


In [97]:
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

train['avg_word'] = train['tweet'].apply(lambda x: avg_word(x))
train[['tweet','avg_word']].head()

Unnamed: 0,tweet,avg_word
0,"@ummayman90 Again, your entire concept of god ...",5.7
1,"@anjemchoudary Your prophet was a rapist, murd...",5.714286
2,RT @DilanaKurdi: Yazidi children who are taken...,7.0
3,These girls are the equivalent of the irritati...,4.555556
4,@lauracdean I love how the Islamofascists recr...,4.583333


#### Hashtags

In [98]:
train['hastags'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
train[['tweet','hastags']].head()

Unnamed: 0,tweet,hastags
0,"@ummayman90 Again, your entire concept of god ...",1
1,"@anjemchoudary Your prophet was a rapist, murd...",0
2,RT @DilanaKurdi: Yazidi children who are taken...,2
3,These girls are the equivalent of the irritati...,1
4,@lauracdean I love how the Islamofascists recr...,0


### Basic Preprocessing

Every word followed by @ is some twitter ID of an user, which shouldn't be considered in our analysis, so lets do the stemming, where we remove @ alonwith the word followed by it

#### Removing user IDs from text

In [0]:
train['tweet'] = train['tweet'].apply(lambda x:' '.join(i for i in [a for a in x.split() if a.find('@')==-1]))

In [0]:
train['tweet'] = train['tweet'].apply(lambda x:' '.join(i for i in [a for a in x.split() if a.find('http')==-1]))
## removing http links

In [0]:
train['tweet'] = train['tweet'].apply(lambda x:''.join([i for i in x if not i.isdigit()]))
## removing number

#### Converting to Lower case

In [101]:
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['tweet'].head()

0    again, your entire concept of god corresponds ...
1    your prophet was a rapist, murderer, pedophile...
2    rt yazidi children who are taken from their pa...
3    these girls are the equivalent of the irritati...
4    i love how the islamofascists recruit 14 and 1...
Name: tweet, dtype: object

There is no use of punctuation mostly, so removing it

In [102]:
train['tweet'] = train['tweet'].str.replace('[^\w\s]','')
train['tweet'].head()

0    again your entire concept of god corresponds t...
1    your prophet was a rapist murderer pedophile c...
2    rt yazidi children who are taken from their pa...
3    these girls are the equivalent of the irritati...
4    i love how the islamofascists recruit 14 and 1...
Name: tweet, dtype: object

#### Removing stopwords

In [103]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [104]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['tweet'].head()

0    entire concept god corresponds tyrannical eart...
1    prophet rapist murderer pedophile caravan robb...
2    rt yazidi children taken parents forcibly conv...
3    girls equivalent irritating asian girls couple...
4    love islamofascists recruit 14 15 year old jih...
Name: tweet, dtype: object

#### Most common and rare words

In [105]:
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()#[:10]
freq

mkr                    3574
rt                     2894
im                     1541
like                   1045
dont                    965
women                   831
people                  790
sexist                  778
get                     693
kat                     663
one                     614
amp                     608
islam                   580
think                   566
know                    537
would                   535
feminazi                534
really                  463
time                    459
cant                    429
want                    421
going                   415
good                    410
go                      393
make                    379
ive                     378
see                     378
girls                   371
oh                      359
men                     345
                       ... 
khadgar                   1
rentcar                   1
boris                     1
secularism                1
laidback            

rt is a very frequent word occuring almost in every tweet, and it doesnt hold any meaning so lets just remove it, I guess it there are such more words but those are all seem to be hatespeech. So we will just keep hate speechwords from above list, lets remove the rest

In [0]:
remove_word = ['rt','mkr','im','dont','like','people','get','think','would','cant']

In [107]:
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in remove_word))
train['tweet'].head()

0    entire concept god corresponds tyrannical eart...
1    prophet rapist murderer pedophile caravan robb...
2    yazidi children taken parents forcibly convert...
3    girls equivalent irritating asian girls couple...
4    love islamofascists recruit 14 15 year old jih...
Name: tweet, dtype: object

In [108]:
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()#[-20000:]
freq

women                    831
sexist                   778
kat                      663
one                      614
amp                      608
islam                    580
know                     537
feminazi                 534
really                   463
time                     459
want                     421
going                    415
good                     410
go                       393
make                     379
ive                      378
see                      378
girls                    371
oh                       359
men                      345
thats                    340
need                     334
even                     331
isis                     326
andre                    321
still                    318
muslims                  317
youre                    311
never                    310
well                     307
                        ... 
watery                     1
homophobic                 1
owing                      1
mkr15         

In [109]:
freq.shape

(16527,)

Doesnt really make sense to remove rare words, i.e. the words with count 1. Because we might lose hateful words this way

In [110]:
from textblob import TextBlob
nltk.download('punkt')
TextBlob(train['tweet'][1]).words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


WordList(['prophet', 'rapist', 'murderer', 'pedophile', 'caravan', 'robber', 'slave', 'trader', 'bigot', 'sexist', 'god', 'never', 'use', 'scum'])

#### Lemmetization
smart stemming

In [111]:
from textblob import Word
nltk.download('wordnet')
train['tweet'] = train['tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['tweet'].head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    entire concept god corresponds tyrannical eart...
1    prophet rapist murderer pedophile caravan robb...
2    yazidi child taken parent forcibly converted i...
3    girl equivalent irritating asian girl couple y...
4    love islamofascists recruit 14 15 year old jih...
Name: tweet, dtype: object

##### So now it seems that the data is almost clean! Lets explore advanced preprocessing/ text representation techniques

### Target creation

In [112]:
train['class'].unique()#.isna().sum()

array(['racism', 'none', 'sexism', 'neither', 'both'], dtype=object)

In [113]:
train['class'].replace(['neither','none', 'sexism', 'racism', 'both'],[0,0,1,1,1],inplace=True)
train['class'].value_counts()

0    13259
1     4322
Name: class, dtype: int64

### N-Grams

In [114]:
TextBlob(train['tweet'][0]).ngrams(2)

[WordList(['entire', 'concept']),
 WordList(['concept', 'god']),
 WordList(['god', 'corresponds']),
 WordList(['corresponds', 'tyrannical']),
 WordList(['tyrannical', 'earthly']),
 WordList(['earthly', 'egomaniac']),
 WordList(['egomaniac', 'simple']),
 WordList(['simple', 'stupid']),
 WordList(['stupid', 'islam'])]

### TF-IDF

In [0]:


from sklearn.model_selection import train_test_split
x_t,x_v,y_t,y_v = train_test_split(train['tweet'],train['class'],test_size=0.2,random_state=234)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=100, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))

tfidf.fit(train['tweet'])

x_t = tfidf.transform(x_t)
x_v = tfidf.transform(x_v)


In [117]:
1 - y_v.mean()

0.745806084731305

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [0]:
from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix, accuracy_score

In [0]:
clf = GradientBoostingClassifier(learning_rate=0.1, max_depth=15)

In [0]:
def model_training(clf, x_t, y_t, x_v=None , y_v=None ,task='binary:logistic'):
    clf.fit(x_t,y_t)
    print('training accuracy', clf.score(x_t,y_t))
    
    if task=='binary:logistic':
        if x_v!=None:
            print('validation accuracy', clf.score(x_v,y_v))
            print('validation f1_score',f1_score(clf.predict(x_v),y_v))
            print('validation roc_auc score',roc_auc_score(y_v,clf.predict_proba(x_v)[::,-1]))
            print('confusion matrix \n',confusion_matrix(y_v, clf.predict(x_v)))
    
    if task=='reg:linear':
        if x_v!=None:
            print('validation r2_score', clf.score(x_v,y_v))
            print('validation MSE',mean_squared_error(clf.predict(x_v),y_v))

            
    return clf

In [122]:
model_training(clf,x_t,y_t,x_v,y_v)
## Gradient boosting

training accuracy 0.8919937428896473
validation accuracy 0.8396360534546489
validation f1_score 0.6279683377308708
validation roc_auc score 0.8121615616798907
confusion matrix 
 [[2477  146]
 [ 418  476]]


GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=15,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [123]:
preds = (clf.predict_proba(x_v)[::,-1]>0.3).astype(int)

confusion_matrix(y_v,preds)

array([[2407,  216],
       [ 356,  538]])

In [124]:
accuracy_score(y_v,preds)

0.8373613875462042

In [125]:

f1_score(y_v,preds)

0.6529126213592233

In [0]:
# training accuracy 0.9344425483503982
# validation accuracy 0.8589707136764287
# validation f1_score 0.6888331242158092
# validation roc_auc score 0.86558289643926
# confusion matrix 
#  [[2472  151]
#  [ 345  549]]
# GradientBoostingClassifier(criterion='friedman_mse', init=None,
#                            learning_rate=0.1, loss='deviance', max_depth=15,
#                            max_features=None, max_leaf_nodes=None,
#                            min_impurity_decrease=0.0, min_impurity_split=None,
#                            min_samples_leaf=1, min_samples_split=2,
#                            min_weight_fraction_leaf=0.0, n_estimators=100,
#                            n_iter_no_change=None, presort='auto',
#                            random_state=None, subsample=1.0, tol=0.0001,
#                            validation_fraction=0.1, verbose=0,
#                            warm_start=False)

In [127]:
lgr =  LogisticRegression(n_jobs=1, C=1e5)
model_training(lgr,x_t,y_t,x_v,y_v)

training accuracy 0.8386660978384528
validation accuracy 0.8348023883992038
validation f1_score 0.6149768058316766
validation roc_auc score 0.8119325601011872
confusion matrix 
 [[2472  151]
 [ 430  464]]




LogisticRegression(C=100000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=1, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [128]:
print(classification_report(clf.predict(x_v),y_v))

              precision    recall  f1-score   support

           0       0.94      0.86      0.90      2895
           1       0.53      0.77      0.63       622

    accuracy                           0.84      3517
   macro avg       0.74      0.81      0.76      3517
weighted avg       0.87      0.84      0.85      3517



In [129]:
probabs_df = pd.DataFrame(y_v.reset_index(drop=True))
probabs_df['probab'] = lgr.predict_proba(x_v)[::,-1]
probabs_df['preds'] = 0

probabs_df = probabs_df.sort_values(by = 'probab',ascending=False).reset_index(drop=True)
probabs_df.loc[0:int(x_v.todense().shape[0]*y_v.mean()),'preds'] = 1
probabs_df.loc[int(x_v.todense().shape[0]*y_v.mean()):,'preds'] = 0
np.mean(probabs_df['class']==probabs_df['preds'])

0.8348023883992038

In [130]:
confusion_matrix(probabs_df['class'],probabs_df['preds'])

array([[2333,  290],
       [ 291,  603]])

In [131]:
f1_score(probabs_df['class'],probabs_df['preds'])

0.6748740906547286

In [0]:
### Woah thats quite an improvement, lets lock on this strategy

### Word2Vec

In [0]:
import gensim
import logging

In [151]:
from gensim.models import Word2Vec
wv = gensim.models.KeyedVectors.load_word2vec_format("/content/drive/My Drive/Hate_Speech_Detection_git/GoogleNews-vectors-negative300.bin.gz", binary=True)
# wv = gensim. models.KeyedVectors.load("/content/drive/My Drive/Hate_Speech_Detection_git/hate_embeddings#1.kv")
# wv = gensim.models.KeyedVectors.load_word2vec_format("/content/drive/My Drive/Hate_Speech_Detection_git/model_latest.txt", binary=False)

# wv = gensim.models.KeyedVectors.load_word2vec_format("/content/drive/My Drive/Hate_Speech_Detection_git/model_transfer_learning.txt", binary=False)

wv.init_sims(replace=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, post) for post in text_list ])

In [147]:
def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens
    
train_w2v, test_w2v = train_test_split(train, test_size=0.2, random_state = 42)
# x_t,x_v,y_t,y_v = train_test_split(train['tweet'],train['class'],test_size=0.2,random_state=234)

test_tokenized = test_w2v.apply(lambda r: w2v_tokenize_text(r['tweet']), axis=1).values
train_tokenized = train_w2v.apply(lambda r: w2v_tokenize_text(r['tweet']), axis=1).values

X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)

  
  if np.issubdtype(vec.dtype, np.int):


In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix, accuracy_score

In [149]:
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_word_average, train_w2v['class'])
y_pred = logreg.predict(X_test_word_average)
print('accuracy %s' % accuracy_score(y_pred, test_w2v['class']))
print(classification_report(test_w2v['class'], y_pred))
print(confusion_matrix(test_w2v['class'],y_pred))



accuracy 0.8254193915268695
              precision    recall  f1-score   support

           0       0.86      0.92      0.89      2652
           1       0.69      0.53      0.60       865

    accuracy                           0.83      3517
   macro avg       0.77      0.73      0.74      3517
weighted avg       0.82      0.83      0.82      3517

[[2444  208]
 [ 406  459]]


In [0]:
accuracy 0.800966733011089
              precision    recall  f1-score   support

           0       0.82      0.93      0.88      2652
           1       0.66      0.39      0.49       865

    accuracy                           0.80      3517
   macro avg       0.74      0.66      0.68      3517
weighted avg       0.78      0.80      0.78      3517

[[2479  173]
 [ 527  338]]

In [150]:
f1_score(test_w2v['class'],y_pred)
##  10% increament

0.5992167101827677

In [0]:
clf = GradientBoostingClassifier(learning_rate=0.1, max_depth=10)#trees, 
model_training(clf,X_train_word_average,train_w2v['class'],X_test_word_average,test_w2v['class'])