Analysis on scraped dataset

### Import

In [None]:
import pandas as pd
import numpy as np

In [None]:
import pickle
import sys
import nltk
from nltk.stem.porter import *

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

In [None]:
# Mount Google Drive so the dataset paths under /content/drive resolve.
# NOTE(review): this only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the two tweet dumps; the first file is read with an explicit
# ISO-8859-1 encoding, the second with the pandas default.
cf_data_1 = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/data_1/hatespeech_NAACL_SRW.csv',encoding = "ISO-8859-1")
cf_data_2 = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/data_1/hatespeech_NLP+CSS.csv')

## this is the scraped data

In [None]:
# Label files: the first has no header row (columns are named here), the
# second is whitespace-separated.
labels_1 = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/data_1/NAACL_SRW_2016.csv',header=None,names=['ID','class'])
# Raw string avoids the invalid-escape warning for '\s'; a regex separator
# needs the python parser engine, so request it explicitly.
labels_2 = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/data_1/NLP+CSS_2016.csv',sep=r'\s',engine='python')

In [None]:
# rename_axis() no longer accepts a column mapping; rename(columns=...) is
# the supported way to relabel columns.
labels_2 = labels_2.rename(columns={'TweetID':'ID','Expert':'class'})

### First, let's explore cf_data_1

In [None]:
# rename_axis() no longer accepts a column mapping; rename(columns=...) is
# the supported way to relabel the unnamed index column.
cf_data_1 = cf_data_1.rename(columns={'Unnamed: 0':'index_col'})
cf_data_2 = cf_data_2.rename(columns={'Unnamed: 0':'index_col'})

#### Merging cf_data_1 with its labels

### Function for merging

In [None]:
def label_merging(data, labels):
    """Inner-join scraped tweets with their labels on the integer ``ID`` column.

    Both inputs are left untouched: the ID normalisation is done on copies, so
    the cell can be re-run and the raw frames stay inspectable.

    Parameters
    ----------
    data : pandas.DataFrame
        Scraped tweets; must contain an ``ID`` column. Missing IDs are
        replaced with 0 before the cast to int.
    labels : pandas.DataFrame
        Label table; must contain an ``ID`` column castable to int.

    Returns
    -------
    pandas.DataFrame
        Rows of ``data`` whose ``ID`` also appears in ``labels``, with the
        label columns appended.
    """
    labels = labels.assign(ID=labels['ID'].astype(int))
    print(labels['ID'].nunique())
    print('Null IDs in data = ', data['ID'].isna().sum())

    # Assigning the result (instead of chained fillna(inplace=True)) also
    # works when pandas copy-on-write is enabled.
    data = data.assign(ID=data['ID'].fillna(0).astype(int))

    print('data shape =', data.shape)
    print('IDs common in data and labels =', int(data['ID'].isin(labels['ID']).sum()))

    return data.merge(labels, on='ID', how='inner')

In [None]:
train_1 = label_merging(cf_data_1,labels_1)

In [None]:
train_2 = label_merging(cf_data_2, labels_2)

In [None]:
train_1['class'].value_counts()

In [None]:
train_2['class'].value_counts()

In [None]:
# Keep only the columns both labelled sets share, then stack them into a
# single frame with a fresh 0..n-1 index.
shared_columns = ['ID','Tweets','class']
t1 = train_1[shared_columns]
t2 = train_2[shared_columns]
merged = pd.concat([t1, t2], ignore_index=True)

### Target Analysis

In [None]:
merged['class'].value_counts()

### Very basic exploration

In [None]:
train = merged.copy()

In [None]:
train.rename(columns={'Tweets':'tweet'},inplace=True)

In [None]:
train['tweet'] = train['tweet'].astype(str)

In [None]:
## word and char count, avg_word length and all
# split() with no argument collapses runs of whitespace, so repeated or
# leading spaces no longer count as empty "words"; this also matches the
# tokenisation used by avg_word.
train['word_count'] = train['tweet'].apply(lambda x: len(str(x).split()))
train[['tweet','word_count']].head()

In [None]:
# Character count per tweet; whitespace is included in the length.
train['char_count'] = train['tweet'].str.len() ## this also includes spaces
train[['tweet','char_count']].head()

In [None]:
def avg_word(sentence):
    """Return the mean length of the whitespace-separated words in ``sentence``.

    Returns 0.0 when the sentence has no words (empty or whitespace-only),
    instead of raising ZeroDivisionError.
    """
    words = sentence.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

# Average word length per tweet, computed row by row with avg_word.
train['avg_word'] = train['tweet'].apply(avg_word)
train[['tweet','avg_word']].head()

#### Hashtags

In [None]:
# Number of whitespace-separated tokens that begin with '#'.
train['hastags'] = train['tweet'].apply(
    lambda tweet: sum(1 for token in tweet.split() if token.startswith('#'))
)
train[['tweet','hastags']].head()

### Basic Preprocessing

Every word preceded by @ is the Twitter handle of a user, which shouldn't be considered in our analysis, so let's clean the text by removing each @ along with the word that follows it.

#### Removing user IDs from text

In [None]:
# Drop every token that contains '@' anywhere in it.
train['tweet'] = train['tweet'].apply(
    lambda tweet: ' '.join(token for token in tweet.split() if '@' not in token)
)

In [None]:
## removing http links
# Any token containing the substring 'http' is discarded.
train['tweet'] = train['tweet'].apply(
    lambda tweet: ' '.join(token for token in tweet.split() if 'http' not in token)
)

In [None]:
## removing number
# Strip digit characters one by one; everything else, including spaces, stays.
train['tweet'] = train['tweet'].apply(
    lambda tweet: ''.join(filter(lambda ch: not ch.isdigit(), tweet))
)

#### Converting to Lower case

In [None]:
# Lower-case each token and re-join with single spaces, which also
# collapses any repeated whitespace.
train['tweet'] = train['tweet'].apply(lambda tweet: " ".join(map(str.lower, tweet.split())))
train['tweet'].head()

Punctuation is mostly of no use here, so we remove it.

In [None]:
# regex=True is required: since pandas 2.0 str.replace treats the pattern as
# a literal string by default, which would leave all punctuation in place.
# The raw string keeps \w and \s as regex escapes.
train['tweet'] = train['tweet'].str.replace(r'[^\w\s]', '', regex=True)
train['tweet'].head()

#### Removing stopwords

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Membership is tested once per token, so use a set for O(1) lookups; the
# filtered text is identical to filtering against the list.
stop_lookup = set(stop)
train['tweet'] = train['tweet'].apply(lambda x: " ".join(word for word in x.split() if word not in stop_lookup))
train['tweet'].head()

#### Most common and rare words

In [None]:
# Corpus-wide token frequencies: join all tweets into one string, split on
# whitespace and count each distinct token (most frequent first).
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()#[:10]
freq

`rt` is a very frequent word, occurring in almost every tweet, and it doesn't hold any meaning, so let's just remove it. I guess there are more such words, but the remaining frequent ones all seem to be hate-speech terms. So we will keep the hate-speech words from the list above and remove the rest.

In [None]:
remove_word = ['rt','mkr','im','dont','like','people','get','think','would','cant']

In [None]:
# Filter out the hand-picked uninformative tokens listed in remove_word.
train['tweet'] = train['tweet'].apply(
    lambda tweet: " ".join(token for token in tweet.split() if token not in remove_word)
)
train['tweet'].head()

In [None]:
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()#[-20000:]
freq

In [None]:
freq.shape

It doesn't really make sense to remove rare words, i.e. the words with count 1, because we might lose hateful words this way.

In [None]:
from textblob import TextBlob
nltk.download('punkt')
TextBlob(train['tweet'][1]).words

#### Lemmatization
smart stemming

In [None]:
from textblob import Word
nltk.download('wordnet')
# Lemmatize token by token with TextBlob's Word wrapper and re-join with
# single spaces.
train['tweet'] = train['tweet'].apply(
    lambda tweet: " ".join(Word(token).lemmatize() for token in tweet.split())
)
train['tweet'].head()

##### So now it seems that the data is almost clean! Let's explore advanced preprocessing / text representation techniques

### Target creation

In [None]:
train['class'].unique()#.isna().sum()

In [None]:
# Binary target: 0 = not hateful ('neither'/'none'), 1 = hateful.
class_to_target = {'neither': 0, 'none': 0, 'sexism': 1, 'racism': 1, 'both': 1}
# Assign the result back: replace(..., inplace=True) on a column selection
# does not update the frame under pandas copy-on-write. infer_objects() turns
# the column into an integer dtype when every label was mapped.
train['class'] = train['class'].replace(class_to_target).infer_objects()
train['class'].value_counts()

### N-Grams

In [None]:
TextBlob(train['tweet'][0]).ngrams(2)

### TF-IDF

In [None]:


from sklearn.model_selection import train_test_split
# Hold out 20% of the tweets for validation; the fixed random_state makes the
# split repeatable.
x_t,x_v,y_t,y_v = train_test_split(train['tweet'],train['class'],test_size=0.2,random_state=234)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=100, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))

# Learn the vocabulary and IDF weights from the training split only; fitting
# on the full dataset would leak validation-set statistics into the features.
# NOTE: x_t / x_v are rebound from text to TF-IDF matrices here, so this cell
# must be preceded by a fresh run of the train_test_split cell.
x_t = tfidf.fit_transform(x_t)
x_v = tfidf.transform(x_v)


In [None]:
1 - y_v.mean()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix, accuracy_score

In [None]:
# Fixed seed so repeated runs of the notebook give the same model.
clf = GradientBoostingClassifier(learning_rate=0.1, max_depth=15, random_state=234)

In [None]:
def model_training(clf, x_t, y_t, x_v=None , y_v=None ,task='binary:logistic'):
    """Fit ``clf`` on the training data and print training/validation metrics.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit``, ``score`` and ``predict`` (and
        ``predict_proba`` for the binary task).
    x_t, y_t : training features and targets.
    x_v, y_v : optional validation features and targets. When ``x_v`` is
        None only the training score is printed.
    task : str
        ``'binary:logistic'`` prints accuracy, F1, ROC-AUC and the confusion
        matrix; ``'reg:linear'`` prints R^2 and MSE. Any other value prints
        no validation metrics.

    Returns
    -------
    The fitted ``clf``.
    """
    clf.fit(x_t, y_t)
    print('training accuracy', clf.score(x_t, y_t))

    # Identity check: `x_v != None` is evaluated elementwise on numpy arrays
    # and makes the `if` raise "truth value of an array is ambiguous".
    if x_v is None:
        return clf

    if task == 'binary:logistic':
        val_pred = clf.predict(x_v)  # computed once, reused by the metrics below
        print('validation accuracy', clf.score(x_v, y_v))
        # scikit-learn metrics take (y_true, y_pred) in that order.
        print('validation f1_score', f1_score(y_v, val_pred))
        print('validation roc_auc score', roc_auc_score(y_v, clf.predict_proba(x_v)[:, -1]))
        print('confusion matrix \n', confusion_matrix(y_v, val_pred))
    elif task == 'reg:linear':
        # Imported here because the notebook's metrics import cell does not
        # bring mean_squared_error into scope.
        from sklearn.metrics import mean_squared_error
        print('validation r2_score', clf.score(x_v, y_v))
        print('validation MSE', mean_squared_error(y_v, clf.predict(x_v)))

    return clf

In [None]:
model_training(clf,x_t,y_t,x_v,y_v)
## Gradient boosting

In [None]:
# Re-threshold the positive-class probability (last predict_proba column) at
# 0.3 instead of the default 0.5 used by predict().
preds = (clf.predict_proba(x_v)[::,-1]>0.3).astype(int)

confusion_matrix(y_v,preds)

In [None]:
accuracy_score(y_v,preds)

In [None]:

f1_score(y_v,preds)

In [None]:
# training accuracy 0.9344425483503982
# validation accuracy 0.8589707136764287
# validation f1_score 0.6888331242158092
# validation roc_auc score 0.86558289643926
# confusion matrix 
#  [[2472  151]
#  [ 345  549]]
# GradientBoostingClassifier(criterion='friedman_mse', init=None,
#                            learning_rate=0.1, loss='deviance', max_depth=15,
#                            max_features=None, max_leaf_nodes=None,
#                            min_impurity_decrease=0.0, min_impurity_split=None,
#                            min_samples_leaf=1, min_samples_split=2,
#                            min_weight_fraction_leaf=0.0, n_estimators=100,
#                            n_iter_no_change=None, presort='auto',
#                            random_state=None, subsample=1.0, tol=0.0001,
#                            validation_fraction=0.1, verbose=0,
#                            warm_start=False)

In [None]:
lgr =  LogisticRegression(n_jobs=1, C=1e5)
model_training(lgr,x_t,y_t,x_v,y_v)

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped
# precision and recall trade places and support counts the predictions.
print(classification_report(y_v, clf.predict(x_v)))

In [None]:
# Prevalence-matched thresholding: flag the top-k most probable tweets as
# hateful, where k is the number of positives expected in the validation set.
n_val = x_v.shape[0]  # no need to densify the sparse matrix just for its shape
# k is derived from the *training* prevalence; using y_v here would leak the
# validation labels into the decision rule being evaluated.
n_positive = int(n_val * y_t.mean())

probabs_df = pd.DataFrame(y_v.reset_index(drop=True))
probabs_df['probab'] = lgr.predict_proba(x_v)[:, -1]
probabs_df = probabs_df.sort_values(by = 'probab',ascending=False).reset_index(drop=True)
# Rows are sorted by descending probability, so the first n_positive rows get 1.
probabs_df['preds'] = (np.arange(n_val) < n_positive).astype(int)
np.mean(probabs_df['class']==probabs_df['preds'])

In [None]:
confusion_matrix(probabs_df['class'],probabs_df['preds'])

In [None]:
f1_score(probabs_df['class'],probabs_df['preds'])

In [None]:
### Woah thats quite an improvement, lets lock on this strategy

### Word2Vec

In [None]:
import gensim
import logging

In [None]:
from gensim.models import Word2Vec
# Load the pretrained GoogleNews 300-d word2vec vectors (binary format) from Drive.
wv = gensim.models.KeyedVectors.load_word2vec_format("/content/drive/My Drive/Hate_Speech_Detection_git/GoogleNews-vectors-negative300.bin.gz", binary=True)
# Alternative embeddings tried previously:
# wv = gensim. models.KeyedVectors.load("/content/drive/My Drive/Hate_Speech_Detection_git/hate_embeddings#1.kv")
# wv = gensim.models.KeyedVectors.load_word2vec_format("/content/drive/My Drive/Hate_Speech_Detection_git/model_latest.txt", binary=False)

# wv = gensim.models.KeyedVectors.load_word2vec_format("/content/drive/My Drive/Hate_Speech_Detection_git/model_transfer_learning.txt", binary=False)

# NOTE(review): init_sims is deprecated in gensim >= 4 — confirm the installed
# gensim version; word_averaging below reads wv.vocab / wv.syn0norm.
wv.init_sims(replace=True)

In [None]:
def word_averaging(wv, words):
    """Return the unit-normalised mean embedding of ``words``.

    Parameters
    ----------
    wv : keyed-vectors object
        Must expose ``vocab``, ``syn0norm`` and ``vector_size``.
        NOTE(review): ``vocab`` / ``syn0norm`` look like the pre-4.0 gensim
        API — confirm against the installed gensim version.
    words : iterable
        Tokens to average. Items that are already ``np.ndarray`` vectors are
        used as-is; tokens missing from ``wv.vocab`` are skipped.

    Returns
    -------
    np.ndarray
        float32 vector of length ``wv.vector_size``; all zeros (with a logged
        warning) when none of the tokens could be embedded.
    """
    vectors = []

    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv.syn0norm[wv.vocab[word].index])

    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        # float32 matches the dtype of the regular return value, so stacking
        # the results does not silently upcast the whole feature matrix.
        return np.zeros(wv.vector_size, dtype=np.float32)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def  word_averaging_list(wv, text_list):
    """Stack the averaged embedding of every tokenised post into one 2-D array."""
    averaged_rows = []
    for post in text_list:
        averaged_rows.append(word_averaging(wv, post))
    return np.vstack(averaged_rows)

In [None]:
def w2v_tokenize_text(text):
    """Tokenise ``text`` with NLTK (sentences, then words), dropping tokens
    shorter than two characters."""
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='english')
        for token in nltk.word_tokenize(sentence, language='english')
        if len(token) >= 2
    ]
    
# Separate 80/20 split of the full frame for the word2vec experiment.
# NOTE(review): this uses random_state=42, so the rows differ from the TF-IDF
# split above (random_state=234) — scores are not measured on the same tweets.
train_w2v, test_w2v = train_test_split(train, test_size=0.2, random_state = 42)
# x_t,x_v,y_t,y_v = train_test_split(train['tweet'],train['class'],test_size=0.2,random_state=234)

# Tokenise each tweet; .values yields an object array of token lists.
test_tokenized = test_w2v.apply(lambda r: w2v_tokenize_text(r['tweet']), axis=1).values
train_tokenized = train_w2v.apply(lambda r: w2v_tokenize_text(r['tweet']), axis=1).values

# One averaged embedding row per tweet.
X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix, accuracy_score

In [None]:
# Logistic regression on the averaged word2vec features.
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_word_average, train_w2v['class'])
y_pred = logreg.predict(X_test_word_average)
# All metrics are called as (y_true, y_pred), matching the report and the
# confusion matrix below.
print('accuracy %s' % accuracy_score(test_w2v['class'], y_pred))
print(classification_report(test_w2v['class'], y_pred))
print(confusion_matrix(test_w2v['class'],y_pred))

In [None]:
# Recorded output of the logistic-regression run above, kept as comments so
# this cell does not raise a SyntaxError on Run All.
# accuracy 0.800966733011089
#               precision    recall  f1-score   support
#
#            0       0.82      0.93      0.88      2652
#            1       0.66      0.39      0.49       865
#
#     accuracy                           0.80      3517
#    macro avg       0.74      0.66      0.68      3517
# weighted avg       0.78      0.80      0.78      3517
#
# [[2479  173]
#  [ 527  338]]

In [None]:
# F1 of the positive class for the word2vec logistic regression.
f1_score(test_w2v['class'],y_pred)
##  10% increment (NOTE(review): the baseline for this comparison is not stated — confirm)

In [None]:
# Gradient boosting on the averaged word2vec features.
# NOTE: this rebinds `clf`, replacing the TF-IDF gradient-boosting model
# defined earlier in the notebook.
clf = GradientBoostingClassifier(learning_rate=0.1, max_depth=10)#trees, 
model_training(clf,X_train_word_average,train_w2v['class'],X_test_word_average,test_w2v['class'])