There is a change in preprocessing in this notebook, we removed all the numbers from tweets, which helped in training more robust word2vec embeddings

Analysis on scraped dataset

### Import

In [0]:
import pandas as pd
import numpy as np

In [0]:
import pickle
import sys
import nltk
from nltk.stem.porter import *

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
cf_data_1 = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/data_1/hatespeech_NAACL_SRW.csv',encoding = "ISO-8859-1")
cf_data_2 = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/data_1/hatespeech_NLP+CSS.csv')

cf_data_3 = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/data_2/labeled_data.csv',encoding = "ISO-8859-1")
## this is the scraped data

cf_data_3.rename({'Unnamed: 0':'ID','tweet':'Tweets'},axis=1,inplace=True)

labels_1 = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/data_1/NAACL_SRW_2016.csv',header=None,names=['ID','class'])
labels_2 = pd.read_csv('/content/drive/My Drive/Hate_Speech_Detection_git/data_1/NLP+CSS_2016.csv',sep='\s')

labels_2.rename({'TweetID':'ID','Expert':'class'},axis=1,inplace=True)

cf_data_1.rename({'Unnamed: 0':'index_col'},axis=1,inplace=True)
cf_data_2.rename({'Unnamed: 0':'index_col'},axis=1,inplace=True)

  # Remove the CWD from sys.path while we load stuff.


#### Merging the cf_data_1 with its labelss

### Function for merging

In [0]:
def label_merging(data, labels):
    labels['ID'] = labels['ID'].astype(int)
    print(labels['ID'].nunique())
    print('Null IDs in data 1 = ' ,data['ID'].isna().sum())
    
    data['ID'].fillna(0,inplace=True)
    data['ID'] = data['ID'].astype(int)
    
    print('data shape ='  ,data.shape)
    print('IDs common in data and labels =',sum(data['ID'].isin(labels['ID'])))
    
    train = data.merge(labels, on='ID',how='inner')#['class'].isna().sum()
    return train

In [0]:
train_1 = label_merging(cf_data_1,labels_1)

16849
Null IDs in data 1 =  2
data shape = (16037, 11)
IDs common in data and labels = 11238


In [0]:
train_2 = label_merging(cf_data_2, labels_2)

6909
Null IDs in data 1 =  0
data shape = (6271, 11)
IDs common in data and labels = 6271


In [0]:
train_3 = cf_data_3.copy()

In [0]:
train_1['class'].value_counts()

none      7733
sexism    2258
racism    1319
Name: class, dtype: int64

In [0]:
train_2['class'].value_counts()

neither    5526
sexism      639
racism       80
both         26
Name: class, dtype: int64

In [0]:
train_3['class'].value_counts()

1    19190
2     4163
0     1430
Name: class, dtype: int64

In [0]:
t1 = train_1[['ID','Tweets','class']]
t2 = train_2[['ID','Tweets','class']]
t3 = train_3[['ID','Tweets','class']]
merged = pd.concat([t1,t2,t3],axis=0).reset_index(drop=True)

### Target Analysis

In [0]:
merged['class'].value_counts()

1          19190
none        7733
neither     5526
2           4163
sexism      2897
0           1430
racism      1399
both          26
Name: class, dtype: int64

### Very basic exploration

In [0]:
train = merged.copy()

In [0]:
train.rename(columns={'Tweets':'tweet'},inplace=True)

In [0]:
train['tweet'] = train['tweet'].astype(str)

In [0]:
## word and char count, avg_word length and all
train['word_count'] = train['tweet'].apply(lambda x: len(str(x).split(" ")))
train[['tweet','word_count']].head()

Unnamed: 0,tweet,word_count
0,"@ummayman90 Again, your entire concept of god ...",20
1,"@anjemchoudary Your prophet was a rapist, murd...",21
2,RT @DilanaKurdi: Yazidi children who are taken...,17
3,These girls are the equivalent of the irritati...,18
4,@lauracdean I love how the Islamofascists recr...,24


In [0]:
train['char_count'] = train['tweet'].str.len() ## this also includes spaces
train[['tweet','char_count']].head()

Unnamed: 0,tweet,char_count
0,"@ummayman90 Again, your entire concept of god ...",133
1,"@anjemchoudary Your prophet was a rapist, murd...",140
2,RT @DilanaKurdi: Yazidi children who are taken...,135
3,These girls are the equivalent of the irritati...,99
4,@lauracdean I love how the Islamofascists recr...,133


In [0]:
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

train['avg_word'] = train['tweet'].apply(lambda x: avg_word(x))
train[['tweet','avg_word']].head()

Unnamed: 0,tweet,avg_word
0,"@ummayman90 Again, your entire concept of god ...",5.7
1,"@anjemchoudary Your prophet was a rapist, murd...",5.714286
2,RT @DilanaKurdi: Yazidi children who are taken...,7.0
3,These girls are the equivalent of the irritati...,4.555556
4,@lauracdean I love how the Islamofascists recr...,4.583333


#### Hashtags

In [0]:
train['hastags'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
train[['tweet','hastags']].head()

Unnamed: 0,tweet,hastags
0,"@ummayman90 Again, your entire concept of god ...",1
1,"@anjemchoudary Your prophet was a rapist, murd...",0
2,RT @DilanaKurdi: Yazidi children who are taken...,2
3,These girls are the equivalent of the irritati...,1
4,@lauracdean I love how the Islamofascists recr...,0


### Basic Preprocessing

Every word followed by @ is some twitter ID of an user, which shouldn't be considered in our analysis, so lets do the stemming, where we remove @ alonwith the word followed by it

#### Removing user IDs from text

In [0]:
train['tweet'] = train['tweet'].apply(lambda x:' '.join(i for i in [a for a in x.split() if a.find('@')==-1]))

Removing all the links from tweets

In [0]:
train['tweet'] = train['tweet'].apply(lambda x:' '.join(i for i in [a for a in x.split() if a.find('http')==-1]))

Removing numbers

In [0]:
train['tweet'] = train['tweet'].apply(lambda x:''.join([i for i in x if not i.isdigit()]))

#### Converting to Lower case

In [0]:
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['tweet'].head()

0    again, your entire concept of god corresponds ...
1    your prophet was a rapist, murderer, pedophile...
2    rt yazidi children who are taken from their pa...
3    these girls are the equivalent of the irritati...
4    i love how the islamofascists recruit and year...
Name: tweet, dtype: object

There is no use of punctuation mostly, so removing it

In [0]:
train['tweet'] = train['tweet'].str.replace('[^\w\s]','')
train['tweet'].head()

0    again your entire concept of god corresponds t...
1    your prophet was a rapist murderer pedophile c...
2    rt yazidi children who are taken from their pa...
3    these girls are the equivalent of the irritati...
4    i love how the islamofascists recruit and year...
Name: tweet, dtype: object

#### Removing stopwords

In [0]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['tweet'].head()

0    entire concept god corresponds tyrannical eart...
1    prophet rapist murderer pedophile caravan robb...
2    rt yazidi children taken parents forcibly conv...
3    girls equivalent irritating asian girls couple...
4    love islamofascists recruit year old jihadis t...
Name: tweet, dtype: object

#### Most common and rare words

In [0]:
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()#[:10]
freq

rt             10506
bitch           8317
like            3815
mkr             3788
im              3631
               ...  
readlisten         1
ar                 1
succession         1
xans               1
holipalooza        1
Length: 30398, dtype: int64

rt is a very frequent word occuring almost in every tweet, and it doesnt hold any meaning so lets just remove it, I guess it there are such more words but those are all seem to be hatespeech. So we will just keep hate speechwords from above list, lets remove the rest

In [0]:
remove_word = ['rt','mkr','im']

In [0]:
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in remove_word))
train['tweet'].head()

0    entire concept god corresponds tyrannical eart...
1    prophet rapist murderer pedophile caravan robb...
2    yazidi children taken parents forcibly convert...
3    girls equivalent irritating asian girls couple...
4    love islamofascists recruit year old jihadis t...
Name: tweet, dtype: object

In [0]:
freq.shape

(30398,)

Doesnt really make sense to remove rare words, i.e. the words with count 1. Because we might lose hateful words this way

In [0]:
from textblob import TextBlob
nltk.download('punkt')
TextBlob(train['tweet'][1]).words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


WordList(['prophet', 'rapist', 'murderer', 'pedophile', 'caravan', 'robber', 'slave', 'trader', 'bigot', 'sexist', 'god', 'would', 'never', 'use', 'scum'])

#### Lemmetization
smart stemming

In [0]:
from textblob import Word
nltk.download('wordnet')
train['tweet'] = train['tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['tweet'].head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    entire concept god corresponds tyrannical eart...
1    prophet rapist murderer pedophile caravan robb...
2    yazidi child taken parent forcibly converted i...
3    girl equivalent irritating asian girl couple y...
4    love islamofascists recruit year old jihadis t...
Name: tweet, dtype: object

##### So now it seems that the data is almost clean! Lets explore advanced preprocessing/ text representation techniques

### Target creation

In [0]:
train['class'].unique()#.isna().sum()

array(['racism', 'none', 'sexism', 'neither', 'both', 2, 1, 0],
      dtype=object)

In [0]:
train['class'].replace(['racism', 'sexism',0, 1, 'both', 'none', 'neither',2],['hate','hate','hate','hate','hate','null','null','null'],inplace=True)
train['class'].value_counts()

hate    24942
null    17422
Name: class, dtype: int64

In [0]:
train['class'].replace(['null','hate'],[0,1],inplace=True)

### N-Grams

In [0]:
TextBlob(train['tweet'][0]).ngrams(2)

[WordList(['entire', 'concept']),
 WordList(['concept', 'god']),
 WordList(['god', 'corresponds']),
 WordList(['corresponds', 'tyrannical']),
 WordList(['tyrannical', 'earthly']),
 WordList(['earthly', 'egomaniac']),
 WordList(['egomaniac', 'simple']),
 WordList(['simple', 'stupid']),
 WordList(['stupid', 'islam'])]

### Removing Duplicates

In [0]:
train = train.drop_duplicates(subset='ID')

### Basic Model on whole dataset

In [0]:
from sklearn.model_selection import train_test_split
x_train,x_valid,y_t,y_v = train_test_split(train['tweet'],train['class'],test_size=0.2,random_state=234)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=100, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))

tfidf.fit(train['tweet'])

x_t = tfidf.transform(x_train)
x_v = tfidf.transform(x_valid)


In [0]:
def model_training(clf, x_t, y_t, x_v=None , y_v=None ,task='binary:logistic'):
    clf.fit(x_t,y_t)
    print('training accuracy', clf.score(x_t,y_t))
    
    if task=='binary:logistic':
      print('validation accuracy', clf.score(x_v,y_v))
      print('validation f1_score',f1_score(clf.predict(x_v),y_v))
      print('validation roc_auc score',roc_auc_score(y_v,clf.predict_proba(x_v)[::,-1]))
      print('confusion matrix \n',confusion_matrix(y_v, clf.predict(x_v)))
    
    if task=='reg:linear':
      print('validation r2_score', clf.score(x_v,y_v))
      print('validation MSE',mean_squared_error(clf.predict(x_v),y_v))

            
    return clf

In [0]:
# y_t.value_counts()

13966/19925

0.7009284818067754

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix, accuracy_score
from xgboost import XGBClassifier

lgr =  LogisticRegression(n_jobs=1)
xgb = XGBClassifier(n_estimators=500, max_depth=5,learning_rate=0.1,scale_pos_weight=1.4266790777602751)

model_training(xgb,x_t,y_t,x_v,y_v)
# model_training(lgr,x_t,y_t,x_v,y_v)

training accuracy 0.857264903176813
validation accuracy 0.8553530751708428
validation f1_score 0.8783654357773757
validation roc_auc score 0.9169276701896655
confusion matrix 
 [[2632  265]
 [ 878 4127]]




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=1, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# ## XGB results
# training accuracy 0.8751740286039742
# validation accuracy 0.8567451278157429
# validation f1_score 0.8815403934700711
# validation roc_auc score 0.9185450724629186
# confusion matrix 
#  [[2558  339]
#  [ 793 4212]]

# LR Results
#training accuracy 0.857264903176813
# validation accuracy 0.8553530751708428
# validation f1_score 0.8783654357773757
# validation roc_auc score 0.9169276701896655
# confusion matrix 
#  [[2632  265]
#  [ 878 4127]]

In [0]:
y_v.value_counts()

1    5005
0    2897
Name: class, dtype: int64

In [0]:
#### We are getting good results over here, lets just also check whether the predictions using ranks would work here or not. 
#### I guess it wont work as here 1s are in majority, so rare chances of any improvement

In [0]:
y_v.mean()

0.6333839534295115

In [0]:
# probabs_df = pd.DataFrame(y_v)
# probabs_df['original_index'] = y_v.index
# probabs_df['probab'] = lgr.predict_proba(x_v)[::,-1]
# probabs_df['preds'] = (lgr.predict_proba(x_v)[::,-1]>0.5).astype(int)

probabs_df = pd.DataFrame(y_v)
probabs_df['original_index'] = y_v.index
# probabs_df['probab'] = lgr.predict_proba(x_v)[::,-1]
probabs_df['probab'] = xgb.predict_proba(x_v)[::,-1]


probabs_df = probabs_df.sort_values(by = 'probab',ascending=False).reset_index(drop=True)

probabs_df.loc[0:int(x_v.todense().shape[0]*y_v.mean()),'preds'] = 1

probabs_df.loc[int(x_v.todense().shape[0]*y_v.mean()):,'preds'] = 0

np.mean(probabs_df['class']==probabs_df['preds'])

### True Positives increased but as said, it costed us the overall accuracy,
### even the F1 score isnt increased this, which had imprved in NLP+CSS model

0.8364215744128408

In [0]:
confusion_matrix(probabs_df['class'],probabs_df['preds'])

array([[2763,  693],
       [ 693, 4324]])

In [0]:
f1_score(probabs_df['class'],probabs_df['preds'])

0.8618696432130756

In [0]:
bad_preds = x_valid.loc[probabs_df[probabs_df['class']!=probabs_df['preds']]['original_index'].values]

In [0]:
original_class = probabs_df[probabs_df['class']!=probabs_df['preds']][['class','original_index']]
pred_class = probabs_df[probabs_df['class']!=probabs_df['preds']][['preds','probab']]

### Some serious mindf*ck here

In [0]:
train.loc[39516]

ID                                22402
tweet         bitch buy sparkling water
class                                 0
word_count                            6
char_count                           45
avg_word                        6.66667
hastags                               0
Name: 39516, dtype: object

In [0]:
pd.concat([bad_preds.reset_index(drop=True),original_class.reset_index(drop=True),pred_class.reset_index(drop=True)],axis=1)

### Word Embeddings Training

In [0]:
# imports needed and set up logging
import gzip
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [0]:
def read_input(data):
    """This method reads the input file which is in gzip format"""
    i=0
    for line in data['tweet']: 
      i+=1
      if (i%10000==0):
        logging.info ("read {0} tweets".format(i))
      # do some pre-processing and return a list of words for each tweet, basically doing tokenizing
      yield gensim.utils.simple_preprocess (line)

# read the tokenized reviews into a list
# each review item becomes a serries of words
# so this becomes a list of lists
documents = list(read_input(train))
logging.info ("Done reading data file")

2019-11-09 22:29:54,810 : INFO : read 10000 tweets
2019-11-09 22:29:54,900 : INFO : read 20000 tweets
2019-11-09 22:29:54,985 : INFO : read 30000 tweets
2019-11-09 22:29:55,084 : INFO : Done reading data file


In [0]:
documents = []
i=0
for line in train['tweet']: 
    i+=1
    if (i%10000==0):
      logging.info ("read {0} tweets".format(i))
      # do some pre-processing and return a list of words for each tweet, basically doing tokenizing
    # documents.append(gensim.utils.simple_preprocess(line))
    documents.append(nltk.word_tokenize(line))

2019-11-09 22:29:57,197 : INFO : read 10000 tweets
2019-11-09 22:29:57,995 : INFO : read 20000 tweets
2019-11-09 22:29:58,810 : INFO : read 30000 tweets


In [0]:
len(documents)

39506

In [0]:
%%time

model = gensim.models.Word2Vec(documents, size=50, min_count=2, workers=5)
model.train(documents,total_examples=len(documents),epochs=10)



2019-11-09 22:30:00,216 : INFO : collecting all words and their counts
2019-11-09 22:30:00,218 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-09 22:30:00,240 : INFO : PROGRESS: at sentence #10000, processed 73710 words, keeping 11173 word types
2019-11-09 22:30:00,258 : INFO : PROGRESS: at sentence #20000, processed 147079 words, keeping 18444 word types
2019-11-09 22:30:00,277 : INFO : PROGRESS: at sentence #30000, processed 224191 words, keeping 23575 word types
2019-11-09 22:30:00,296 : INFO : collected 27600 word types from a corpus of 297884 raw words and 39506 sentences
2019-11-09 22:30:00,296 : INFO : Loading a fresh vocabulary
2019-11-09 22:30:00,324 : INFO : effective_min_count=2 retains 12387 unique words (44% of original 27600, drops 15213)
2019-11-09 22:30:00,324 : INFO : effective_min_count=2 leaves 282671 word corpus (94% of original 297884, drops 15213)
2019-11-09 22:30:00,358 : INFO : deleting the raw counts dictionary of 27600 items


CPU times: user 12.4 s, sys: 82.8 ms, total: 12.5 s
Wall time: 7.75 s


In [0]:
# word_vectors = model.wv

# model.wv.save_word2vec_format('model.txt', binary=False)

In [0]:
# from gensim.test.utils import get_tmpfile
# from gensim.models import KeyedVectors

# fname = "hate_embeddings#1.kv"
# word_vectors.save(fname)
# word_vectors = KeyedVectors.load(fname, mmap='r')

In [0]:
w1 = "muslim"
model.wv.most_similar(positive=w1)

2019-11-09 22:30:50,898 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('islam', 0.9447076320648193),
 ('religion', 0.9309201240539551),
 ('quran', 0.9206629395484924),
 ('prophet', 0.9146885275840759),
 ('jew', 0.9033626914024353),
 ('mohammed', 0.8901999592781067),
 ('pedophile', 0.883843183517456),
 ('islamic', 0.8814033269882202),
 ('terrorist', 0.8811196684837341),
 ('propaganda', 0.878230094909668)]

In [0]:
model.wv.similarity(w1='queen',w2='royal')

  if np.issubdtype(vec.dtype, np.int):


0.8024919

In [0]:
model.wv.similarity(w1='muslim',w2='islam')

  if np.issubdtype(vec.dtype, np.int):


0.94470763

In [0]:
# [('muslim', 0.9584801197052002),
#  ('religion', 0.9454779028892517),
#  ('quran', 0.9349233508110046),
#  ('prophet', 0.9214071035385132),
#  ('declaration', 0.9209208488464355),
#  ('murder', 0.8991195559501648),
#  ('mohammed', 0.8898417353630066),
#  ('jew', 0.8868030309677124),
#  ('isi', 0.8843002915382385),
#  ('terrorist', 0.8834922909736633)]

In [0]:
model.wv.save_word2vec_format('tmp.txt', binary=False)
# omly domain based embeddings

2019-11-09 22:30:58,360 : INFO : storing 12387x50 projection weights into tmp.txt
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### Transfer Learning

In [0]:
from gensim.models import Word2Vec
sentences = documents
# size option needs to be set to 300 to be the same as Google's pre-trained model
 
word2vec_model = Word2Vec(size = 300, window=5,
min_count = 1, workers = 10)

word2vec_model.build_vocab(sentences)
 
# assign the vectors to the vocabs that are in Google's pre-trained model and your sentences defined above.


2019-10-25 22:19:27,996 : INFO : collecting all words and their counts
2019-10-25 22:19:27,997 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-10-25 22:19:28,029 : INFO : PROGRESS: at sentence #10000, processed 132532 words, keeping 11246 word types
2019-10-25 22:19:28,058 : INFO : PROGRESS: at sentence #20000, processed 262350 words, keeping 16950 word types
2019-10-25 22:19:28,087 : INFO : PROGRESS: at sentence #30000, processed 382207 words, keeping 22182 word types
2019-10-25 22:19:28,118 : INFO : PROGRESS: at sentence #40000, processed 512233 words, keeping 26850 word types
2019-10-25 22:19:28,127 : INFO : collected 27706 word types from a corpus of 541415 raw words and 42364 sentences
2019-10-25 22:19:28,128 : INFO : Loading a fresh vocabulary
2019-10-25 22:19:28,323 : INFO : effective_min_count=1 retains 27706 unique words (100% of original 27706, drops 0)
2019-10-25 22:19:28,324 : INFO : effective_min_count=1 leaves 541415 word corpus (100% of or

In [0]:
# lockf needs to be set to 1.0 to allow continued training.
word2vec_model.intersect_word2vec_format('/content/drive/My Drive/Hate_Speech_Detection_git/GoogleNews-vectors-negative300.bin.gz', lockf=1.0, binary=True)

2019-10-25 22:19:34,849 : INFO : loading projection weights from /content/drive/My Drive/Hate_Speech_Detection_git/GoogleNews-vectors-negative300.bin.gz
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-10-25 22:21:21,998 : INFO : merged 17587 vectors into (27706, 300) matrix from /content/drive/My Drive/Hate_Speech_Detection_git/GoogleNews-vectors-negative300.bin.gz


In [0]:
# continue training with you own data
word2vec_model.train(sentences, total_examples=len(sentences), epochs = 5)

2019-10-25 22:21:22,011 : INFO : training model with 10 workers on 27706 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2019-10-25 22:21:23,058 : INFO : EPOCH 1 - PROGRESS: at 81.13% examples, 335791 words/s, in_qsize 10, out_qsize 1
2019-10-25 22:21:23,060 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-10-25 22:21:23,086 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-10-25 22:21:23,111 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-10-25 22:21:23,113 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-10-25 22:21:23,117 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-10-25 22:21:23,146 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-10-25 22:21:23,148 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-10-25 22:21:23,151 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-10

(2121024, 2707075)

In [0]:
w1 = ["bed"]
word2vec_model.wv.most_similar (positive=w1)

2019-10-25 22:25:42,183 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('middleearth', 0.6938994526863098),
 ('room', 0.6501486301422119),
 ('kitchen', 0.6442478895187378),
 ('sleep', 0.6377359628677368),
 ('sleeping', 0.6332756280899048),
 ('couch', 0.629835844039917),
 ('topqbs', 0.6250707507133484),
 ('mouthlol', 0.6173553466796875),
 ('darg', 0.6171203851699829),
 ('catchmeifyoucanim', 0.6152539849281311)]

In [0]:
w1 = ["islam"]
word2vec_model.wv.most_similar (positive=w1)

  if np.issubdtype(vec.dtype, np.int):


[('muslim', 0.8201450705528259),
 ('religion', 0.8187536597251892),
 ('isi', 0.7753992080688477),
 ('islamic', 0.7538473010063171),
 ('quran', 0.7297672033309937),
 ('microbrain', 0.7268235087394714),
 ('christian', 0.721429705619812),
 ('mohammed', 0.7090685367584229),
 ('christianity', 0.6701181530952454),
 ('israel', 0.6650323867797852)]

In [0]:
word2vec_model.wv.similarity(w1="dirty",w2="smelly")

  if np.issubdtype(vec.dtype, np.int):


0.6034551

In [0]:
# word2vec_model.wv.save_word2vec_format('model_transfer_learning.txt', binary=False)
word2vec_model.wv.save_word2vec_format('model_transfer_learning_including_stopwords.txt', binary=False)

2019-10-25 22:26:05,451 : INFO : storing 27706x300 projection weights into model_transfer_learning_including_stopwords.txt
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### Training on new word2vec model

In [0]:
import gensim
import logging
from gensim.models import Word2Vec
# wv = gensim.models.KeyedVectors.load_word2vec_format("/content/drive/My Drive/Hate_Speech_Detection_git/GoogleNews-vectors-negative300.bin.gz", binary=True)

# wv = gensim.models.KeyedVectors.load_word2vec_format("/content/drive/My Drive/Hate_Speech_Detection_git/model_transfer_learning.txt", binary=False)

# wv = gensim.models.KeyedVectors.load_word2vec_format("/content/drive/My Drive/Hate_Speech_Detection_git/model_transfer_learning_including_stopwords.txt", binary=False)

wv = gensim.models.KeyedVectors.load_word2vec_format("tmp.txt", binary=False) ## only domain based embeddings

wv.init_sims(replace=True)


2019-11-09 22:31:20,332 : INFO : loading projection weights from tmp.txt
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-11-09 22:31:20,897 : INFO : loaded (12387, 50) matrix from tmp.txt
2019-11-09 22:31:20,902 : INFO : precomputing L2-norms of word weight vectors


In [0]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, post) for post in text_list ])

In [0]:
def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens
    
train_w2v, test_w2v = train_test_split(train, test_size=0.2, random_state = 42)
# x_t,x_v,y_t,y_v = train_test_split(train['tweet'],train['class'],test_size=0.2,random_state=234)

test_tokenized = test_w2v.apply(lambda r: w2v_tokenize_text(r['tweet']), axis=1).values
train_tokenized = train_w2v.apply(lambda r: w2v_tokenize_text(r['tweet']), axis=1).values

X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)

  
  if np.issubdtype(vec.dtype, np.int):


In [0]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklfrom sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix, accuracy_scoreearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix, accuracy_score

In [0]:
xgb_w2v = XGBClassifier(n_estimators=500, max_depth=5,learning_rate=0.1,scale_pos_weight=1.4266790777602751)
lgr_w2v = LogisticRegression(n_jobs=1)
model_training(xgb_w2v,X_train_word_average,train_w2v['class'],X_test_word_average,test_w2v['class'])
# model_training(lgr_w2v,X_train_word_average,train_w2v['class'],X_test_word_average,test_w2v['class'])

training accuracy 0.9647512973041387
validation accuracy 0.8591495823842066
validation f1_score 0.8885774351786966
validation roc_auc score 0.9350368582573201
confusion matrix 
 [[2351  581]
 [ 532 4438]]


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1.4266790777602751,
              seed=None, silent=None, subsample=1, verbosity=1)

In [0]:
print('validation accuracy', xgb_w2v.score(X_test_word_average,test_w2v['class']))
print('validation f1_score',f1_score(xgb_w2v.predict(X_test_word_average),test_w2v['class']))

validation accuracy 0.8437539313121147
validation f1_score 0.8771027112606371


In [0]:
# Logistic Regression + Only domain embeddings
# training accuracy 0.8346095430958107
# validation accuracy 0.837129840546697
# validation f1_score 0.8693533651405949
# validation roc_auc score 0.9120879437607912
# confusion matrix 
#  [[2333  599]
#  [ 688 4282]]
# LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
#                    intercept_scaling=1, l1_ratio=None, max_iter=100,
#                    multi_class='warn', n_jobs=1, penalty='l2',
#                    random_state=None, solver='warn', tol=0.0001, verbose=0,
#                    warm_start=False)


# XGBoost + Only Domain Based Embeddings
# training accuracy 0.9647512973041387
# validation accuracy 0.8591495823842066
# validation f1_score 0.8885774351786966
# validation roc_auc score 0.9350368582573201
# confusion matrix 
#  [[2351  581]
#  [ 532 4438]]
# XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#               colsample_bynode=1, colsample_bytree=1, gamma=0,
#               learning_rate=0.1, max_delta_step=0, max_depth=5,
#               min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
#               nthread=None, objective='binary:logistic', random_state=0,
#               reg_alpha=0, reg_lambda=1, scale_pos_weight=1.4266790777602751,
#               seed=None, silent=None, subsample=1, verbosity=1)


In [0]:
confusion_matrix(test_w2v['class'],xgb_w2v.predict(X_test_word_average))

array([[3005,  495],
       [ 574, 4399]])

In [0]:
# array([[3025,  475],
#        [ 540, 4433]])

In [0]:
# logreg = LogisticRegression(n_jobs=1, C=1e5)
# logreg = logreg.fit(X_train_word_average, train_w2v['class'])
# y_pred = logreg.predict(X_test_word_average)

# print('accuracy %s' % accuracy_score(y_pred, test_w2v['class']))
# print(classification_report(test_w2v['class'], y_pred))
# print(confusion_matrix(test_w2v['class'],y_pred))

In [0]:
f1_score(test_w2v['class'],y_pred)
# 0.8864864864864865

In [0]:
y = test_w2v['class']

In [0]:
probabs_df = pd.DataFrame(y)
probabs_df['original_index'] = y.index
probabs_df['probab'] = logreg.predict_proba(X_test_word_average)[::,-1]

probabs_df = probabs_df.sort_values(by = 'probab',ascending=False).reset_index(drop=True)

probabs_df.loc[0:int(X_test_word_average.shape[0]*y.mean()),'preds'] = 1

probabs_df.loc[int(X_test_word_average.shape[0]*y.mean()):,'preds'] = 0

np.mean(probabs_df['class']==probabs_df['preds'])

0.8661631063377788

In [0]:
f1_score(probabs_df['class'],probabs_df['preds'])

0.8859843153026342

In [0]:
confusion_matrix(probabs_df['class'],probabs_df['preds'])

array([[2933,  567],
       [ 567, 4406]])

In [0]:
### its increasing the True positives which is good!! Although we are getting a bit more false alarms
### that is the false positives, but the false negatives are decreased and thats what we need