In [14]:
!pip install emoji



In [15]:
import pandas as pd
import numpy as np
import re

import nltk
nltk.download('punkt')
from nltk.tokenize import TreebankWordTokenizer
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import emoji
import spacy
from operator import itemgetter
import itertools
from itertools import combinations


from nltk.stem.snowball import SnowballStemmer


from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from tqdm import tqdm
tqdm.pandas()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Stopwords and spacy model
* We extract a list of stopwords from - https://gist.github.com/sebleier/554280 because the available list of NLTK/Spacy stopwords are highly insufficient and incomplete.
* We extract the Spacy's english pipeline optimized for CPU and will be using it for stemming and lemmatization tasks. In case the spacy's model is not installed just type "python -m spacy download en_core_web_sm" on terminal to get the model installed for preprocessing tasks
* We also instantiate SnowballStemmer to perform stemming after lemmatization is done in later stages of the notebook

In [16]:
#### Stopwords list taken from - https://gist.github.com/sebleier/554280 (reason - the available list of NLTK/ Spacy stopwords are too less and don't cover all possible words)
stopwords_list = ["0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", "different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's", "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", "hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn", "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz",]

nlp        = spacy.load("en_core_web_sm")

stemmer = SnowballStemmer(language='english')

## Custom preprocessing functions for Vocabulary extraction
We declare basic functions to perform tasks like removal of stopwords, stemming, lemmatization in batches, removal and emojis etc. 

In [17]:
def remove_stopwords(text):
    '''
      Removes stopwords from the text
    '''

    text_split   = text.split()

    text_list    = [word for word in text_split if not word in stopwords_list]

    return ' '.join(text_list)

def stemming_perform(text):
    '''
      Performs lemmatization in batches
    '''
    
    token = text.split()
    stemmed_text = []
    for tok in token:
        stemmed_text.append(stemmer.stem(tok))
    return ' '.join(stemmed_text)


def cleaning_text(text):
    '''
      Operations performed:- 
      1. Converting the entire text to lowercase
      2. Removal of punctuations from the text
      3. Removal of numbers from the text
    '''

    text = text.lower()

    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = remove_stopwords(text)

    text = text.replace('amp','') ## this was actually ampersand which kept coming in frequently that made no sense
    return text


def give_emoji_free_text(text):
    ''' 
      Removes all possible emojis from the text (because our text is basically tweets that can have emoijs).
      The input is a text and the output is emoji free text
    '''
    allchars = [str for str in text.encode().decode('utf-8')]
    emoji_list = [c for c in allchars if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.encode().decode('utf-8').split() if not any(i in str for i in emoji_list)])
    return clean_text


def preprocess_custom(text):
    text = give_emoji_free_text(text)
    text = cleaning_text(text)

    return text

def lemmatize_batches(docs):
    '''
      Performs lemmatization in batches
    '''
    lemmatized_text = []

    for doc in nlp.pipe(docs, batch_size=256, n_process=3,disable=["parser", "ner"]):
        lemmatized_text.append(str(' '.join([token.lemma_ for token in doc])))
    return lemmatized_text


def lemmatize_df(df):
    lemmatized = []
    for i in tqdm(range(0,df.shape[0],1000)):
        z = lemmatize_batches(df['preprocessed_text'].iloc[i:i+1000])
        lemmatized.extend(z)
    df['lemmatize'] = lemmatized
    return df



def return_frequency_dict_sorted(df):
    sent = df['stemmed'].values
    sent = ' '.join(sent)

    fdist = FreqDist()
    
    for word in word_tokenize(sent):
        fdist[word.lower()] += 1
        
    return sorted(fdist.items(), key=lambda x: x[1], reverse=True)

def replace(x1,x2):
    if str(x1) == 'nan':
        return x2
    else:
        return x1


In [18]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Loading the best result obtained from SOTA models
We load the output from the previously trained SOTA model <br>
We will no subset the output on just two classes where heavy imbalance was observed - sexual violence and Physical violence. <br>
We will create a dataset from where we will draw inferences

In [19]:
subdf   = pd.read_csv('/content/gdrive/MyDrive/Hackathon/bert_base_best_result.csv')

subdf   = subdf[(subdf['type']=='sexual_violence') | (subdf['type']=='Physical_violence')]

print(f'The shape of subsetted data with subsetting on physical and sexual violence will be {subdf.shape}')

test_df = pd.read_csv('/content/gdrive/MyDrive/Hackathon/Test.csv')


temp_df = subdf.merge(test_df,on=['Tweet_ID'],how='left')

_       = temp_df.rename(columns={'type':'type1'},inplace=True)

print(f'The shape of data after assigning the labels attained from BERT to test is {temp_df.shape}')

The shape of subsetted data with subsetting on physical and sexual violence will be (11324, 2)
The shape of data after assigning the labels attained from BERT to test is (11324, 3)


## Text Preprocessing for Vocab extraction
We preprocess the test data tweets and then find the frequency occurance of the preprocessed words from the dataset. We will now select top 20 words from the test data tweets which will be used as a vocabulary for building our model.

In [20]:
temp_df['preprocessed_text'] = temp_df['tweet'].progress_apply(preprocess_custom)
temp_df                      = lemmatize_df(temp_df)
temp_df['stemmed']           = temp_df['lemmatize'].progress_apply(stemming_perform)
sorted_freq_dict             = return_frequency_dict_sorted(temp_df)

threshold2consider           = 20
vocab_sex_phy                = [word[0] for word in sorted_freq_dict[:threshold2consider]]


100%|██████████| 11324/11324 [00:06<00:00, 1700.28it/s]
100%|██████████| 12/12 [00:17<00:00,  1.44s/it]
100%|██████████| 11324/11324 [00:02<00:00, 4396.46it/s]


In [21]:
print('The top 20 words (preprocessed) based on the frequency distribution and their relative frequencies are given below')
pd.DataFrame(sorted_freq_dict,columns = ['word','Frequency']).head(20)

The top 20 words (preprocessed) based on the frequency distribution and their relative frequencies are given below


Unnamed: 0,word,Frequency
0,rape,8659
1,boyfriend,3917
2,stab,3399
3,knife,3275
4,student,1862
5,year,1786
6,woman,1748
7,univers,1642
8,sex,1503
9,man,1493


# Classification model on the vocabulary attained
We will now preprocess our train and BERT Classified physical and sexual data to finetune it <br>
This will be to make sure that the distribution of train and test dataset is same and noise is removed in terms of redundant features

## Text Preprocessing on Train and Test data
We subset our train data having only Physical Violence and Sexual Violence labels and perform the exact same preprocessing steps we did for the test data above. We will subset the test data on the predictions previously obtained from BERT model.

In [22]:
test_df                         = pd.read_csv('/content/gdrive/MyDrive/Hackathon/Test.csv')
subdf                           = pd.read_csv('/content/gdrive/MyDrive/Hackathon/bert_base_best_result.csv')

test_df                         = test_df.merge(subdf,on=['Tweet_ID'],how='left')


subdf                           = subdf[(subdf['type']=='sexual_violence') | (subdf['type']=='Physical_violence')]


temp_df                         = subdf.merge(test_df,on=['Tweet_ID'],how='left')

_                               = temp_df.rename(columns={'type':'type1'},inplace=True)

temp_df['preprocessed_text']    = temp_df['tweet'].progress_apply(preprocess_custom)
temp_df                         = lemmatize_df(temp_df)
temp_df['stemmed']              = temp_df['lemmatize'].progress_apply(stemming_perform)


train_df                        = pd.read_csv('/content/gdrive/MyDrive/Hackathon/Train.csv')
train_data                      = train_df[train_df['type'].isin(['Physical_violence','sexual_violence'])]
train_data                      = train_data.reset_index()

train_data['preprocessed_text'] = train_data['tweet'].progress_apply(preprocess_custom)
train_data                      = lemmatize_df(train_data)
train_data['stemmed']           = train_data['lemmatize'].progress_apply(stemming_perform)

print(test_df.shape,train_df.shape,temp_df.shape)

100%|██████████| 11324/11324 [00:06<00:00, 1698.94it/s]
100%|██████████| 12/12 [00:11<00:00,  1.01it/s]
100%|██████████| 11324/11324 [00:02<00:00, 4365.87it/s]
100%|██████████| 38594/38594 [00:24<00:00, 1594.03it/s]
100%|██████████| 39/39 [00:48<00:00,  1.25s/it]
100%|██████████| 38594/38594 [00:08<00:00, 4569.47it/s]

(15581, 3) (39650, 3) (11324, 7)





## Count vectorization with binarization
We apply count vectorizer on the train and test datasets 

In [23]:
train_X              = train_data['stemmed'].values
test_X               = temp_df['stemmed'].values

cv                   = CountVectorizer(vocabulary = vocab_sex_phy,binary=True)

train                = cv.fit_transform(train_X)
test                 = cv.transform(test_X)

#### Converting the sparse matrices to dense 
train                = train.todense()
test                 = test.todense()
print(f'The shape of preprocessed train data matrix is {train.shape} and preprocessed test data matrix is {test.shape}')

### label encoding done here
train_data['labels'] = train_data['type'].map({'sexual_violence':0,'Physical_violence':1})

train_y              = train_data['labels'].values

The shape of preprocessed train data matrix is (38594, 20) and preprocessed test data matrix is (11324, 20)


## Fitting a model on preprocessed data and finetuning
* We fit a XGBClassifier model on the preprocessed data 
* We will now update the previous labels of sexual_violence and Physical_violence with the outcomes attained from the model 
* The outcomes will be used for second stage classification

In [24]:
from xgboost import XGBClassifier
model      = XGBClassifier(n_estimators =10,random_state=13,depth = 4,scale_pos_weight=np.sqrt(32648/5946))
_          = model.fit(train,train_y)

In [25]:
prob_thresh     = 0.45
predicted_probs = model.predict_proba(test)[:,1]
accuracy_preds  = np.where(predicted_probs>prob_thresh,1,0)

In [26]:

temp_df['type2'] = accuracy_preds
print(temp_df['type2'].value_counts())


## Inverse label encoding 
temp_df['type2'] = temp_df['type2'].map({0:'sexual_violence',1:'Physical_violence'})
temp_df['prob']  = predicted_probs

findf            = test_df.merge(temp_df,on = ['Tweet_ID','tweet'],how='left')


0    7946
1    3378
Name: type2, dtype: int64


In [27]:
### We update the previous classes with the updated ones 
findf['fintype'] = findf.apply(lambda z: replace(z['type2'],z['type']),axis=1)
findf['flag']    = (findf['type']!=findf['fintype']).astype(int)

print(f'Ensuring the shape of outcome dataframe is maintained, we have shape of output data as - {findf.shape}')

Ensuring the shape of outcome dataframe is maintained, we have shape of output data as - (15581, 12)


## Reviewing some modifications 
We will see some mismatches done by BERT model that were modified in this stage of supervised classification based on machine learning 

In [28]:
mismatch = findf[findf['flag']==1][['Tweet_ID','tweet','type','fintype','prob']]
mismatch = mismatch.sort_values(by = ['prob'])
mismatch.head(20)

Unnamed: 0,Tweet_ID,tweet,type,fintype,prob
667,ID_W8WXZCYC,When you find out your two year old was raped ...,Physical_violence,sexual_violence,0.186706
9020,ID_LEQOA1SN,When you find out your two year old was raped ...,Physical_violence,sexual_violence,0.186706
4766,ID_Z881MYLO,FB Repost When you find out your two year ol...,Physical_violence,sexual_violence,0.186706
9580,ID_VNYFXLFW,Women like to be raped👙 Only By their husband...,Physical_violence,sexual_violence,0.186706
13002,ID_MYRRZNB4,my husband made me drink last night. And then ...,Physical_violence,sexual_violence,0.223479
7788,ID_IQZOJNG5,Told him yesterday if my husband forced me to ...,Physical_violence,sexual_violence,0.223479
5411,ID_FW7ZISE6,I ran away from my parents home to have sex wi...,Physical_violence,sexual_violence,0.223479
5331,ID_PV67S5BH,“I've carried a knife a few times. Was it wort...,sexual_violence,Physical_violence,0.685884
5313,ID_RJE9DIYT,And 1st year of law school i “locked myself ou...,sexual_violence,Physical_violence,0.685884
5260,ID_DFQIEWCU,I had a friend in highschool. He stabbed me wi...,sexual_violence,Physical_violence,0.685884


In [29]:
final_out = findf[['Tweet_ID','fintype']]
final_out = final_out.rename({'fintype':'type'},axis=1)
final_out.head(3)

Unnamed: 0,Tweet_ID,type
0,ID_D9ONL553,sexual_violence
1,ID_263YTILY,emotional_violence
2,ID_62VS6IXC,emotional_violence


In [30]:
final_out['type'].value_counts()

sexual_violence                 7946
Physical_violence               3378
Harmful_Traditional_practice    3100
emotional_violence               659
economic_violence                498
Name: type, dtype: int64

In [32]:
final_out.to_csv('outcome_stage1.csv',index=False)