###### FOR INTERNAL REFERENCE ONLY (To be deleted before submission):
input files:
    1. combined_data.csv

output files:
    1. sampdf_feat.csv

In [1]:
# importing libraries

import pandas as pd
import numpy as np

import nltk
import re
from nltk.corpus import stopwords
from string import punctuation
from nltk import stem
from nltk.stem import SnowballStemmer

In [2]:
# defining the functions

def to_lower(text):
    """Tokenize ``text`` with NLTK and return its lower-cased tokens joined by single spaces."""
    lowered_tokens = map(str.lower, nltk.word_tokenize(text))
    return ' '.join(lowered_tokens)

def remove_tags(text):
    """Return ``text`` with every ``<...>`` tag-like span deleted.

    A span starts at ``<``, contains at least one character that is not ``<``,
    and ends at the nearest ``>``.
    """
    tag_pattern = re.compile('<[^<]+?>')
    return tag_pattern.sub('', text)

def remove_numeric(text):
    """Return ``text`` without any character for which ``str.isdigit()`` is true."""
    kept_chars = [ch for ch in text if not ch.isdigit()]
    return ''.join(kept_chars)

def remove_escape_sequences(text):
    """Collapse every run of whitespace (newlines, tabs, spaces) into one space.

    Leading and trailing whitespace is dropped as a side effect of ``str.split()``.
    """
    pieces = text.split()
    return " ".join(pieces)

def decontract(result):
    """Expand common English contractions in ``result`` and return the new string.

    Irregular negations ("won't", "can't", "shan't") are expanded first,
    because the generic ``n't`` rule would otherwise turn them into the
    non-words "wo not", "ca not" and "sha not". The remaining rules are
    applied in the listed order.

    Note: ``'s`` is always expanded to " is", so possessives ("John's") are
    expanded as well; ``'d`` is always expanded to " would".
    """
    # Irregular forms; the captured first letter keeps its original case.
    irregular_rules = (
        (r"\b([Ww])on't", r"\1ill not"),
        (r"\b([Cc])an't", r"\1an not"),
        (r"\b([Ss])han't", r"\1hall not"),
    )
    # Generic suffix rules; order matters ("n't" must run before "'t").
    suffix_rules = (
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, replacement in irregular_rules + suffix_rules:
        result = re.sub(pattern, replacement, result)
    return result

def remove_stop_words(text):
    """Return ``text`` without NLTK English stop words.

    ``text`` is split on single space characters and the surviving pieces are
    joined back with single spaces. Matching is exact: it is case-sensitive
    and a word with attached punctuation or a newline is not treated as a
    stop word.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # was previously scanned once per word.
    stop = frozenset(stopwords.words('english'))
    return ' '.join(word for word in text.split(' ') if word not in stop)

def remove_url(text):
    """Delete URL-like spans from ``text``.

    A span starts at ``www`` or ``http`` and runs to the next whitespace
    character; it must contain at least one character after that prefix.
    """
    # Raw string: "\S" in a normal string literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    return re.sub(r'(www|http)\S+', '', text)

def strip_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed."""
    deletion_table = str.maketrans('', '', punctuation)
    return text.translate(deletion_table)

def remove_special_characters(text):
    """Replace each run of characters other than ASCII letters, digits and ``!`` with one space."""
    disallowed_run = re.compile('[^A-Za-z0-9!]+')
    return disallowed_run.sub(' ', text)

def perform_lemmatization(text):
    """Tokenize ``text`` with NLTK and lemmatize every token as a verb (``pos='v'``).

    Returns the lemmas joined by single spaces.
    """
    lemmatizer = stem.wordnet.WordNetLemmatizer()
    lemmas = (
        lemmatizer.lemmatize(token, pos='v')
        for token in nltk.word_tokenize(text)
    )
    return ' '.join(lemmas)

def perform_stemming(text):
    """Tokenize ``text`` with NLTK and stem every token with the English Snowball stemmer.

    Returns the stems joined by single spaces.
    """
    # The stop-word list that used to be loaded here was never used, so the
    # corpus is no longer read on every call.
    snowball_stemmer = SnowballStemmer('english')
    word_tokens = nltk.word_tokenize(text)
    return ' '.join(snowball_stemmer.stem(word) for word in word_tokens)

def extract_adjectives(text):
    """Return the tokens of ``text`` that NLTK tags as JJ, JJR or JJS, joined by spaces."""
    adjective_tags = {"JJ", "JJR", "JJS"}
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    return ' '.join(token for token, tag in tagged_tokens if tag in adjective_tags)

# function that accepts a DataFrame and adds new features
def make_features(df):
    """Add sentence-level features derived from ``df['cleaned_comment_text']``.

    The new columns are written to ``df`` in place and the same DataFrame is
    returned.

    Columns added:
        word_count            -- whitespace-separated tokens (a detached "!" counts as a word)
        char_count            -- characters excluding spaces
        word_density          -- word_count / (char_count + 1)
        total_length          -- characters including whitespace
        capitals              -- upper-case characters
        caps_vs_length        -- capitals / total_length
        num_exclamation_marks -- occurrences of "!"
        num_unique_words      -- distinct whitespace-separated tokens
        words_vs_unique       -- num_unique_words / word_count (low for repetitive comments)

    Both ratios that divide by a raw count (``caps_vs_length``,
    ``words_vs_unique``) are NaN for an empty comment.
    """
    text = df['cleaned_comment_text']

    df['word_count'] = text.apply(lambda x: len(x.split()))
    df['char_count'] = text.apply(lambda x: len(x.replace(" ", "")))
    # +1 keeps the denominator non-zero for an empty comment
    df['word_density'] = df['word_count'] / (df['char_count'] + 1)
    df['total_length'] = text.apply(len)
    df['capitals'] = text.apply(lambda comment: sum(1 for c in comment if c.isupper()))
    # Vectorised division: a zero-length comment yields NaN instead of the
    # ZeroDivisionError raised by a row-wise float division.
    df['caps_vs_length'] = df['capitals'] / df['total_length']
    df['num_exclamation_marks'] = text.apply(lambda x: x.count('!'))
    df['num_unique_words'] = text.apply(lambda x: len(set(x.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['word_count']

    # Candidate features not implemented: counts of masking symbols (*&$%#@),
    # punctuation (.,;:) and question marks. They would have to be computed
    # on the raw 'comment_text' column.
    return df

In [3]:
# reading the data

input_path = 'combined_data.csv'
# NOTE(review): some comments in the displayed output contain mojibake
# (e.g. "Ã¢Â..."), so the file may not really be latin-1 -- confirm the
# encoding of the source file.
df = pd.read_csv(input_path, encoding='latin-1')

print(df.shape)
display(df.head())

(223549, 10)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean,label_count
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1,0


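Since the labels are highly imbalanced (about 90% of the comments are clean), we undersample the clean comments: every non-clean comment is kept, together with a random sample of clean comments equal in size to the number of non-clean comments plus 10,000.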

In [4]:
# undersampling of clean comments

# number of clean records kept on top of a 1:1 match with the non-clean records
EXTRA_CLEAN_SAMPLES = 10000

is_clean = df['clean'] == 1
is_not_clean = df['clean'] == 0

# number of non-clean records
no_clean = int(is_not_clean.sum())

# indices of clean records
clean_indices = df.index[is_clean]

# random sample of clean records (seeded for reproducibility); the size is
# capped at the number of clean records because sampling without replacement
# cannot draw more than the population
np.random.seed(2019)
n_clean_to_keep = min(no_clean + EXTRA_CLEAN_SAMPLES, len(clean_indices))
random_clean_indices = np.random.choice(clean_indices, n_clean_to_keep, replace=False)

# indices of non-clean records
no_clean_indices = df.index[is_not_clean]

# concatenate clean and non-clean indices
under_sample_indices = np.concatenate([random_clean_indices, no_clean_indices])

# getting balanced data; explicit copy because samp_df is modified in later cells
samp_df = df.loc[under_sample_indices].copy()

# check number of records
print("Size before sampling:", len(df))
print("Size after sampling:", len(samp_df))

Size before sampling: 223549
Size after sampling: 54936


In [5]:
# check distributions before sampling

cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'clean']
total = len(df)

for col in cols:
    # rows where the label is set
    col_count = int((df[col] == 1).sum())
    percentage = round(100 * col_count / total, 2)
    print("\nLabel : {}".format(col))
    print("Percentage : {} %".format(percentage))
    print("Value Counts : \n{}".format(df[col].value_counts()))


Label : toxic
Percentage : 9.57 %
Value Counts : 
0    202165
1     21384
Name: toxic, dtype: int64

Label : severe_toxic
Percentage : 0.88 %
Value Counts : 
0    221587
1      1962
Name: severe_toxic, dtype: int64

Label : obscene
Percentage : 5.43 %
Value Counts : 
0    211409
1     12140
Name: obscene, dtype: int64

Label : threat
Percentage : 0.31 %
Value Counts : 
0    222860
1       689
Name: threat, dtype: int64

Label : insult
Percentage : 5.06 %
Value Counts : 
0    212245
1     11304
Name: insult, dtype: int64

Label : identity_hate
Percentage : 0.95 %
Value Counts : 
0    221432
1      2117
Name: identity_hate, dtype: int64

Label : clean
Percentage : 89.95 %
Value Counts : 
1    201081
0     22468
Name: clean, dtype: int64


In [6]:
# check distributions after sampling

cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'clean']
total = len(samp_df)

for col in cols:
    label_values = samp_df[col]
    # rows of the undersampled frame where the label is set
    col_count = int((label_values == 1).sum())
    percentage = round(100 * col_count / total, 2)
    print("\nLabel : {}".format(col))
    print("Percentage : {} %".format(percentage))
    print("Value Counts : \n{}".format(label_values.value_counts()))


Label : toxic
Percentage : 38.93 %
Value Counts : 
0    33552
1    21384
Name: toxic, dtype: int64

Label : severe_toxic
Percentage : 3.57 %
Value Counts : 
0    52974
1     1962
Name: severe_toxic, dtype: int64

Label : obscene
Percentage : 22.1 %
Value Counts : 
0    42796
1    12140
Name: obscene, dtype: int64

Label : threat
Percentage : 1.25 %
Value Counts : 
0    54247
1      689
Name: threat, dtype: int64

Label : insult
Percentage : 20.58 %
Value Counts : 
0    43632
1    11304
Name: insult, dtype: int64

Label : identity_hate
Percentage : 3.85 %
Value Counts : 
0    52819
1     2117
Name: identity_hate, dtype: int64

Label : clean
Percentage : 59.1 %
Value Counts : 
1    32468
0    22468
Name: clean, dtype: int64


##### OBSERVATIONS:<br>
    The ratio of clean:toxic comments is now roughly 60:40.

In [7]:
# clean the comment text

def clean_comment(text):
    """Run one raw comment string through the cleaning steps, in order."""
    text = remove_url(text)
    # to_lower is left disabled: the 'capitals' feature computed later
    # needs the original casing
    text = decontract(text)
    text = remove_stop_words(text)
    text = remove_escape_sequences(text)
    # strip_punctuation is left disabled: remove_special_characters below
    # already drops everything except letters, digits and '!'
    text = remove_tags(text)
    text = remove_numeric(text)
    text = remove_special_characters(text)
    text = perform_lemmatization(text)
    return text

# one pass over the column instead of iterrows() with per-cell .at writes;
# str() turns missing values into text, as before
samp_df['cleaned_comment_text'] = samp_df['comment_text'].apply(
    lambda raw: clean_comment(str(raw))
)

# drop rows whose cleaned text is missing or empty; .copy() gives an
# independent frame so later column additions do not write to a slice
cleaned = samp_df['cleaned_comment_text']
samp_df = samp_df[cleaned.notnull() & (cleaned != '')].copy()

print(samp_df.shape)
display(samp_df.head())

(54870, 11)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean,label_count,cleaned_comment_text
22705,3bf7c95e20e164f1,"you do interesting work! \n\nReally, whatever ...",0,0,0,0,0,0,1,0,interest work ! Really whatever outcome Aether...
195108,8dfbb26d7edb4e39,Let me see if I understand you. Because the si...,0,0,0,0,0,0,1,0,Let see I understand you Because site run reli...
88427,ec8eb2974a3b7686,"Yes, I will try to cut out the details to make...",0,0,0,0,0,0,1,0,Yes I try cut detail make compact
112237,587075cbd150aee1,"timestamp to unsigned templates, so you do it ...",0,0,0,0,0,0,1,0,timestamp unsigned templates me Thanks ! c
81072,d8d76d16e8369ff8,I replaced the quote as to my count three cont...,0,0,0,0,0,0,1,0,I replace quote count three contributers seem ...


In [8]:
# add some SLF (Sentence Level Features)

# make_features adds the columns to the frame it receives and returns that same frame
samp_df = make_features(samp_df)

print(samp_df.shape)
display(samp_df.head(), samp_df.tail())

(54870, 20)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean,label_count,cleaned_comment_text,word_count,char_count,word_density,total_length,capitals,caps_vs_length,num_exclamation_marks,num_unique_words,words_vs_unique
22705,3bf7c95e20e164f1,"you do interesting work! \n\nReally, whatever ...",0,0,0,0,0,0,1,0,interest work ! Really whatever outcome Aether...,15,93,0.159574,107,3,0.028037,1,14,0.933333
195108,8dfbb26d7edb4e39,Let me see if I understand you. Because the si...,0,0,0,0,0,0,1,0,Let see I understand you Because site run reli...,22,118,0.184874,139,6,0.043165,0,20,0.909091
88427,ec8eb2974a3b7686,"Yes, I will try to cut out the details to make...",0,0,0,0,0,0,1,0,Yes I try cut detail make compact,7,27,0.25,33,2,0.060606,0,7,1.0
112237,587075cbd150aee1,"timestamp to unsigned templates, so you do it ...",0,0,0,0,0,0,1,0,timestamp unsigned templates me Thanks ! c,7,36,0.189189,42,1,0.02381,1,7,1.0
81072,d8d76d16e8369ff8,I replaced the quote as to my count three cont...,0,0,0,0,0,0,1,0,I replace quote count three contributers seem ...,31,179,0.172222,209,2,0.009569,0,30,0.967742


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean,label_count,cleaned_comment_text,word_count,char_count,word_density,total_length,capitals,caps_vs_length,num_exclamation_marks,num_unique_words,words_vs_unique
223448,ff91c3d8a3e34398,NIGEL IS A CRAZY IDIOT!!!,1,0,0,0,1,0,0,2,NIGEL IS A CRAZY IDIOT ! ! !,8,21,0.363636,28,18,0.642857,3,6,0.75
223506,ffd49b8defd069d0,""" \n ::Well, now don't I feel stupid.... ÃÂ...",0,0,0,0,1,0,0,1,Well I feel stupid,4,15,0.25,18,2,0.111111,0,4,1.0
223516,ffdf6854b41d9102,==Fourth Baldrick possibly being cleverer than...,1,0,0,0,0,0,0,1,Fourth Baldrick possibly cleverer make out Doe...,22,125,0.174603,146,6,0.041096,0,19,0.863636
223533,ffebe90c8d5acaba,""" \n\n == IRAN == \n ThatÃ¢ÂÂs right, Iran. ...",1,0,1,0,0,0,0,2,IRAN That s right Iran It drone And spread hom...,208,1131,0.183746,1338,223,0.166667,1,168,0.807692
223547,fffac2a094c8e0e2,MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...,1,0,1,0,1,0,0,3,MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...,27,109,0.245455,135,109,0.807407,0,25,0.925926


In [9]:
# export df to csv

output_path = 'sampdf_feat.csv'
# index=False: the row index is not written to the file
samp_df.to_csv(output_path, index=False)