# Natural Language Processing: Hate Speech Classification

In [1]:
import pandas as pd
import string
import re
# Tweets are at most 280 characters long; widen the column so they are shown in full.
pd.set_option("display.max_colwidth", 300)

In [2]:
# Read the labelled tweets from disk, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer='cyberbullying_tweets.csv')
df.head(n=5)

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was crapilicious! #mkr",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImACelebrityAU #today #sunrise #studio10 #Neighbours #WonderlandTen #etc,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red velvet cupcakes?,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, but not too concerned about another angry dude on twitter.",not_cyberbullying
4,"@RudhoeEnglish This is an ISIS account pretending to be a Kurdish account. Like Islam, it is all lies.",not_cyberbullying


In [3]:
# Peek at rows 10-19 of the tweet text column.
df['tweet_text'].iloc[10:20]

10                                                                                                          @Jord_Is_Dead http://t.co/UsQInYW5Gn
11                                                                                              The Bully flushes on KD http://twitvid.com/A2TNP
12                                                                                                                                   Ughhhh #MKR
13    RT @Kurdsnews: Turkish state has killed 241 children in last 11 years http://t.co/JlvkE1epws  #news  ##GoogleÇeviriciTopluluğuKürtçeyideE…
14                         Love that the best response to the hotcakes they managed to film was a non-committal "meh" from some adolescent. #MKR
15                                                               @yasmimcaci @Bferrarii PAREM DE FAZER BULLYING COMIGO =( UHAHUAH BANDO DE PRETO
16                                                                       @sarinhacoral @Victor_Maggi tadinhu de mim , sofrendo bul

In [4]:
def text_cleaner(tweet):
    """Strip Twitter-specific noise from a tweet and normalise its spacing.

    Three regex passes are applied in order:

    1. Delete @mentions, URLs starting with ``http``, ``#`` signs and every
       other character that is neither a word character nor whitespace.
    2. Replace each remaining non-word character (tabs and newlines
       included) and each digit with a single space.
    3. Collapse runs of two or more spaces into one space.

    Leading and trailing spaces are not stripped, and underscores are kept
    because the regex word class treats ``_`` as a word character.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet.
    """
    # NOTE: the digit range must be written with an ASCII hyphen (0-9). With an
    # en dash the class only matches the literal characters '0', en dash and
    # '9', so a handle such as "@user123abc" is cut at "@user" and the tail
    # "abc" leaks into the cleaned text.
    tweet = re.sub(r"(@[A-Za-z0-9_]+)|[^\w\s]|#|http\S+", "", tweet)  # removes twitter users, links, non-word chars
    tweet = re.sub(r"\W|\d", " ", tweet)  # remaining non-word chars and digits become spaces
    tweet = re.sub(r"[ ]{2,}", " ", tweet)  # removes occurrences of more than one space between words
    return tweet
    

In [5]:
# Clean every tweet; text_cleaner takes a single string, so it can be passed directly.
df["tweet_text"] = df["tweet_text"].apply(text_cleaner)

In [6]:
# Same rows as before, now after text_cleaner has been applied.
df['tweet_text'].iloc[10:20]

10                                                                                                                    
11                                                                                            The Bully flushes on KD 
12                                                                                                          Ughhhh MKR
13                          RT Turkish state has killed children in last years news GoogleÇeviriciTopluluğuKürtçeyideE
14    Love that the best response to the hotcakes they managed to film was a noncommittal meh from some adolescent MKR
15                                                               PAREM DE FAZER BULLYING COMIGO UHAHUAH BANDO DE PRETO
16                                                                            tadinhu de mim sofrendo bulling viu MIMI
17                                                        dea twitter is basically the angry letters of our generation
18                     Best pick up line Hi your

We can still see some non-ASCII characters, e.g. "GoogleÇeviriciTopluluğuKürtçeyid". We can apply 
a lambda function that eliminates characters that are not in standard ASCII.

In [7]:
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

We can apply a lambda function that excludes any character that is not in the above list. 
At the same time, we can convert all letters to lowercase by applying .lower() when passing
the tweet to non_ascii_filter.

In [8]:
# Lookup set of printable ASCII characters, built once rather than once per
# character of every tweet.
_PRINTABLE_CHARS = frozenset(string.printable)


def non_ascii_filter(tweet):
    """Return ``tweet`` with every character outside ``string.printable`` removed.

    ``string.printable`` holds ASCII digits, letters, punctuation and
    whitespace (space, tab, newline, carriage return, vertical tab, form
    feed); any other character is dropped and the rest keep their order.

    Parameters
    ----------
    tweet : str
        Text to filter.

    Returns
    -------
    str
        The text restricted to printable ASCII characters.
    """
    return ''.join(ch for ch in tweet if ch in _PRINTABLE_CHARS)
# Lower-case each tweet first, then drop anything that is not printable ASCII.
df["tweet_text"] = df["tweet_text"].apply(lambda tweet: non_ascii_filter(tweet.lower()))


In [9]:
# Same rows once more, after lower-casing and the printable-ASCII filter.
df['tweet_text'].iloc[10:20]

10                                                                                                                    
11                                                                                            the bully flushes on kd 
12                                                                                                          ughhhh mkr
13                              rt turkish state has killed children in last years news googleeviricitopluluukrteyidee
14    love that the best response to the hotcakes they managed to film was a noncommittal meh from some adolescent mkr
15                                                               parem de fazer bullying comigo uhahuah bando de preto
16                                                                            tadinhu de mim sofrendo bulling viu mimi
17                                                        dea twitter is basically the angry letters of our generation
18                     best pick up line hi your

In [None]:
# Summary statistics for the columns of df.
df.describe()