## An NLP problem where tweets are labeled as either hate speech or not hate speech.

- The tweets are cleaned using regular-expression functions such as `re.sub`, and custom functions are then used to extract features from the cleaned tweets.

- The modeling is then done with a simple Gaussian Naive Bayes model, and the accuracy score is used as the metric to evaluate the model.

- The dataset consists of 3 columns: ID, Tweets and Labels. Labels are binary, indicating whether the tweet is hate speech or not.

In [None]:
import pandas as pd #Importing packages pandas and numpy
import numpy as np


In [None]:
# Load the labelled tweets from the CSV file, then preview the first five rows.
tweet_data = pd.read_csv('final_dataset_basicmlmodel.csv')
tweet_data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [None]:
# Print five tweets in full with a 1-based counter.
# NOTE: the slice [1:6] starts at the second row, so the very first tweet (row 0) is not shown.
for position, tweet in enumerate(tweet_data.tweet[1:6], start=1):
  print(position, " ", tweet)

1   @user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked
2     bihday your majesty
3   #model   i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦  
4    factsguide: society now    #motivation
5   [2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo  


In [None]:
import re    #Importing regular expressions package....

In [None]:
import re

#Clean text from noise
def clean_text(text):
    """Normalise a raw tweet to lowercase ASCII letters, apostrophes and spaces.

    Every character that is not an ASCII letter or an apostrophe (``@``, ``#``,
    digits, punctuation, existing whitespace, emoji and any other non-ASCII
    character) is replaced by a single space, so the result has the same
    length as the input. Apostrophes are kept so that contractions such as
    "can't" stay intact. The result is then lower-cased.

    Parameters
    ----------
    text : str
        The raw tweet.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # One substitution is enough: the negated character class already
    # excludes every non-ASCII character, so a separate "remove Unicode"
    # pass would never find anything left to remove.
    return re.sub(r"[^a-zA-Z']", " ", text).lower()

In [None]:
# Add a 'clean_text' column holding the cleaned version of every tweet.
tweet_data['clean_text'] = tweet_data['tweet'].apply(clean_text)

In [None]:
tweet_data.head()

Unnamed: 0,id,label,tweet,clean_text
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i can't us...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation,factsguide society now motivation


In [None]:
# Custom list of English stop words (function words and their contractions).
# It also contains the tokens 'http', 'www', 'com', 'k' and 'r'.
# gen_frequency removes these words from the word-frequency table via Series.drop.

STOP_WORDS = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'also', 'am', 'an', 'and',
              'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below',
              'between', 'both', 'but', 'by', 'can', "can't", 'cannot', 'com', 'could', "couldn't", 'did',
              "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'else', 'ever',
              'few', 'for', 'from', 'further', 'get', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having',
              'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how',
              "how's", 'however', 'http', 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it',
              "it's", 'its', 'itself', 'just', 'k', "let's", 'like', 'me', 'more', 'most', "mustn't", 'my', 'myself',
              'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'otherwise', 'ought', 'our', 'ours',
              'ourselves', 'out', 'over', 'own', 'r', 'same', 'shall', "shan't", 'she', "she'd", "she'll", "she's",
              'should', "shouldn't", 'since', 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs',
              'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", "they'll", "they're",
              "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't",
              'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
              "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would', "wouldn't",
              'www', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']



In [None]:
# Count how often each non-stop word is used across the whole dataset.
def gen_frequency(text, stop_words=None):
    """Return the frequency of every non-stop word, most frequent first.

    Parameters
    ----------
    text : object with a ``split()`` method
        Usually the ``.str`` accessor of a pandas Series of tweets, in which
        case ``split()`` yields one list of words per row. A plain string is
        also accepted; its words are counted directly.
    stop_words : list-like of str, optional
        Words to remove from the result. Defaults to the module-level
        ``STOP_WORDS`` list.

    Returns
    -------
    pandas.Series
        Indexed by word, holding the number of occurrences, in the order
        produced by ``Series.value_counts()`` (descending count).
    """
    if stop_words is None:
        stop_words = STOP_WORDS

    word_list = []
    for tw_words in text.split():
        if isinstance(tw_words, str):
            # Plain-string input: split() yields individual words.
            word_list.append(tw_words)
        elif hasattr(tw_words, "__iter__"):
            # Series input: split() yields one list of words per row.
            word_list.extend(tw_words)
        # Anything else (e.g. NaN for a missing tweet) contributes no words.

    # Count only after *all* rows have been collected. Doing this inside the
    # loop would return after the first row and ignore the rest of the data.
    word_freq = pd.Series(word_list).value_counts()
    return word_freq.drop(list(stop_words), errors='ignore')





In [None]:
# Flag tweets that contain a negation.
def any_neg(words):
    """Return 1 if any word in *words* is a negation, otherwise 0.

    A word counts as a negation when it is one of 'no', 'not', 'n', 'non',
    or when it contains a word character followed by "n't" (e.g. "can't",
    "doesn't").

    Parameters
    ----------
    words : iterable of str
        The tokens of one tweet.

    Returns
    -------
    int
        1 if a negation was found, 0 otherwise (including for empty input).
    """
    negations = ('no', 'not', 'n', 'non')
    # Every word must be examined; returning 0 as soon as the first word is
    # not a negation would miss negations later in the tweet.
    found = any(
        word in negations or re.search(r"\wn't", word)
        for word in words
    )
    return int(found)

In [None]:
# Flag tweets that use at least one of the rare (least frequently used) words.

def any_rare(words, rare_100):
    """Return 1 if any word in *words* is contained in *rare_100*, otherwise 0.

    Parameters
    ----------
    words : iterable of str
        The tokens of one tweet.
    rare_100 : container of str
        The rare words, tested with the ``in`` operator. For a pandas Series
        ``in`` tests the index labels, so a word-indexed frequency Series
        works as well as a list or set of words.

    Returns
    -------
    int
        1 if a rare word was found, 0 otherwise (including for empty input).
    """
    # Check every word, not just the first one.
    return int(any(word in rare_100 for word in words))

In [None]:
# Flag tweets that contain a question word.

def is_question(words):
    """Return 1 if *words* contains a question word, otherwise 0.

    The question words checked are 'when', 'where', 'who', 'what' and 'how'.

    Parameters
    ----------
    words : iterable of str
        The tokens of one tweet.

    Returns
    -------
    int
        1 if a question word was found, 0 otherwise (including for empty
        input).
    """
    question_words = ('when', 'where', 'who', 'what', 'how')
    # Check every word, not just the first one.
    return int(any(word in question_words for word in words))

  

In [None]:
# Apply the frequency generator function to the cleaned tweets.
# The .str accessor is passed so that the text.split() call inside gen_frequency tokenises every row.

words_freq = gen_frequency(tweet_data.clean_text.str)

In [None]:
# Pull out the rare words from the frequency table built by gen_frequency.
# value_counts() orders words by descending count, so the last 100 entries are the
# lowest-count words in the table (all of them if it holds fewer than 100).

rare_100 = words_freq[-100:]

In [None]:
# Derive the numeric model features from the cleaned text.
# Each tweet is tokenised once and the token lists are reused for every word-level feature.

tokens = tweet_data.clean_text.str.split()

tweet_data['word count'] = tokens.apply(len)                                       # number of words
tweet_data['Negation'] = tokens.apply(any_neg)                                     # 1 if a negation is present
tweet_data['Rare words'] = tokens.apply(lambda words: any_rare(words, rare_100))   # 1 if a rare word is present
tweet_data['Is it a question'] = tokens.apply(is_question)                         # 1 if a question word is present
tweet_data['Character num'] = tweet_data.clean_text.apply(len)                     # number of characters (spaces included)

In [None]:
#The new appended dataset....

tweet_data.head() 

Unnamed: 0,id,label,tweet,clean_text,word count,Negation,Rare words,Is it a question,Character num
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is s...,18,0,1,0,102
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i can't us...,19,0,1,0,122
2,3,0,bihday your majesty,bihday your majesty,3,0,0,0,21
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ...,12,0,0,0,86
4,5,0,factsguide: society now #motivation,factsguide society now motivation,4,0,0,0,39


In [None]:
tweet_data.columns

Index(['id', 'label', 'tweet', 'clean_text', 'word count', 'Negation',
       'Rare words', 'Is it a question', 'Character num'],
      dtype='object')

## Modeling :

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: the five engineered columns. Target: the binary 'label' column.
X = tweet_data[['word count', 'Negation','Rare words', 'Is it a question', 'Character num']]
y = tweet_data.label

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [None]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian Naive Bayes classifier with default settings on the training split.
model = GaussianNB()
model.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
# Predict a label for every row of the held-out test set.
predictions  = model.predict(X_test)

In [None]:
# Confusion table: rows are the predicted labels, columns are the true labels from y_test.
pd.crosstab(predictions,y_test)

label,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,430,233
1,178,208


In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Fraction of test tweets whose predicted label equals the true label.
# scikit-learn's signature is accuracy_score(y_true, y_pred), so the ground truth goes first.
accuracy_score(y_test, predictions)

0.6081982840800763