In [1]:
import pandas as pd

dataset = pd.read_csv("final_dataset_basicmlmodel.csv")
dataset.head()

#label is the column that contains the target variable or the value that has to be predicted. 
#1 means it's a hate speech and 0 means it is not.

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [2]:
dataset.label.unique()

array([0, 1], dtype=int64)

In [3]:
for index,tweet in enumerate(dataset["tweet"][10:15]):
    print(index,".",tweet)

0 .  â #ireland consumer price index (mom) climbed from previous 0.2% to 0.5% in may   #blog #silver #gold #forex
1 . we are so selfish. #orlando #standwithorlando #pulseshooting #orlandoshooting #biggerproblems #selfish #heabreaking   #values #love #
2 . i get to see my daddy today!!   #80days #gettingfed
3 . ouch...junior is angryð#got7 #junior #yugyoem   #omg 
4 . i am thankful for having a paner. #thankful #positive     


#### Lot of noise in the dataset and weired symbols in 

In [4]:
import re

def clean_text(text):
    #Filter to allow only texts
    text = re.sub(r'[^a-zA-Z\']',' ',text)
    
    #Remove Unicode Characters
    text = re.sub(r'[^\x00-\x7F]+','',text)
    
    #convert to lower case
    text = text.lower()
    
    return text
    

In [5]:
dataset['clean_text'] = dataset.tweet.apply(lambda x:clean_text(x))

In [6]:
dataset

Unnamed: 0,id,label,tweet,clean_text
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i can't us...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation,factsguide society now motivation
...,...,...,...,...
5237,31935,1,lady banned from kentucky mall. @user #jcpenn...,lady banned from kentucky mall user jcpenn...
5238,31947,1,@user omfg i'm offended! i'm a mailbox and i'...,user omfg i'm offended i'm a mailbox and i'...
5239,31948,1,@user @user you don't have the balls to hashta...,user user you don't have the balls to hashta...
5240,31949,1,"makes you ask yourself, who am i? then am i a...",makes you ask yourself who am i then am i a...


## Feature Engineering

In [7]:
STOP_WORDS = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'also', 'am', 'an', 'and',
              'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below',
              'between', 'both', 'but', 'by', 'can', "can't", 'cannot', 'com', 'could', "couldn't", 'did',
              "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'else', 'ever',
              'few', 'for', 'from', 'further', 'get', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having',
              'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how',
              "how's", 'however', 'http', 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it',
              "it's", 'its', 'itself', 'just', 'k', "let's", 'like', 'me', 'more', 'most', "mustn't", 'my', 'myself',
              'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'otherwise', 'ought', 'our', 'ours',
              'ourselves', 'out', 'over', 'own', 'r', 'same', 'shall', "shan't", 'she', "she'd", "she'll", "she's",
              'should', "shouldn't", 'since', 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs',
              'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", "they'll", "they're",
              "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't",
              'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
              "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would', "wouldn't",
              'www', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']

In [8]:
#Let's generate word frequency
def gen_freq(text):
    #Will store the list of words
    word_list = []

    #Loop over all the tweets and extract words into word_list
    for tw_words in text.split():
        word_list.extend(tw_words)

    #Create word frequencies using word_list
    word_freq = pd.Series(word_list).value_counts()
    
    #Drop the stopwords during the frequency calculation
    word_freq = word_freq.drop(STOP_WORDS, errors='ignore')
    
    return word_freq

In [9]:
##Check whether a negation term is present in the text

def any_neg(words):
    for word in words:
        if word in ['n','no','not','non'] or re.search(r"\wn't",word):
                return 1
        else:
                return 0
                                                       
#Check whether one of the 100 rare words is present in the text
def any_rare(words, rare_100):
    for word in words:
        if word in rare_100:
            return 1
    else:
        return 0

#Check whether prompt words are present
def is_question(words):
    for word in words:
        if word in ['when', 'what', 'how', 'why', 'who']:
            return 1
    else:
        return 0                                                       

In [10]:
word_freq = gen_freq(dataset.clean_text.str)
#100 most rare words in the dataset
rare_100 = word_freq[-100:]
#Number of words in a tweet
dataset['word_count'] = dataset.clean_text.str.split().apply(lambda x: len(x))
#Negation present or not
dataset['any_neg'] = dataset.clean_text.str.split().apply(lambda x: any_neg(x))
#Prompt present or not
dataset['is_question'] = dataset.clean_text.str.split().apply(lambda x: is_question(x))
#Any of the most 100 rare words present or not
dataset['any_rare'] = dataset.clean_text.str.split().apply(lambda x: any_rare(x, rare_100))
#Character count of the tweet
dataset['char_count'] = dataset.clean_text.apply(lambda x: len(x))

In [11]:
#top 10 common words are
gen_freq(dataset.clean_text.str)[:10]

user      3351
amp        439
love       320
day        254
trump      214
happy      207
will       191
people     186
new        171
u          158
dtype: int64

In [12]:
dataset.head()

Unnamed: 0,id,label,tweet,clean_text,word_count,any_neg,is_question,any_rare,char_count
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is s...,18,0,1,0,102
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i can't us...,19,0,0,0,122
2,3,0,bihday your majesty,bihday your majesty,3,0,0,0,21
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ...,12,0,0,0,86
4,5,0,factsguide: society now #motivation,factsguide society now motivation,4,0,0,0,39


In [13]:
# split the data 
from sklearn.model_selection import train_test_split

x = dataset[['word_count','any_neg','is_question','any_rare','char_count']]
y = dataset.label

In [14]:
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.1,random_state=27)

#### Let's train model 

In [15]:
# We are using naivebayes classifier

from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

In [16]:
#let's train on train dataset
model = model.fit(X_train,y_train)

In [17]:
#predicting on test dataset
predict = model.predict(X_test)

#### Let's evaluate the model

In [18]:
from sklearn.metrics import accuracy_score

print("Accuracy:",accuracy_score(y_test,predict)*100,"%")

Accuracy: 58.857142857142854 %


In [19]:
from sklearn import metrics
import numpy as np
print('MAE:', metrics.mean_absolute_error(y_test, predict))
print('MSE:', metrics.mean_squared_error(y_test, predict))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predict)))

MAE: 0.4114285714285714
MSE: 0.4114285714285714
RMSE: 0.6414269805898185
