In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

%matplotlib inline

In [2]:
# Read the raw CSV once; `data_df` is the working frame that the
# following cells reshape.
DATASET_PATH = "dataset_2.csv"
data = pd.read_csv(DATASET_PATH)
data_df = pd.DataFrame(data)

In [3]:
# Preview the first rows of the freshly loaded frame.
data_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,file_id,label,Tweet
0,0,0,12834217_1,noHate,"As of March 13th , 2014 , the booklet had been..."
1,1,1,12834217_2,noHate,In order to help increase the booklets downloa...
2,2,2,12834217_3,noHate,( Simply copy and paste the following text int...
3,3,3,12834217_4,hate,Click below for a FREE download of a colorfull...
4,4,4,12834217_5,noHate,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...


In [4]:
# Discard the columns that are not used in the rest of the notebook.
unused_columns = ["file_id", "Unnamed: 0.1"]
data_df = data_df.drop(columns=unused_columns)

In [5]:
# Give the remaining columns their working names: the unnamed index
# column becomes "ID", "label" becomes "Class" and "Tweet" becomes "tweet".
column_names = {
    "Unnamed: 0": "ID",
    "label": "Class",
    "Tweet": "tweet",
}
data_df = data_df.rename(columns=column_names)

In [6]:
# Summarise dtypes and non-null counts per column (reveals missing tweets).
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10944 entries, 0 to 10943
Data columns (total 3 columns):
ID       10944 non-null int64
Class    10944 non-null object
tweet    10884 non-null object
dtypes: int64(1), object(2)
memory usage: 256.6+ KB


In [7]:
# Split the frame into a text side and a label side. Both keep the ID
# column. These names are rebound later by the cross-validation cells.
X = data_df.drop(columns=["Class"])
y = data_df.drop(columns=["tweet"])


In [8]:
# Preview the text-side frame (ID and tweet columns).
X.head()

Unnamed: 0,ID,tweet
0,0,"As of March 13th , 2014 , the booklet had been..."
1,1,In order to help increase the booklets downloa...
2,2,( Simply copy and paste the following text int...
3,3,Click below for a FREE download of a colorfull...
4,4,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...


In [9]:
# Preview the label-side frame (ID and Class columns).
y.head()

Unnamed: 0,ID,Class
0,0,noHate
1,1,noHate
2,2,noHate
3,3,hate
4,4,noHate


In [10]:
# Replace missing tweets with an empty string so the vectorizer receives
# only text values.
data_df["tweet"] = data_df["tweet"].fillna("")

In [11]:
def bag_word(documents, stop_word=True, n_gram=(1,1)):
    """Turn a corpus into a dense TF-IDF feature matrix.

    :param documents: iterable of raw text documents (the corpus).
    :param stop_word: when truthy (default), English stop words are removed
        (``stop_words='english'``); pass False to keep them.
    :param n_gram: ``ngram_range`` handed to ``TfidfVectorizer``.
        (1, 1) --> unigram (default)
        (2, 2) --> bigram
        (3, 3) --> 3-gram
        (4, 4) --> 4-gram
        Any two-item sequence is accepted and normalised to a tuple.
    :return: a dense numpy array with one row per document.

    For every range other than pure unigrams, ``min_df`` is set to 0.0005, so
    terms whose document frequency is strictly lower than that proportion are
    ignored. Unigrams use the vectorizer's default ``min_df``.
    """
    # Normalise first: a list such as [1, 1] never compares equal to the tuple
    # (1, 1), which would silently apply the min_df threshold to unigrams.
    n_gram = tuple(n_gram)
    min_document_frequency = 0.0005

    # Start from the shared option and add only what this configuration
    # needs; omitted options fall back to the TfidfVectorizer defaults.
    vectorizer_options = {"ngram_range": n_gram}
    if stop_word:
        vectorizer_options["stop_words"] = 'english'
    if n_gram != (1, 1):
        vectorizer_options["min_df"] = min_document_frequency

    tfidfconverter = TfidfVectorizer(**vectorizer_options)

    # Dense output is kept for backward compatibility with existing callers.
    X = tfidfconverter.fit_transform(documents).toarray()

    return X

In [12]:
def get_accuracy(prediction, y_test):
    """Return the fraction of predictions that match the true labels.

    :param prediction: array-like of predicted labels.
    :param y_test: array-like of true labels (pandas Series, numpy array or
        list); compared with ``prediction`` position by position.
    :return: accuracy as a float in [0, 1].
    :raises ValueError: if the inputs are empty or differ in length.
    """
    # Flatten both inputs so they are compared purely by position, whatever
    # index a Series may carry.
    predicted = np.asarray(prediction).ravel()
    actual = np.asarray(y_test).ravel()

    if predicted.size != actual.size:
        raise ValueError(
            "prediction and y_test must have the same length, got "
            f"{predicted.size} and {actual.size}"
        )
    if predicted.size == 0:
        raise ValueError("cannot compute accuracy of empty input")

    correct = int(np.count_nonzero(predicted == actual))
    return correct / predicted.size

In [14]:
# Cross-validated (cv=10) accuracy of GaussianNB on TF-IDF features, for
# unigrams and bigrams, each with and without English stop-word removal.
# NOTE(review): bag_word fits the vectorizer on the whole corpus before
# cross_val_score splits it, so the vocabulary and IDF weights also see the
# held-out folds - consider vectorizing inside a Pipeline.
acc_list = []
tweets = data_df['tweet'].values.astype('U')
y = data_df["Class"].values.astype('U')
for ngram_size in (1, 2):
    for drop_stop_words in (True, False):
        X = bag_word(tweets, stop_word=drop_stop_words, n_gram=(ngram_size, ngram_size))
        clfrNB = GaussianNB()
        scores = cross_val_score(clfrNB, X, y, cv=10)
        acc = scores.mean()
        acc_list.append(acc)
        setting = "- gram without stopwords : " if drop_stop_words else "- gram with stopwords : "
        print("GaussianNB ", ngram_size, setting, acc)

GaussianNB  1 - gram without stopwords :  0.5943965931576287
GaussianNB  1 - gram with stopwords :  0.5992407084051641
GaussianNB  2 - gram without stopwords :  0.09475511324574573
GaussianNB  2 - gram with stopwords :  0.4405242460786165


In [15]:
# Cross-validated (cv=10) accuracy of MultinomialNB on TF-IDF features, for
# unigrams and bigrams, each with and without English stop-word removal.
# This cell starts acc_list afresh, so it ends up holding only these scores.
# NOTE(review): bag_word fits the vectorizer on the whole corpus before
# cross_val_score splits it, so the vocabulary and IDF weights also see the
# held-out folds - consider vectorizing inside a Pipeline.
acc_list = []
tweets = data_df['tweet'].values.astype('U')
y = data_df["Class"].values.astype('U')
for ngram_size in (1, 2):
    for drop_stop_words in (True, False):
        X = bag_word(tweets, stop_word=drop_stop_words, n_gram=(ngram_size, ngram_size))
        clfrNB = MultinomialNB()
        scores = cross_val_score(clfrNB, X, y, cv=10)
        acc = scores.mean()
        acc_list.append(acc)
        setting = "- gram without stopwords : " if drop_stop_words else "- gram with stopwords : "
        print("MultinomialNB ", ngram_size, setting, acc)

MultinomialNB  1 - gram without stopwords :  0.86842264455724
MultinomialNB  1 - gram with stopwords :  0.8683314038336587
MultinomialNB  2 - gram without stopwords :  0.8682389097264338
MultinomialNB  2 - gram with stopwords :  0.8685140522354852
