# Implementing Naive Bayes

In [24]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.svm import SVC

In [25]:
# Switch matplotlib to the 'ggplot' style sheet for figures drawn from here on.
plt.style.use('ggplot')

In [26]:
# Load the training CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('../Data/reddit_training.csv')

In [27]:
# Derive an integer target column from the textual tag: 'yes' -> 1, 'no' -> 0.
# Series.map leaves NaN for any tag that is not a key of the mapping.
data['sarcasm'] = data.sarcasm_tag.map(
    {
        'yes': 1,
        'no': 0,
    }
)

In [28]:
# Preview the first few comments tagged as sarcastic.
data.loc[data['sarcasm_tag'] == 'yes'].head()

Unnamed: 0,index,body,author,created_utc,subreddit_id,link_id,parent_id,score,id,subreddit,sarcasm_tag,sarcasm
32,3821,&gt; It's just not. \n\nWow way to really cont...,ropeadoped,1441425716,t5_2qh6e,t3_3jp4fr,t1_cur6sdt,-4,cur83kh,television,yes,1
116,3822,Maybe you shouldn't be in a thread about the b...,Damn_Dog_Inappropes,1441486521,t5_2qh6e,t3_3jpljt,t1_curjpl4,4,curu8ht,television,yes,1
121,3824,FUCK!! FUCK!! FUCK!!! FUUCCCCCCCCK!!! BOMBOBLO...,Kush_Daz,1441491179,t5_2qh6e,t3_3jpljt,t1_curqbij,0,curwkwl,television,yes,1
126,3823,OH MY GOD I AM SO SORRY!!!! FUCK! I feel so ba...,Joename_,1441498254,t5_2qh6e,t3_3jpljt,t1_curwkwl,2,cus02ym,television,yes,1
161,3825,You mean piracy?,PicopicoEMD,1441583828,t5_2qh6e,t3_3jw7qx,t3_3jw7qx,-1,cusy92j,television,yes,1


In [29]:
# TF-IDF vectorizer configuration:
#   * token_pattern expands to a character class of ASCII letters, so a token is
#     any run of letters and every other character acts as a separator;
#   * stop_words is NLTK's English stop-word list
#     (presumably requires the NLTK 'stopwords' corpus to be installed - confirm).
tfid = TfidfVectorizer(
    token_pattern=r'[%s]+' % string.ascii_letters,
    stop_words=stopwords.words('english'),
)

In [30]:
# Fit the vectorizer on every comment body and convert the sparse result to a
# dense NumPy array.
# NOTE(review): the vocabulary and IDF weights are learned from all rows,
# including the rows later held out for testing - confirm this is acceptable.
tf_idf_array = tfid.fit_transform(data['body']).toarray()

In [31]:
# Wrap the dense TF-IDF matrix in a DataFrame with one column per vocabulary term.
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; use the new accessor when it exists and fall back
# to the old one so the cell still runs on older releases.
feature_names = (
    tfid.get_feature_names_out()
    if hasattr(tfid, 'get_feature_names_out')
    else tfid.get_feature_names()
)
tf_idf_array = pd.DataFrame(data=tf_idf_array, columns=feature_names)

In [32]:
# (number of comments, number of vocabulary terms)
tf_idf_array.shape

(2694, 7519)

In [33]:
# Persist the feature matrix to disk; index=False keeps the row index out of the file.
tf_idf_array.to_csv('tf-idf.csv', index=False)

# Loading Data from Disk

In [34]:
# Read the TF-IDF feature matrix back from the 'tf-idf.csv' file.
tf_idf = pd.read_csv('tf-idf.csv')

In [35]:
# Preview the first rows of the reloaded feature matrix.
tf_idf.head()

Unnamed: 0,aaaaaaiiiiiiiii,aaand,aaawwww,aang,aaron,abandoned,abbout,abc,abduction,abilities,...,zap,zbzct,zenith,zero,zippity,zoey,zombie,zone,zoo,zucluyhxsec
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Splitting Data

In [37]:
# Positional hold-out split: the first N_TRAIN rows train the models and the
# remaining rows are kept for evaluation.
#
# The previous `.ix` indexer was removed in pandas 1.0. On an integer index its
# slices were label-based and end-inclusive, so `.ix[:2000]` and `.ix[2000:]`
# both contained row 2000, i.e. one test row was also a training row.
# `.iloc` slices are half-open, so the two partitions are disjoint.
N_TRAIN = 2000

X_train = tf_idf.iloc[:N_TRAIN, :]
X_test = tf_idf.iloc[N_TRAIN:, :]

# Labels are taken by the same row positions so they stay aligned with the
# features (assumes tf_idf and data have the same row order - TODO confirm).
y_train = data['sarcasm'].iloc[:N_TRAIN]
y_test = data['sarcasm'].iloc[N_TRAIN:]

In [38]:
# Sanity-check the first few training labels.
y_train.head()

0    0
1    0
2    0
3    0
4    0
Name: sarcasm, dtype: int64

# Multinomial Naive Bayes

In [39]:
# Fit a multinomial Naive Bayes classifier with default hyper-parameters on the
# training split; the %time magic prints how long the statement took.
%time Mnb = MultinomialNB().fit(X_train, y_train)

Wall time: 3.89 s


In [40]:
# Predict labels for the held-out rows with the multinomial model.
preds = Mnb.predict(X_test)

In [41]:
# Fraction of held-out rows whose predicted label matches the true label.
# NOTE(review): accuracy alone can hide poor minority-class performance if the
# labels are imbalanced; f1_score / precision_score / recall_score are imported
# at the top of the notebook but unused - consider reporting them as well.
accuracy_score(y_test, preds)

0.97838616714697402

# Bernoulli Naive Bayes

In [42]:
%time Bnb = BernoulliNB().fit(X_train, y_train)

Wall time: 466 ms


In [43]:
# Score the Bernoulli model on the held-out rows: predict first, then compare
# against the true labels. The bare final expression is the cell's output.
preds = Bnb.predict(X_test)
accuracy_score(y_true=y_test, y_pred=preds)

0.9610951008645533

# SVM

In [46]:
# Support-vector classifier with library defaults apart from probability=True.
# NOTE(review): probability=True enables predict_proba, but no cell visible here
# calls predict_proba - confirm the option is needed, since it adds to fit time.
svm = SVC(probability=True)

In [47]:
# Train the SVC on the training split; %time prints how long the fit took and
# the fitted estimator's repr is shown as the cell output.
%time svm.fit(X_train, y_train)

Wall time: 14.4 s


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [48]:
# Predict labels for the held-out rows with the SVC (overwrites the earlier `preds`).
preds = svm.predict(X_test)

In [49]:
# Accuracy of the SVC on the held-out rows.
# NOTE(review): the recorded value is identical to the MultinomialNB accuracy
# above, which may mean both models predict the same labels (possibly a single
# class) - verify with a confusion matrix or per-class metrics.
accuracy_score(y_test, preds)

0.97838616714697402