# Count Vectorizer & TF-IDF using Multinomial Naive Bayes 

In [19]:
import os
import glob

import pandas as pd
import numpy as np
import seaborn as sns

from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, precision_score, recall_score

%matplotlib inline

In [6]:
data_path = 'data'

files = glob.glob(os.path.join(data_path, '*', '*'))

files

In [10]:
d_train_a = pd.read_csv(files[1])

d_train_b = pd.read_csv(files[-2])

In [14]:
d_train_a.head()

Unnamed: 0,RES_ID,RESPONSE,LABEL
0,TRA1,intetraksi/beradaptasi terhadap lingkungan yan...,1
1,TRA2,seperti jatuhnya meteor tsunami gempa bumi,0
2,TRA3,hanya tuhan yang tahu tantangan nya itu apaan,0
3,TRA4,mereka akan sulit beradaptasi,1
4,TRA5,"Tempat tinggal, ekonomi, dan pekerjaan",1


In [13]:
d_train_b.head()

Unnamed: 0,RES_ID,RESPONSE,LABEL
0,TRB1,Karena orang berpikir bahwa jika disumbangkan ...,1
1,TRB2,Pakaian awet adalah alternatif terhadap Fast F...,0
2,TRB3,karna orang lebih suka menyumbang,1
3,TRB4,karana harga nya terjangkau dan pas.,0
4,TRB5,kerena harganya mahal .,0


In [18]:
d_train = d_train_a.append(d_train_b)

d_train.reset_index(drop = True, inplace = True)

## cleansing

In [21]:
def cleansing(sentence):
    sentence = sentence.lower()
    word_list = word_tokenize(sentence)
    word_list = [word for word in word_list if len(word) > 2]
    sentence = " ".join(word_list)
    
    return sentence

In [22]:
d_train['response_cleansing'] = d_train.RESPONSE.apply(cleansing)

## feature extraction

In [24]:
cv = CountVectorizer()
tfidf = TfidfVectorizer()

In [26]:
X_cv = cv.fit_transform(d_train.response_cleansing)
X_tfidf = tfidf.fit_transform(d_train.response_cleansing)

## modeling

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_cv, d_train.LABEL, test_size = 0.2, shuffle = True, random_state = 123)

In [34]:
mnb = MultinomialNB()

mnb.fit(X_train, y_train)

res_cv = mnb.predict(X_test)

In [39]:
print("f1 score", f1_score(y_test, res_cv))
print("precision", precision_score(y_test, res_cv))
print("recall", recall_score(y_test, res_cv))

f1 score 0.7898089171974523
precision 0.7469879518072289
recall 0.8378378378378378


In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, d_train.LABEL, test_size = 0.2, shuffle = True, random_state = 123)

In [41]:
mnb = MultinomialNB()

mnb.fit(X_train, y_train)

res_cv = mnb.predict(X_test)

In [42]:
print("f1 score", f1_score(y_test, res_cv))
print("precision", precision_score(y_test, res_cv))
print("recall", recall_score(y_test, res_cv))

f1 score 0.7865168539325843
precision 0.6730769230769231
recall 0.9459459459459459


In [55]:
def evaluation(y_true, y_pred):
    f1score = f1_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    
    return {'f1score': f1score, 'precision': prec, 'recall': recall}

In [56]:
kf = KFold(n_splits=5)

In [57]:
score = []
for train, test in kf.split(X_cv, d_train.LABEL):
    X_train, y_train = X_cv[train], d_train.loc[train, 'LABEL']
    X_test, y_test = X_cv[test], d_train.loc[test, 'LABEL']
    
    print(X_test.shape, y_test.shape)
    
    mnb = MultinomialNB()
    mnb.fit(X_train, y_train)
    res = mnb.predict(X_test)
    
    score.append(evaluation(y_test, res))
    
    print("success")

(115, 1117) (115,)
success
(115, 1117) (115,)
success
(115, 1117) (115,)
success
(114, 1117) (114,)
success
(114, 1117) (114,)
success


In [58]:
pd.DataFrame(score)

Unnamed: 0,f1score,precision,recall
0,0.83908,0.776596,0.9125
1,0.849462,0.79,0.918605
2,0.839161,0.821918,0.857143
3,0.661157,0.615385,0.714286
4,0.677419,0.736842,0.626866


In [59]:
score = []
for train, test in kf.split(X_tfidf, d_train.LABEL):
    X_train, y_train = X_tfidf[train], d_train.loc[train, 'LABEL']
    X_test, y_test = X_tfidf[test], d_train.loc[test, 'LABEL']
    
    print(X_test.shape, y_test.shape)
    
    mnb = MultinomialNB()
    mnb.fit(X_train, y_train)
    res = mnb.predict(X_test)
    
    score.append(evaluation(y_test, res))
    
    print("success")

(115, 1117) (115,)
success
(115, 1117) (115,)
success
(115, 1117) (115,)
success
(114, 1117) (114,)
success
(114, 1117) (114,)
success


In [60]:
pd.DataFrame(score)

Unnamed: 0,f1score,precision,recall
0,0.829016,0.707965,1.0
1,0.851282,0.761468,0.965116
2,0.767442,0.647059,0.942857
3,0.66242,0.514851,0.928571
4,0.765432,0.652632,0.925373
