# Binary Classifier
`sklearn GaussianNB`

In [1]:
# import classifier modules
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer  

# import other modules
import pandas as pd
import json
from pandas.io.json import json_normalize

In [2]:
with open('REJ_data/all.json', encoding='utf-8-sig') as json_file:
    train = json.load(json_file)
    
ALL = pd.DataFrame.from_dict(json_normalize(train), orient='columns')

In [19]:
reviewed = pd.read_csv('labelled.csv')

In [4]:
df = pd.read_csv('dataset3_with_nlp_techniques.csv')

### 1. TF-IDF 

In [20]:
# make bow of test data
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer = TfidfVectorizer(use_idf = True)  

x = vectorizer.fit_transform(ALL.comment).toarray()
y = ALL.label

# train the classifier
g = GaussianNB()  
g = g.fit(x, y)  


# predict on test data
def binary_class(x):
    p = vectorizer.transform([x]).toarray()
    return g.predict(p)
    

binary_pred = pd.DataFrame({'review' : df.processed_text, 'label' : reviewed.label})
binary_pred['tf-idf'] = df.processed_text.map(lambda x: binary_class(x))


### 2. TF-IDF - stop words

In [21]:
# train on stop words removal
x = vectorizer.fit_transform(ALL.stopwords_removal).toarray()
y = ALL.label

# train the classifier
g = GaussianNB()  
g = g.fit(x, y)  

def binary_class2(x):
    p = vectorizer.transform(x).toarray()
    return g.predict(p)

# make our stopwords col full sentence
df.stopword = df.stopword.map(lambda x: ' '.join(x))

# predict
binary_pred['tf-idf-stopwords'] = df.stopword.map(lambda x: binary_class(x))

### 3. TF-IDF + Lemmatization

In [22]:
# train on lemmatized removal
x = vectorizer.fit_transform(ALL.lemmatized_comment).toarray()
y = ALL.label

# train the classifier
g = GaussianNB()  
g = g.fit(x, y)  

# make our lemmatizer col full sentence
df.lemmatizer = df.lemmatizer.map(lambda x: ' '.join(x))

# predict
binary_pred['tf-idf+lemmatizer'] = df.lemmatizer.map(lambda x: binary_class(x))

### 4. Bigrams

In [23]:
# train on lemmatized removal
x = vectorizer.fit_transform(ALL.lemmatized_comment).toarray()
y = ALL.label

# train the classifier
g = GaussianNB()  
g = g.fit(x, y)  

# make our lemmatizer col full sentence
df.lemmatizer = df.lemmatizer.map(lambda x: ' '.join(x))

# predict
binary_pred['tf-idf+lemmatizer'] = df.lemmatizer.map(lambda x: binary_class(x))

In [24]:
binary_pred['tf-idf'] = binary_pred['tf-idf'].apply(''.join)
binary_pred['tf-idf+lemmatizer'] = binary_pred['tf-idf+lemmatizer'].apply(''.join)
binary_pred['tf-idf-stopwords'] = binary_pred['tf-idf-stopwords'].apply(''.join)
binary_pred

Unnamed: 0,review,label,tf-idf,tf-idf-stopwords,tf-idf+lemmatizer
0,A great game to be during times Nice,Rating,Feature,Feature,Feature
1,I truly love pandora,UserExperience,Feature,Feature,Feature
2,I need all the help I need 5ft and I need to l...,UserExperience,UserExperience,Feature,Feature
3,it s not great but it s good,Rating,Feature,Feature,Feature
4,I think this is cool this is learning and and ...,UserExperience,UserExperience,Feature,Feature
...,...,...,...,...,...
379,Really good for,Rating,Feature,Feature,Feature
380,It is my first time to use this see who it is,UserExperience,Feature,Feature,Feature
381,I love it,Rating,Feature,Feature,Feature
382,fun fact it s fun,Rating,Feature,Feature,Feature


### metrics

In [14]:
from sklearn.metrics import *
import csv


y_true = binary_pred.label
y_pred = binary_pred['tf-idf']

# tf-idf
prec = precision_score(y_true, y_pred, average = "micro")
recall = recall_score(y_true, y_pred, average = "micro")
f = precision_recall_fscore_support(y_true, y_pred, average = "micro")

# tf-idf-stopwords
prec2 = precision_score(y_true, binary_pred['tf-idf-stopwords'], average = "micro")
recall2 = recall_score(y_true, binary_pred['tf-idf-stopwords'], average = "micro")
f2 = precision_recall_fscore_support(y_true, binary_pred['tf-idf-stopwords'], average = "micro")


# tf-idf+lemma
prec3 = precision_score(y_true, binary_pred['tf-idf+lemmatizer'], average = "micro")
recall3 = recall_score(y_true, binary_pred['tf-idf+lemmatizer'], average = "micro")
f3 = precision_recall_fscore_support(y_true, binary_pred['tf-idf+lemmatizer'], average = 'micro')


In [15]:

pd.DataFrame({'technique' : ['tf-idf', 'tf-idf-stopwords', 'tf-idf+lemmatization'], 
              'precision' : [prec, prec2, prec3],
            'recall' : [recall, recall2, recall3],
             'f1_score' : [f, f2, f3]})

Unnamed: 0,technique,precision,recall,f1_score
0,tf-idf,0.210938,0.210938,"(0.2109375, 0.2109375, 0.2109375, None)"
1,tf-idf-stopwords,0.078125,0.078125,"(0.078125, 0.078125, 0.078125, None)"
2,tf-idf+lemmatization,0.078125,0.078125,"(0.078125, 0.078125, 0.078125, None)"


In [29]:
binary_pred.groupby('label').count()['review']

label
Bug                48
Feature            30
Rating            221
UserExperience     85
Name: review, dtype: int64

In [30]:
binary_pred.groupby('tf-idf').count()['review']

tf-idf
Bug                55
Feature           201
Rating             66
UserExperience     62
Name: review, dtype: int64

In [31]:
binary_pred.groupby('tf-idf-stopwords').count()['review']

tf-idf-stopwords
Feature    384
Name: review, dtype: int64

In [33]:
binary_pred.groupby('tf-idf+lemmatizer').count()['review']

tf-idf+lemmatizer
Feature    384
Name: review, dtype: int64