# Binary Classifier
`sklearn GaussianNB`

In [47]:
# import classifier modules
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer  

# import other modules
import pandas as pd
import json
from pandas.io.json import json_normalize

In [48]:
with open('data/REJ_data/all.json', encoding='utf-8-sig') as json_file:
    train = json.load(json_file)
    
ALL = pd.DataFrame.from_dict(json_normalize(train), orient='columns')

In [49]:
reviewed = pd.read_csv('data/labelled.csv')

In [50]:
df = pd.read_csv('data/dataset3_with_nlp_techniques.csv')

### 1. TF-IDF 

In [51]:
# make bow of test data
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer = TfidfVectorizer(use_idf = True)

x = vectorizer.fit_transform(ALL.comment).toarray()
y = ALL.label

# train the classifier
g = GaussianNB()  
g = g.fit(x, y)  


# predict on test data
def binary_class(x):
    p = vectorizer.transform([x]).toarray()
    return g.predict(p)
    

binary_pred = pd.DataFrame({'review' : df.processed_text, 'label' : reviewed.label})
binary_pred['tf-idf'] = df.processed_text.map(lambda x: binary_class(x))


### 2. TF-IDF - stop words

In [52]:
# train on stop words removal
x = vectorizer.fit_transform(ALL.stopwords_removal).toarray()
y = ALL.label

# train the classifier
g = GaussianNB()  
g = g.fit(x, y)  

def binary_class2(x):
    p = vectorizer.transform(x).toarray()
    return g.predict(p)

# make our stopwords col full sentence
df.stopword = df.stopword.map(lambda x: ' '.join(x))

# predict
binary_pred['tf-idf-stopwords'] = df.stopword.map(lambda x: binary_class(x))

### 3. TF-IDF + Lemmatization

In [53]:
# train on lemmatized removal
x = vectorizer.fit_transform(ALL.lemmatized_comment).toarray()
y = ALL.label

# train the classifier
g = GaussianNB()  
g = g.fit(x, y)  

# make our lemmatizer col full sentence
df.lemmatizer = df.lemmatizer.map(lambda x: ' '.join(x))

# predict
binary_pred['tf-idf+lemmatizer'] = df.lemmatizer.map(lambda x: binary_class(x))

### 4. Bigrams

In [54]:
# train on lemmatized removal
x = vectorizer.fit_transform(ALL.lemmatized_comment).toarray()
y = ALL.label

# train the classifier
g = GaussianNB()  
g = g.fit(x, y)  

# make our lemmatizer col full sentence
df.lemmatizer = df.lemmatizer.map(lambda x: ' '.join(x))

# predict
binary_pred['tf-idf+lemmatizer'] = df.lemmatizer.map(lambda x: binary_class(x))

In [55]:
binary_pred['tf-idf'] = binary_pred['tf-idf'].apply(''.join)
binary_pred['tf-idf+lemmatizer'] = binary_pred['tf-idf+lemmatizer'].apply(''.join)
binary_pred['tf-idf-stopwords'] = binary_pred['tf-idf-stopwords'].apply(''.join)
binary_pred

Unnamed: 0,review,label,tf-idf,tf-idf-stopwords,tf-idf+lemmatizer
0,A great game to be during times Nice,Rating,Feature,Feature,Feature
1,I truly love pandora,UserExperience,Feature,Feature,Feature
2,I need all the help I need 5ft and I need to l...,UserExperience,UserExperience,Feature,Feature
3,it s not great but it s good,Rating,Feature,Feature,Feature
4,I think this is cool this is learning and and ...,UserExperience,UserExperience,Feature,Feature
...,...,...,...,...,...
379,Really good for,Rating,Feature,Feature,Feature
380,It is my first time to use this see who it is,UserExperience,Feature,Feature,Feature
381,I love it,Rating,Feature,Feature,Feature
382,fun fact it s fun,Rating,Feature,Feature,Feature


### metrics

In [68]:
from sklearn.metrics import *
import csv


y_true = binary_pred.label
y_pred = binary_pred['tf-idf']

# tf-idf
f = precision_recall_fscore_support(y_true, y_pred, average = "macro")

# tf-idf-stopwords
f2 = precision_recall_fscore_support(y_true, binary_pred['tf-idf-stopwords'], average = "macro")


# tf-idf+lemma
f3 = precision_recall_fscore_support(y_true, binary_pred['tf-idf+lemmatizer'], average = "macro")




In [62]:

pd.DataFrame({'technique' : ['tf-idf', 'tf-idf-stopwords', 'tf-idf+lemmatization'], 
              'metric (prec, recall, f)' : [f, f2, f3]})

Unnamed: 0,technique,"metric (prec, recall, f)"
0,tf-idf,"(0.30156037991858886, 0.2892486802413273, 0.24..."
1,tf-idf-stopwords,"(0.01953125, 0.25, 0.036231884057971016, None)"
2,tf-idf+lemmatization,"(0.01953125, 0.25, 0.036231884057971016, None)"


In [69]:
f

(0.30156037991858886, 0.2892486802413273, 0.24562124162949572, None)

In [12]:
binary_pred.groupby('label').count()['review']

label
Bug                48
Feature            30
Rating            221
UserExperience     85
Name: review, dtype: int64

In [13]:
binary_pred.groupby('tf-idf').count()['review']

tf-idf
Bug                55
Feature           201
Rating             66
UserExperience     62
Name: review, dtype: int64

In [14]:
binary_pred.groupby('tf-idf-stopwords').count()['review']

tf-idf-stopwords
Feature    384
Name: review, dtype: int64

In [15]:
binary_pred.groupby('tf-idf+lemmatizer').count()['review']

tf-idf+lemmatizer
Feature    384
Name: review, dtype: int64

---

# Repeat for Bug 

In [18]:
with open('data/REJ_data/Bug_tt.json', encoding='utf-8-sig') as json_file:
    train = json.load(json_file)
    
BUG = pd.DataFrame.from_dict(json_normalize(train), orient='columns')

In [31]:
# make bow of test data
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer = TfidfVectorizer(use_idf = True)

x = vectorizer.fit_transform(BUG.comment).toarray()
y = BUG.label

# train the classifier
g = GaussianNB()  
g = g.fit(x, y)  


# predict on test data
def binary_class(x):
    p = vectorizer.transform([x]).toarray()
    return g.predict(p)
    

bug_pred = pd.DataFrame({'review' : df.processed_text, 'label' : reviewed.label})
bug_pred['tf-idf'] = df.processed_text.map(lambda x: binary_class(x))



In [32]:
# train on stop words removal
x = vectorizer.fit_transform(BUG.stopwords_removal).toarray()
y = BUG.label

# train the classifier
g = GaussianNB()  
g = g.fit(x, y)  

def binary_class2(x):
    p = vectorizer.transform(x).toarray()
    return g.predict(p)

# make our stopwords col full sentence
df.stopword = df.stopword.map(lambda x: ' '.join(x))

# predict
bug_pred['tf-idf-stopwords'] = df.stopword.map(lambda x: binary_class(x))

In [33]:
# train on lemmatized removal
x = vectorizer.fit_transform(BUG.lemmatized_comment).toarray()
y = BUG.label

# train the classifier
g = GaussianNB()  
g = g.fit(x, y)  

# make our lemmatizer col full sentence
df.lemmatizer = df.lemmatizer.map(lambda x: ' '.join(x))

# predict
bug_pred['tf-idf+lemmatizer'] = df.lemmatizer.map(lambda x: binary_class(x))

In [34]:
# train on lemmatized removal
x = vectorizer.fit_transform(BUG.lemmatized_comment).toarray()
y = BUG.label

# train the classifier
g = GaussianNB()  
g = g.fit(x, y)  

# make our lemmatizer col full sentence
df.lemmatizer = df.lemmatizer.map(lambda x: ' '.join(x))

# predict
bug_pred['tf-idf+lemmatizer'] = df.lemmatizer.map(lambda x: binary_class(x))

In [35]:
bug_pred['tf-idf'] = bug_pred['tf-idf'].apply(''.join)
bug_pred['tf-idf+lemmatizer'] = bug_pred['tf-idf+lemmatizer'].apply(''.join)
bug_pred['tf-idf-stopwords'] = bug_pred['tf-idf-stopwords'].apply(''.join)
bug_pred

Unnamed: 0,review,label,tf-idf,tf-idf-stopwords,tf-idf+lemmatizer
0,A great game to be during times Nice,Rating,Not_Bug,Not_Bug,Not_Bug
1,I truly love pandora,UserExperience,Not_Bug,Not_Bug,Not_Bug
2,I need all the help I need 5ft and I need to l...,UserExperience,Bug,Not_Bug,Not_Bug
3,it s not great but it s good,Rating,Not_Bug,Not_Bug,Not_Bug
4,I think this is cool this is learning and and ...,UserExperience,Not_Bug,Not_Bug,Not_Bug
...,...,...,...,...,...
379,Really good for,Rating,Not_Bug,Not_Bug,Not_Bug
380,It is my first time to use this see who it is,UserExperience,Not_Bug,Not_Bug,Not_Bug
381,I love it,Rating,Not_Bug,Not_Bug,Not_Bug
382,fun fact it s fun,Rating,Not_Bug,Not_Bug,Not_Bug


In [66]:

y_true = bug_pred.label
y_pred = bug_pred['tf-idf']


# tf-idf
f = precision_recall_fscore_support(y_true, y_pred, average = "macro")

# tf-idf-stopwords
f2 = precision_recall_fscore_support(y_true, binary_pred['tf-idf-stopwords'], average = "macro")


# tf-idf+lemma
f3 = precision_recall_fscore_support(y_true, binary_pred['tf-idf+lemmatizer'], average = "macro")




pd.DataFrame({'technique' : ['tf-idf', 'tf-idf-stopwords', 'tf-idf+lemmatization'], 
              'metric (prec, recall, f)' : [f, f2, f3]})

Unnamed: 0,technique,"metric (prec, recall, f)"
0,tf-idf,"(0.04, 0.10416666666666667, 0.0578034682080924..."
1,tf-idf-stopwords,"(0.01953125, 0.25, 0.036231884057971016, None)"
2,tf-idf+lemmatization,"(0.01953125, 0.25, 0.036231884057971016, None)"


---

# Repeat for Feature

In [37]:
with open('data/REJ_data/Feature_tt.json', encoding='utf-8-sig') as json_file:
    train = json.load(json_file)
    
FEAT = pd.DataFrame.from_dict(json_normalize(train), orient='columns')

In [38]:
# make bow of test data
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer = TfidfVectorizer(use_idf = True)

x = vectorizer.fit_transform(FEAT.comment).toarray()
y = FEAT.label

# train the classifier
g = GaussianNB()  
g = g.fit(x, y)  


# predict on test data
def binary_class(x):
    p = vectorizer.transform([x]).toarray()
    return g.predict(p)
    

feat_pred = pd.DataFrame({'review' : df.processed_text, 'label' : reviewed.label})
feat_pred['tf-idf'] = df.processed_text.map(lambda x: binary_class(x))



In [39]:
# train on stop words removal
x = vectorizer.fit_transform(FEAT.stopwords_removal).toarray()
y = FEAT.label

# train the classifier
g = GaussianNB()  
g = g.fit(x, y)  

def binary_class2(x):
    p = vectorizer.transform(x).toarray()
    return g.predict(p)

# make our stopwords col full sentence
df.stopword = df.stopword.map(lambda x: ' '.join(x))

# predict
feat_pred['tf-idf-stopwords'] = df.stopword.map(lambda x: binary_class(x))

In [40]:
# train on lemmatized removal
x = vectorizer.fit_transform(FEAT.lemmatized_comment).toarray()
y = FEAT.label

# train the classifier
g = GaussianNB()  
g = g.fit(x, y)  

# make our lemmatizer col full sentence
df.lemmatizer = df.lemmatizer.map(lambda x: ' '.join(x))

# predict
feat_pred['tf-idf+lemmatizer'] = df.lemmatizer.map(lambda x: binary_class(x))

In [34]:
# train on lemmatized removal
x = vectorizer.fit_transform(FEAT.lemmatized_comment).toarray()
y = FEAT.label

# train the classifier
g = GaussianNB()  
g = g.fit(x, y)  

# make our lemmatizer col full sentence
df.lemmatizer = df.lemmatizer.map(lambda x: ' '.join(x))

# predict
feat_pred['tf-idf+lemmatizer'] = df.lemmatizer.map(lambda x: binary_class(x))

In [41]:
feat_pred['tf-idf'] = feat_pred['tf-idf'].apply(''.join)
feat_pred['tf-idf+lemmatizer'] = feat_pred['tf-idf+lemmatizer'].apply(''.join)
feat_pred['tf-idf-stopwords'] = feat_pred['tf-idf-stopwords'].apply(''.join)
feat_pred

Unnamed: 0,review,label,tf-idf,tf-idf-stopwords,tf-idf+lemmatizer
0,A great game to be during times Nice,Rating,Feature,Not_Feature,Not_Feature
1,I truly love pandora,UserExperience,Feature,Not_Feature,Not_Feature
2,I need all the help I need 5ft and I need to l...,UserExperience,Not_Feature,Not_Feature,Not_Feature
3,it s not great but it s good,Rating,Not_Feature,Not_Feature,Not_Feature
4,I think this is cool this is learning and and ...,UserExperience,Feature,Not_Feature,Not_Feature
...,...,...,...,...,...
379,Really good for,Rating,Not_Feature,Not_Feature,Not_Feature
380,It is my first time to use this see who it is,UserExperience,Feature,Not_Feature,Not_Feature
381,I love it,Rating,Not_Feature,Not_Feature,Not_Feature
382,fun fact it s fun,Rating,Not_Feature,Not_Feature,Not_Feature


In [64]:

y_true = feat_pred.label
y_pred = feat_pred['tf-idf']



# tf-idf
f = precision_recall_fscore_support(y_true, y_pred, average = "macro")

# tf-idf-stopwords
f2 = precision_recall_fscore_support(y_true, binary_pred['tf-idf-stopwords'], average = "macro")


# tf-idf+lemma
f3 = precision_recall_fscore_support(y_true, binary_pred['tf-idf+lemmatizer'], average = "macro")




pd.DataFrame({'technique' : ['tf-idf', 'tf-idf-stopwords', 'tf-idf+lemmatization'], 
              'metric (prec, recall, f)' : [f, f2, f3]})

Unnamed: 0,technique,"metric (prec, recall, f)"
0,tf-idf,"(0.02857142857142857, 0.13999999999999999, 0.0..."
1,tf-idf-stopwords,"(0.01953125, 0.25, 0.036231884057971016, None)"
2,tf-idf+lemmatization,"(0.01953125, 0.25, 0.036231884057971016, None)"
