# Collect the Dataset

We use the dataset from the article as an example. If you want to train a model on your own data, replace it with your dataset. If you want to use the article's dataset with another model, you only need to change the model.

In [None]:
!wget https://raw.githubusercontent.com/adailtonaraujo/app_review_analysis/master/Classification/Dataset/RevisoesSoftware.json

--2021-04-23 13:30:25--  https://raw.githubusercontent.com/adailtonaraujo/app_review_analysis/master/Classification/Dataset/RevisoesSoftware.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4475705 (4.3M) [text/plain]
Saving to: ‘RevisoesSoftware.json’


2021-04-23 13:30:26 (72.3 MB/s) - ‘RevisoesSoftware.json’ saved [4475705/4475705]



In [None]:
import pandas as pd
import json

# Parse the downloaded review dataset. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open('RevisoesSoftware.json', 'r', encoding='utf-8') as f:
  data = json.load(f)

# Tabular view of the parsed JSON; later cells read its 'comment' and 'label' columns.
df_complete = pd.DataFrame(data)

# Bag-of-Words




## Imports

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords') 
nltk.download('punkt') 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Irrelevant expressions

In [None]:
def list_expressions():
  """Return the literal tokens that are treated as irrelevant.

  The entries are decorative punctuation runs, emoticons and similar noise
  strings. A fresh list is built on every call.
  """
  return [
      '***', '****', '*****', '***great', '*bold*', '*cough*', '*that*',
      '-.-', '-^_^-', '-_-', '-no',
      '...', '..now', '..quick', '.99', '.but', '.doc', '.if', '.it',
      '.love', '.tri', '.when', '.wont',
      '////////////////////\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\',
      '///update///',
      '12', '===', '^-^', '^_^', '^~^', 'a++', 'a+++',
  ]

## Tokenizer

In [None]:
def tokenize(text):
  """Split *text* into stemmed tokens for the bag-of-words vectorizers.

  A token is kept only if it is longer than two characters, does not begin
  with a digit and is not one of the strings from `list_expressions()`.
  Every kept token is reduced to its English Snowball stem.

  Args:
    text: the raw text to tokenize.

  Returns:
    A list with the stems of the kept tokens, in their original order.
  """
  # Set membership replaces a linear scan over the expression list per token.
  ignored = frozenset(list_expressions())
  # Raw string: '\d' inside a plain literal is an invalid escape sequence.
  starts_with_digit = re.compile(r'\d').match
  # One stemmer per call instead of a new instance for every token.
  stemmer = SnowballStemmer("english")

  return [
      stemmer.stem(token)
      for token in nltk.word_tokenize(text)
      if len(token) > 2 and not starts_with_digit(token) and token not in ignored
  ]

## Bag-of-words term weights

In [None]:
# English stop words from NLTK, handed to every vectorizer below.
# NOTE(review): these words are not stemmed, whereas `tokenize` stems its
# output, so the two may be inconsistent -- confirm whether that is intended.
stop_words = nltk.corpus.stopwords.words('english')

# Term-weighting schemes keyed by name. Each row of the table gives the
# vectorizer class, the n-gram range and any extra keyword options; all of
# them share the same tokenizer and stop-word list.
dic_tw = {
    name: vectorizer_cls(
        tokenizer=tokenize,
        stop_words=stop_words,
        ngram_range=ngram_range,
        **extra_options,
    )
    for name, vectorizer_cls, ngram_range, extra_options in (
        ('TF', CountVectorizer, (1, 1), {}),
        ('TF-IDF', TfidfVectorizer, (1, 1), {}),
        ('Binary', CountVectorizer, (1, 1), {'binary': True}),
        ('TF-Bigram', CountVectorizer, (1, 2), {}),
        ('TFIDF-Bigram', TfidfVectorizer, (1, 2), {}),
        ('Binary-Bigram', CountVectorizer, (1, 2), {'binary': True}),
    )
}

# Functions to Train the Model



## Import models

In [None]:
from scipy.spatial.distance import cosine
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.svm import SVC as SVM

If you use KNN, it is worth using the cosine metric, which works well for text data.

In [None]:
def cosseno(x,y):
  dist = cosine(x,y)
  if np.isnan(dist):
   return 1
  return dist

## Algorithms Variation

You can change the algorithms' parameters.

In [None]:
# Candidate classifiers keyed by short name; adjust their parameters here.
algs = dict(
    KNN=KNN(metric=cosseno),  # uses the custom cosine distance defined above
    MLP=MLP(),
    NB=NB(),
    MNB=MNB(alpha=0.4, fit_prior=False),
    SVM=SVM(),
)

## Define the algorithm that you will use

In [None]:
# Select the classifier to train; any key of `algs` can be used here.
clf = algs['MNB']

## Train-Test division

First, you must define the train and test sets. *test_size* defines the fraction of examples placed in the test set; consequently, the train set fraction is 1 - *test_size*.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the examples for testing; the fixed seed makes the split repeatable.
df_train, df_test, y_train_class, y_test_class = train_test_split(
    df_complete['comment'],
    df_complete['label'],
    test_size=0.25,
    random_state=42,
)

# Execution

## Pre-processing

In [None]:
# Choose the term-weighting scheme; any key of `dic_tw` can be used here.
vectorizer = dic_tw['TF-IDF']

# Learn the vocabulary and vectorize the training texts in a single pass.
# A separate fit() followed by transform() tokenizes the training set twice.
x_train = vectorizer.fit_transform(df_train).toarray()

# The test texts are only transformed, using the vocabulary learned above.
x_test = vectorizer.transform(df_test).toarray()

  'stop_words.' % sorted(inconsistent))


## Train

In [None]:
# Fit the selected classifier on the vectorized training set.
clf.fit(x_train,y_train_class)

MultinomialNB(alpha=0.4, class_prior=None, fit_prior=False)

### Saving the model

In [None]:
import pickle

# Serialize the trained classifier to disk.
# NOTE(review): only `clf` is written; the fitted `vectorizer` is not stored in this file.
pkl_filename = "pickle_MNB_TFIDF.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(clf, file) 

If you want to load the model later, use:

with open(pkl_filename, 'rb') as file:
    clf = pickle.load(file)

## Test

In [None]:
# Predict a label for every vectorized test example.
y_pred = clf.predict(x_test)

In [None]:
from sklearn.metrics import classification_report

# Print the text report comparing the true test labels with the predictions.
print(classification_report(y_test_class, y_pred, output_dict=False))

                precision    recall  f1-score   support

           Bug       0.40      0.60      0.48       109
       Feature       0.37      0.29      0.33        58
        Rating       0.83      0.67      0.74       612
UserExperience       0.33      0.51      0.40       144

      accuracy                           0.61       923
     macro avg       0.48      0.52      0.49       923
  weighted avg       0.67      0.61      0.63       923



# Case Study

In [None]:
# Example texts to classify with the trained model.
texts = ['the app always crashes !!!!!!!!!!', 'I loved this app!!']

In [None]:
def Classification(text):
  """Print the class predicted for *text*.

  The text is vectorized with the global `vectorizer` and classified with the
  global `clf`; the predicted label is printed in upper case. Nothing is
  returned.
  """
  features = vectorizer.transform([text]).toarray()
  label = str(clf.predict(features)[0]).upper()
  print(f'The text: "{text}" belongs to the {label} class')

In [None]:
# Classify each example text and print the predicted class.
for text in texts:
  Classification(text)

The text: "the app always crashes !!!!!!!!!!" belongs to the BUG class
The text: "I loved this app!!" belongs to the RATING class
