# Collect the Dataset

As an example, we use the dataset from the article. If you want to train a model on your own dataset, load your dataset here instead. If you want to use the article's dataset with a different model, you only need to change the model.

In [1]:
!wget https://raw.githubusercontent.com/adailtonaraujo/app_review_analysis/master/Sentiment/Datasets/Dataset_Sentiment_BoW.csv

--2021-05-08 00:39:14--  https://raw.githubusercontent.com/adailtonaraujo/app_review_analysis/master/Sentiment/Datasets/Dataset_Sentiment_BoW.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 180750 (177K) [text/plain]
Saving to: ‘Dataset_Sentiment_BoW.csv’


2021-05-08 00:39:14 (16.0 MB/s) - ‘Dataset_Sentiment_BoW.csv’ saved [180750/180750]



In [2]:
import pandas as pd

# Load the labelled app-review dataset downloaded in the previous cell.
df_complete = pd.read_csv("Dataset_Sentiment_BoW.csv")

# Preview only the rows whose sentiment label is 'Negative'.
df_complete.loc[df_complete['class'] == 'Negative']

Unnamed: 0,text,class,feature,appName
3,Too many ads and secondly erratic interface,Negative,interface,PhotoEditor
24,There are so many ads popping up that the app ...,Negative,ad pops up,PhotoEditor
32,"Horrible editor, worst app in this entire store",Negative,editor,PhotoEditor
38,I gave this app a 5 stars cause rate me pls al...,Negative,rate me pls always pops out,PhotoEditor
50,Have to download everything and adverts are an...,Negative,adverts,PhotoEditor
...,...,...,...,...
1391,It doesnt let you pick the song u want to list...,Negative,shuffle,Spotify
1413,Only problem I have with the app is the inabil...,Negative,download music to an SD card,Spotify
1420,"Cmon everythings on shuffle, random music pops...",Negative,shuffle,Spotify
1421,"Cmon everythings on shuffle, random music pops...",Negative,random music pops up,Spotify


In [None]:
# Distinct app names found in the 'appName' column.
# The original chained assignment also bound `classes` to this very same list
# object; that accidental alias is dropped because `classes` is meant to hold
# the sentiment labels, which are computed from the 'class' column in a later
# cell.
appList = list(df_complete['appName'].unique())
appList

['PhotoEditor',
 'Evernote',
 'eBay',
 'WhatsApp',
 'Netflix',
 'Twitter',
 'Facebook',
 'Spotify']

In [None]:
# Distinct sentiment labels found in the 'class' column.
classes = df_complete['class'].unique().tolist()
classes

['Neutral', 'Negative', 'Positive']

# Bag-of-Words




## Imports

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords') 
nltk.download('punkt') 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Tokenizer

In [None]:
# English stop-word list from NLTK (kept as a list: it is passed to the
# vectorizers' `stop_words` argument further down).
stop_words = nltk.corpus.stopwords.words('english')

# Built once instead of on every call / every token.
_STARTS_WITH_DIGIT = re.compile(r'\d')  # raw string avoids the invalid-escape warning
_STEMMER = SnowballStemmer("english")


def tokenize(text):
  """Split *text* into stemmed tokens.

  The text is tokenized with ``nltk.word_tokenize``. Tokens of two characters
  or fewer and tokens that start with a digit are dropped; every remaining
  token is replaced by its English Snowball stem. Stop words are NOT removed
  here.

  Args:
    text: The string to tokenize.

  Returns:
    A list with the stem of each kept token, in order of appearance.
  """
  return [
      _STEMMER.stem(token)
      for token in nltk.word_tokenize(text)
      if len(token) > 2 and not _STARTS_WITH_DIGIT.match(token)
  ]

# Functions to Train the Model



## Import models

In [None]:
from scipy.spatial.distance import cosine
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.svm import SVC as SVM

If you use KNN, it is worth using the cosine distance as the metric, since it works well for text data.

In [None]:
import numpy as np  # needed for np.isnan below; no other cell imports numpy


def cosseno(x,y):
  """Cosine distance between two vectors, with NaN results mapped to 1.

  Intended to be passed as the ``metric`` callable of the KNN classifier.

  Args:
    x: First vector (1-D array-like).
    y: Second vector (1-D array-like).

  Returns:
    ``scipy.spatial.distance.cosine(x, y)``, or ``1.0`` when that result is
    NaN.
  """
  dist = cosine(x, y)
  # A NaN distance is undefined (presumably produced by an all-zero vector);
  # fall back to 1, the distance between orthogonal vectors.
  if np.isnan(dist):
    return 1.0
  return dist

## Algorithms Variation

You can change the algorithms' parameters here.

In [None]:
# Candidate classifiers, keyed by the short name used to pick one later.
# Every entry is a freshly constructed, unfitted estimator.
algs = dict(
    KNN=KNN(metric=cosseno),  # nearest neighbours with the custom cosine distance
    MLP=MLP(),
    NB=NB(),
    MNB=MNB(alpha=0.4, fit_prior=False),
    SVM=SVM(),
)

# Pre-Processing

In [None]:
# TF-IDF matrix.
# The vectorizer gets its own name so it is not shadowed when `tfidf` is bound
# to the dense weight matrix below.
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words=stop_words, ngram_range=(1,1))
Mtfidf = tfidf_vectorizer.fit_transform(df_complete['text'])
vocab_apps = tfidf_vectorizer.vocabulary_  # maps each term to its column in the matrix
tfidf = Mtfidf.toarray()  # dense TF-IDF term-weight matrix, one row per review

# Boost the weight of the tokens that are aspects in the TF-IDF rows.
# Strategy: aspect-token weight = original TF-IDF + max(TF-IDF of the document's tokens)
stop_word_set = set(stop_words)  # constant-time membership tests

# The third column (position 2) holds the aspect text of each review. It is read
# positionally with .iloc, and rows of `tfidf` are addressed by enumeration
# order rather than by DataFrame index label, so the loop does not depend on
# the index being 0..n-1.
for position, aspect_text in enumerate(df_complete.iloc[:, 2]):
  aspect_tokens = [w for w in tokenize(aspect_text) if w not in stop_word_set]
  # Taken once, before any boost is applied to this row.
  max_document = tfidf[position].max()
  for token in aspect_tokens:
    pos_feature = vocab_apps.get(token)  # column of the aspect token in the BoW
    if pos_feature is None:
      # Aspect token absent from the fitted vocabulary (the original code
      # special-cased the single known instance, 'phptp'); nothing to boost.
      continue
    tfidf[position, pos_feature] += max_document
tfidf.shape

  'stop_words.' % sorted(inconsistent))


(1429, 1462)

In [None]:
# Group the data by app. For every app, the test set is that app's reviews and
# the training set is the reviews of all the other apps.
train_label_apps = {app: [] for app in appList}
test_label_apps = {app: [] for app in appList}

train_tfidf_apps = {app: [] for app in appList}
test_tfidf_apps = {app: [] for app in appList}

# Columns are read once, by name, instead of calling iterrows() for every app
# and indexing each row Series by integer position (deprecated in pandas).
row_labels = df_complete['class'].tolist()
row_apps = df_complete['appName'].tolist()

for app in appList:
  # `position` is the row number in `tfidf`, independent of the DataFrame index.
  for position, (label, row_app) in enumerate(zip(row_labels, row_apps)):
    if row_app == app:  # Test
      test_tfidf_apps[app].append(tfidf[position])
      test_label_apps[app].append(label)
    else:  # Train (every app except the tested one)
      train_tfidf_apps[app].append(tfidf[position])
      train_label_apps[app].append(label)

## Define the algorithm that you will use and the app test

In [None]:
# App whose reviews are held out as the test set.
appTest = 'Spotify'
# Classifier to train, picked by its key in `algs`.
clf = algs['MLP']

## Train-Test division

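First, you must define the training set and the test set. In this notebook the split is made per app: the test set contains the reviews of the app chosen in `appTest`, and the training set contains the reviews of all the other apps. (If you use a random split instead, *test_size* defines the fraction of examples in the test set; consequently, the training set fraction is 1 - *test_size*.)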

In [None]:
# Training split: feature rows and labels of every app except `appTest`.
x_train, y_train_class = train_tfidf_apps[appTest], train_label_apps[appTest]

# Test split: feature rows and labels of `appTest` only.
x_test, y_test_class = test_tfidf_apps[appTest], test_label_apps[appTest]


# Execution

## Train

In [None]:
# Fit the selected classifier on the training split.
clf.fit(x_train,y_train_class)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

### Saving the model

In [None]:
import pickle

# Serialize the trained classifier to disk so it can be reloaded later.
pkl_filename = "pickle_MLP_Spotify.pkl"
with open(pkl_filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

If you want to load the model later, use the code below (only unpickle files you trust):

```python
with open(pkl_filename, 'rb') as file:
    clf = pickle.load(file)
```

## Test

In [None]:
# Predict the sentiment class of every review in the test split.
y_pred = clf.predict(x_test)

In [None]:
from sklearn.metrics import classification_report

# Text report (per-class precision / recall / f1 / support) for the test split.
report = classification_report(y_test_class, y_pred, output_dict=False)
print(report)

              precision    recall  f1-score   support

    Negative       0.36      0.20      0.26        25
     Neutral       0.79      0.78      0.78       120
    Positive       0.49      0.71      0.58        31

    accuracy                           0.68       176
   macro avg       0.55      0.56      0.54       176
weighted avg       0.68      0.68      0.67       176



# Case Study

In [None]:
# Example reviews to classify with the trained model.
texts = [
    'Horrible editor, worst app in this entire store',
    'Too many ads and secondly erratic interface',
    'I loved this app!!',
]

In [None]:
# TF-IDF vectorizer fitted on the full review corpus; used to turn new texts
# into feature rows for the classifier.
vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words=stop_words, ngram_range=(1,1))
vectorizer.fit(df_complete['text'])


def Classification(text):
  """Print the class that `clf` predicts for a single review *text*.

  Args:
    text: The review to classify, as a string.
  """
  features = vectorizer.transform([text]).toarray()
  predicted = clf.predict(features)[0]
  label = str(predicted).upper()
  print('The text: "' + text + '" belongs to the ' + label + ' class')

  'stop_words.' % sorted(inconsistent))


In [None]:
# Classify each example review and print the predicted class.
for text in texts:
  Classification(text)

The text: "Horrible editor, worst app in this entire store" belongs to the NEGATIVE class
The text: "Too many ads and secondly erratic interface" belongs to the NEUTRAL class
The text: "I loved this app!!" belongs to the POSITIVE class
