# Collect the Dataset

We use, for example, the dataset present in the article, but if you want to train a model in your dataset, you must use your dataset. Or, if you're going to use the dataset present in the article with another model, you only must change the model

In [1]:
!wget https://raw.githubusercontent.com/adailtonaraujo/app_review_analysis/master/Classification/Dataset/RevisoesSoftware.json

--2021-04-23 14:43:27--  https://raw.githubusercontent.com/adailtonaraujo/app_review_analysis/master/Classification/Dataset/RevisoesSoftware.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4475705 (4.3M) [text/plain]
Saving to: ‘RevisoesSoftware.json’


2021-04-23 14:43:28 (20.8 MB/s) - ‘RevisoesSoftware.json’ saved [4475705/4475705]



In [2]:
import pandas as pd
import json

with open('RevisoesSoftware.json', 'r') as f:
  data = json.load(f)

df_complete = pd.DataFrame(data)

# Bag-of-Words




## Imports

In [3]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords') 
nltk.download('punkt') 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Irrelevant expressions

In [4]:
def list_expressions():
  lista = ['***','****','*****','***great','*bold*','*cough*','*that*','-.-','-^_^-','-_-','-no','...','..now','..quick','.99','.but','.doc','.if','.it','.love','.tri','.when','.wont','////////////////////\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\','///update///','12','===','^-^','^_^','^~^','a++','a+++']
  return lista

## Tokenizer

In [5]:
def tokenize(text):
  lista_exp = list_expressions()
  
  p = re.compile('\d')

  tokens = nltk.word_tokenize(text)

  stems  = []
  for item in tokens:
    auxiliar = 0
    for expressao in lista_exp:
      if item == expressao:
        auxiliar = 1
        break
    if len(item) > 2 and not p.match(item) and auxiliar == 0:  
      stems.append(SnowballStemmer("english").stem(item))
  return stems

## Bag-of-words term weights

In [6]:
stop_words = nltk.corpus.stopwords.words('english') 

dic_tw = {
    'TF' : CountVectorizer(tokenizer=tokenize, stop_words=stop_words, ngram_range=(1,1)),
    'TF-IDF' : TfidfVectorizer(tokenizer=tokenize, stop_words=stop_words, ngram_range=(1,1)),
    'Binary' : CountVectorizer(tokenizer=tokenize, stop_words=stop_words, ngram_range=(1,1), binary=True),
    'TF-Bigram' : CountVectorizer(tokenizer=tokenize, stop_words=stop_words, ngram_range=(1,2)),
    'TFIDF-Bigram' : TfidfVectorizer(tokenizer=tokenize, stop_words=stop_words, ngram_range=(1,2)),
    'Binary-Bigram' : CountVectorizer(tokenizer=tokenize, stop_words=stop_words, ngram_range=(1,2), binary=True)
}

# functions to Train the Model



## import models

In [7]:
from sklearn.svm import OneClassSVM as OCSVM

## Define the algorithm that you will use

In [8]:
clf = OCSVM()

## Train-Test division

First, you must define the train and the test set. *test_size* define the percent of examples of test set, consequently, the train set size is 1 - *test_size*

In [9]:
from sklearn.model_selection import train_test_split

class_interest = 'Rating'

df_train_interest, df_test_interest = train_test_split(df_complete['comment'][df_complete['label'] == class_interest],test_size=0.25, random_state=42)
df_test_outliers = df_complete['comment'][df_complete['label'] != class_interest]

# Execution

## Pre-processing

In [10]:
vectorizer = dic_tw['TF-IDF']

vectorizer.fit(df_train_interest)

x_train = vectorizer.transform(df_train_interest).toarray()

x_test_interest = vectorizer.transform(df_test_interest).toarray()

x_test_outlier = vectorizer.transform(df_test_outliers).toarray()

  'stop_words.' % sorted(inconsistent))


## Train

In [11]:
clf.fit(x_train)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma='scale', kernel='rbf',
            max_iter=-1, nu=0.5, shrinking=True, tol=0.001, verbose=False)

### Saving the model

In [None]:
import pickle

pkl_filename = "pickle_OCSVM_TFIDF.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(clf, file) 

if you want to load the model, use:

with open(pkl_filename, 'rb') as file: \\
    clf = pickle.load(file)

## Test

In [12]:
y_pred_int = clf.predict(x_test_interest)
y_pred_out = clf.predict(x_test_outlier)

In [13]:
from sklearn.metrics import classification_report

def evaluation_one_class(preds_interest, preds_outliers):
  y_true = [1]*len(preds_interest) + [-1]*len(preds_outliers)
  y_pred = list(preds_interest)+list(preds_outliers)
  return classification_report(y_true, y_pred, output_dict=False)

In [14]:
print(evaluation_one_class(y_pred_int, y_pred_out))

              precision    recall  f1-score   support

          -1       0.75      0.65      0.70      1229
           1       0.45      0.56      0.50       616

    accuracy                           0.62      1845
   macro avg       0.60      0.61      0.60      1845
weighted avg       0.65      0.62      0.63      1845



# Case Study

In [15]:
texts = ['the app always crashes !!!!!!!!!!', 'I loved this app!!']

In [20]:
def Classification(text):
  bow_test = vectorizer.transform([text]).toarray()
  resp = clf.predict(bow_test)
  if resp[0] == 1:
    print('The text: "' + text + '" BELONGS to the class of interest!') 
  if resp[0] == -1:
    print('The text: "' + text + '" DOES NOT belong to the class of interest!') 

In [21]:
for text in texts:
  Classification(text)

The text: "the app always crashes !!!!!!!!!!" DOES NOT belong to the class of interest!
The text: "I loved this app!!" BELONGS to the class of interest!
