# Predicting the IMDb rating of Arrow (American TV Serie) through scripts
## From the site: https://subslikescript.com/ 
## by: Anna Luiza Gomes

In [6]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV # para o grid search
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler # para padronização
from sklearn.preprocessing import MinMaxScaler # para normalização
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline # para o pipeline

In [7]:
# configuração para não exibir os warnings

import warnings
warnings.filterwarnings("ignore")

# ignorando future warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [14]:
dataset = pd.read_csv('Data/dataset.csv', sep=";")

In [16]:
dataset.drop('Unnamed: 0', axis=1, inplace=True)

In [18]:
dataset.head()

Unnamed: 0,nome_ep,Episodios,text_processed,0,1,2,nota_imdb,qtd_reviews,Topic
0,/series/Arrow-2193021/season-1/episode-1-Pilot,The name of the islandthey found me on is Lian...,the name of the islandthey found me on is lian...,0.352093,0.646065,,8.5,7572.0,1.0
1,/series/Arrow-2193021/season-1/episode-2-Honor...,[PANTING][GRUNTS][SPEAKING IN FOREIGN LANGUAGE...,[panting][grunts][speaking in foreign language...,0.027911,0.970653,,8.2,5731.0,1.0
2,/series/Arrow-2193021/season-1/episode-3-Lone_...,"My name is Oliver Queen.To my family, I am the...",my name is oliver queen to my family i am the...,0.349652,0.648876,,8.4,5367.0,1.0
3,/series/Arrow-2193021/season-1/episode-4-An_In...,"My name is Oliver Queen.For 5 years, I was str...",my name is oliver queen for 5 years i was str...,0.197529,0.197696,0.604775,8.3,5071.0,2.0
4,/series/Arrow-2193021/season-1/episode-5-Damaged,"OLIVER:My name is Oliver Queen.For five years,...",oliver:my name is oliver queen for five years ...,0.020486,0.971107,,8.6,5011.0,1.0


In [17]:
dataset.columns

Index(['nome_ep', 'Episodios', 'text_processed', '0', '1', '2', 'nota_imdb',
       'qtd_reviews', 'Topic'],
      dtype='object')

## Splitting the dataset in test and train

In [24]:
# separando os atributos e a classe do dataset
array = dataset.values
X = array[:,3:6]
Y = array[:,6]

# definindo o tamanho do conjunto de teste
test_size = 0.20 

# A semente (seed) pode ser qualquer número, e garante que os resultados possam ser reproduzidos de forma idêntica toda vez que o script for rodado. 
# Isto é muito importante quando trabalhamos com modelos ou métodos que utilizam de algum tipo de aleatoriedade.
seed = 7

# particionando em conjuntos de treino e teste
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

In [26]:
Y

array([8.5, 8.2, 8.4, 8.3, 8.6, 7.9, 8.2, 8.1, 8.4, 7.8, 8.1, 8.4, 8.6,
       8.8, 8.0, 8.7, 8.1, 8.3, 8.2, 8.5, 8.8, 9.1, 9.5, 8.7, 8.5, 8.7,
       8.6, 8.8, 8.4, 8.5, 9.1, 9.3, 8.3, 8.6, 8.6, 8.7, 8.8, 9.2, 8.5,
       8.2, 9.0, 9.3, 9.0, 9.2, 9.5, 8.9, 8.5, 8.2, 8.4, 8.1, 8.0, 9.2,
       9.6, 8.5, 8.0, 8.5, 8.3, 8.4, 8.8, 8.3, 8.4, 8.9, 9.0, 8.7, 8.5,
       8.8, 8.2, 8.3, 8.0, 8.0, 8.2, 8.8, 7.9, 8.0, 9.0, 8.8, 7.9, 7.8,
       8.1, 8.3, 7.7, 8.1, 6.1, 6.6, 7.1, 6.6, 7.4, 7.1, 7.0, 5.5, 8.3,
       8.2, 8.1, 8.1, 8.6, 8.3, 8.3, 9.1, 9.2, 7.8, 8.2, 8.1, 7.4, 7.8,
       8.5, 9.0, 9.2, 8.5, 7.9, 7.7, 8.2, 9.1, 9.7, 7.9, 8.0, 7.8, 7.4,
       8.7, 8.4, 7.8, 8.7, 7.3, 7.5, 7.7, 7.9, 8.2, 7.3, 8.1, 8.0, 7.9,
       8.7, 8.0, 8.0, 8.9, 8.5, 8.8, 9.1, 8.0, 8.3, 8.2, 8.6, 8.3, 9.4,
       8.4, 9.0, 7.4, 7.8, 7.6, 7.7, 8.1, 7.4, 7.7, 8.5, 8.2, 8.7, 9.1,
       8.4, 8.7, 8.8, 8.6, 9.0, 8.9, 8.7, 7.1, 9.4, nan], dtype=object)

In [28]:
# definindo a métrica de avaliação dos algoritmos
scoring = 'accuracy'
# scoring = 'roc_auc'

kfold = KFold(n_splits=10)

## KNN


In [29]:
model = KNeighborsClassifier()
results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
print(results.mean())

nan


In [30]:
# criação de outro modelo alterando o tipo de distância
model = KNeighborsClassifier(metric = 'manhattan')
print(model.get_params)
results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
print(results.mean())

<bound method BaseEstimator.get_params of KNeighborsClassifier(metric='manhattan')>
nan


## Decision Tree

In [None]:
model = DecisionTreeClassifier(random_state=7)
results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
print(results.mean())

In [None]:
# criando o modelo já com os parâmetros desejados
model = DecisionTreeClassifier(max_depth = None, # profundidade máxima - inteiro ou "None" (sem limite). Valores muito altos podem causar overviffing e muito baixos, underfitting.
                             criterion = 'entropy',
                             random_state=7) # critério de medição de qualidade de um particionamento {“gini”, “entropy”}, default=”gini”.

# imprimindo os parâmetros do modelo
print(model.get_params)

results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
print(results.mean())

## Naive Bayes

In [None]:
model = GaussianNB()
results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
print(results.mean())

## Suport Vector Machine

In [None]:
model = SVC(gamma='auto', kernel = 'linear')
results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
print(results.mean())

## Comparing model performance

In [None]:
# repetindo o código acima

# separando os atributos e a classe do dataset
array = dataset.values
X = array[:,0:8]
Y = array[:,8]

# definindo o tamanho do conjunto de teste
test_size = 0.20 

# A semente (seed) pode ser qualquer número, e garante que os resultados possam ser reproduzidos de forma idêntica toda vez que o script for rodado. 
# Isto é muito importante quando trabalhamos com modelos ou métodos que utilizam de algum tipo de aleatoriedade.
seed = 7

# particionando em conjuntos de treino e teste
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

In [None]:
np.random.seed(7) # definindo uma seed global

# preparando os modelos
models = []
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))

# avaliando um modelo por vez
results = []
names = []
scoring = 'accuracy'
folds = 10

for name, model in models:
  kfold = KFold(n_splits=folds)
  cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
  results.append(cv_results)
  names.append(name)
  msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()) 
  print(msg)

# boxplot de comparação dos algoritmos
fig = plt.figure() 
fig.suptitle('Comparação da Acurácia dos Algoritmos') 
ax = fig.add_subplot(111) 
plt.boxplot(results) 
ax.set_xticklabels(names) 
plt.show()

## Comparing model performance with pipeline

In [None]:
np.random.seed(7) # definindo uma seed global

# Pipelines para padronização do dataset e aplicação do modelo
pipelines = []
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsClassifier())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()),('CART', DecisionTreeClassifier())])))
pipelines.append(('ScaledNB', Pipeline([('Scaler', StandardScaler()),('NB', GaussianNB())])))
pipelines.append(('ScaledSVM', Pipeline([('Scaler', StandardScaler()),('SVM', SVC(gamma='auto'))])))

results = []
names = []

# Aplicação dos pipelines
for name, model in pipelines:
  kfold = KFold(n_splits=folds)
  cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
  results.append(cv_results)
  names.append(name)
  msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
  print(msg)

# boxplot de comparação dos algoritmos
fig = plt.figure() 
fig.suptitle('Comparação da Acurácia dos Algoritmos - StandardScaler') 
ax = fig.add_subplot(111) 
plt.boxplot(results) 
ax.set_xticklabels(names) 
plt.show()

In [None]:
np.random.seed(7) # definindo uma seed global

# Pipelines para madronização do dataset e aplicação do modelo
pipelines = []
pipelines.append(('NormKNN', Pipeline([('MinMaxScaler', MinMaxScaler()),('KNN', KNeighborsClassifier())])))
pipelines.append(('NormCART', Pipeline([('MinMaxScaler', MinMaxScaler()),('CART', DecisionTreeClassifier())])))
pipelines.append(('NormNB', Pipeline([('MinMaxScaler', MinMaxScaler()),('NB', GaussianNB())])))
pipelines.append(('NormSVM', Pipeline([('MinMaxScaler', MinMaxScaler()),('SVM', SVC(gamma='auto'))])))

results = []
names = []

# Aplicação dos pipelines
for name, model in pipelines:
  kfold = KFold(n_splits=folds)
  cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
  results.append(cv_results)
  names.append(name)
  msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
  print(msg)

# boxplot de comparação dos algoritmos
fig = plt.figure() 
fig.suptitle('Comparação da Acurácia dos Algoritmos - MinMaxScaler') 
ax = fig.add_subplot(111) 
plt.boxplot(results) 
ax.set_xticklabels(names) 
plt.show()

## Tuning the parameters with GridSearch

In [None]:
# Tuning do KNN

# padronização dos dados
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)

# hiperparâmetros que serão avaliados
k = [1,3,5,7,9,11,13,15,17,19,21]
distancias = ["euclidean", "manhattan", "minkowski"]
param_grid = dict(n_neighbors=k, metric=distancias)

# modelo
model = KNeighborsClassifier()
kfold = KFold(n_splits=10)

# busca exaustiva de hiperparâmetros com GridSearchCV
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(rescaledX, Y_train)

# imprime o melhor resultado
print("Melhor: %f usando %s" % (grid_result.best_score_, grid_result.best_params_)) 

# imprime todos os resultados
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f): %r" % (mean, stdev, param))

## The predictions

In [None]:
# https://www.mikulskibartosz.name/how-to-set-the-global-random_state-in-scikit-learn/
np.random.seed(7) # definindo uma seed global 

# padronização os dados de treino
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)

# criando um modelo com a melhor configuração até o momento
model = SVC(C=0.1, kernel='linear')
model.fit(rescaledX, Y_train)

# padronização dos dados de teste
rescaledTestX = scaler.transform(X_test)

# predição
predictions = model.predict(rescaledTestX)

# estimando a acurácia no conjunto de teste
print(accuracy_score(Y_test, predictions))
print(confusion_matrix(Y_test, predictions))
print(classification_report(Y_test, predictions))