# Sistema de Recomendação

In [1]:
import re
import glob
from multiprocessing import Pool
import numpy as np
import pandas as pd
import nltk

## Pré-Processamento

Constantes usadas no processamento das legendas:

In [2]:
TITLE = re.compile("(\/)(.*?)(\.)")
HTML_TAG = re.compile("<.*?>")

STOP_WORDS = nltk.corpus.stopwords.words('portuguese')

In [3]:
subs = pd.read_csv("./categories.tsv", sep="\t", names=["filepath", "genre"])
subs.head(5)

Unnamed: 0,filepath,genre
0,10000 B.C.(2008).XViD-PreVaill.br.srt,Ação
1,127 Hours (2010).BDRip.Larceny.br.srt,Aventura
2,12 Rounds.DVDRip.aXXo.br.srt,Crime
3,15 Minutes(2001).br.srt,Crime
4,17 Again.720p.REFiNED.br.srt,Comédia


In [4]:
def canonize(text):
    text = HTML_TAG.sub("", text)
    return text

def parse_text(filepath):
    def parse_block(block):
        lines   = block.split('\n')
        txt     = ' '.join(lines[2:])
        txt     = canonize(txt)
        return txt
    
    # We don't consider the first and last three blocks, since usually they're credits for
    # the translators and/or style definition for the subtitles.
    with open(filepath, encoding="latin-1") as f:
        sub_file = f.read()
        sub_file = sub_file.strip().replace('\r', '').split('\n\n')[3:-3]
        lines = map(parse_block, sub_file)
        return ' '.join(lines).strip()

In [5]:
subtitle = []

for row in subs['filepath']:
    filepath = 'Legendas/' + row
    try:
        text = parse_text(filepath)
    except FileNotFoundError:
        text = None

    subtitle.append(text)

# add subtitle column to our subs df
subs['subtitle'] = subtitle

# remove rows with NAs
subs = subs.dropna()

subs.head(5)

Unnamed: 0,filepath,genre,subtitle
0,10000 B.C.(2008).XViD-PreVaill.br.srt,Ação,E será sussurrada aos quatro ventos das grande...
1,127 Hours (2010).BDRip.Larceny.br.srt,Aventura,mas pense no que vamos tocar. Por favor. Preci...
2,12 Rounds.DVDRip.aXXo.br.srt,Crime,"Revisão: Bozano, Nininha e Virtualnet. 00:00:..."
3,15 Minutes(2001).br.srt,Crime,-Não perca tempo. -Está bem Ouviu o que eu dis...
4,17 Again.720p.REFiNED.br.srt,Comédia,"O'Donnell, poupe-se para o jogo! Só estou aque..."


## Modelos

### Matriz TF-IDF

Agora temos em mãos todos os dados que iremos precisarpara realizar as nossas análises e treinar nossos modelos de predição.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer


def tokenize(text, stem=False):
    ''' Tokenizer
    
    receives text (string) and return list of tokenized text, can receive an extra parameter in order
    to stem the strings.
    '''
    sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
    
    tokens = [word.lower() for sentence in sent_tokenizer.tokenize(text) \
              for word in nltk.word_tokenize(sentence) if word.isalpha()]
    if stem:
        stemmer = nltk.stem.RSLPStemmer()
        stems = [stemmer.stem(token) for token in tokens]
        return stems
    else:
        return tokens
    
def tokenize_and_stem(text):
    return tokenize(text, stem=True)

# define TF-IDF parameters (w/o stemming)
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   min_df=0.2, stop_words=STOP_WORDS,
                                  use_idf=True, tokenizer=tokenize, ngram_range=(1,3))

# define TF-IDF parameters (w/ stemming)
tfidf_vectorizer_stemmed = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   min_df=0.2, stop_words=STOP_WORDS,
                                  use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

Tendo as matrizes dos termos e documentos usando o algoritmo _TF-IDF_, podemos então treinar nos classificadores utilizando a matriz com os termos com e sem _stemming_.

In [7]:
%time tfidf_matrix = tfidf_vectorizer.fit_transform(subs['subtitle'])
print("shape of TF-IDF Matrix: ", tfidf_matrix.shape)

# adding a blank line in-between
print()

%time tfidf_matrix_stemmed = tfidf_vectorizer_stemmed.fit_transform(subs['subtitle'])
print("shape of TF-IDF Matrix (stemmed tokens): ", tfidf_matrix_stemmed.shape)

CPU times: user 2min 44s, sys: 2.76 s, total: 2min 46s
Wall time: 2min 52s
shape of TF-IDF Matrix:  (644, 1648)

CPU times: user 6min 13s, sys: 3.23 s, total: 6min 16s
Wall time: 6min 26s
shape of TF-IDF Matrix (stemmed tokens):  (644, 2027)


### K-Nearest neighbors

lembrar que indice retorna sempre a si mesmo, com uma distancia zero, ja que usamo o mesmos dados para treinar e para verificar.

In [8]:
from sklearn.neighbors import NearestNeighbors

nbrs = NearestNeighbors(n_neighbors=12).fit(tfidf_matrix)
distantes, indices = nbrs.kneighbors(tfidf_matrix)

In [9]:
indices

array([[  0, 506, 362, ..., 266, 170,  72],
       [  1, 536, 124, ..., 131, 266, 151],
       [  2, 362, 131, ..., 576, 360, 506],
       ..., 
       [641, 161, 266, ..., 536, 361, 124],
       [642, 360, 151, ..., 576, 361, 124],
       [643, 170, 350, ..., 266, 536, 361]])

### Experiment

In [236]:
def random_choose(matrix, n):
    num_rows = tfidf_matrix.shape[0]
    random_row_ids = np.random.choice(num_rows, n, replace=False)
    
    return random_row_ids

def get_top_recomendations(choosed, matrix, k):
    # Get the mean of all the choosed vectors     
    avg_vector = sum([matrix.getrow(idx) for idx in choosed]) / len(choosed)
    
    cos_similarities = linear_kernel(avg_vector, matrix)[0]
    # get the nearest neighbors to the new avg_vector
    recommended = cos_similarities.argsort()[:-(50+k):-1]
    # we usually get entries from the choosed vector, since we're
    # using the same dataset to test and to search for recommendations.
    recommended = [(cos_similarities[i], i) for i in recommended if i not in choosed]
    
    return recommended[:k]

def make_experiment_row(k, n, choosed, recommended):
    avg_similarity = sum(elem[0] for elem in recommended) / len(recommended)
    recommended = [elem[1] for elem in recommended]
    
    return {'N':n,
            'K': k,
            'Choosed':choosed,
            'Recommended':recommended,
            'Avg. Similarity':avg_similarity,
           }

In [237]:
def run_experiment(matrix, Ns, Ks):
    # Number of experiments is equal to len(Ns) * len(Ks)
    index = np.arange(len(Ns) * len(Ks))
    results = []
    for n in Ns:
        choosed = random_choose(matrix, n)
        for k in Ks:
            # Columns
            recomendations = get_top_recomendations(choosed, matrix, k)
            # Make new row and append to results
            result_row = make_experiment_row(k, n, choosed, recomendations)
            results.append(result_row)
            
    return results

In [238]:
Ns = [2, 4, 8, 16, 32, 64]
Ks = [5,10, 15]

results = run_experiment(tfidf_matrix, Ns, Ks)

df = pd.DataFrame(results, columns=['N', 'K', 'Choosed', 'Recommended', 'Avg. Similarity'])
df

Unnamed: 0,N,K,Choosed,Recommended,Avg. Similarity
0,2,5,"[629, 561]","[375, 308, 552, 224, 291]",0.368934
1,2,10,"[629, 561]","[375, 308, 552, 224, 291, 567, 377, 37, 367, 215]",0.362115
2,2,15,"[629, 561]","[375, 308, 552, 224, 291, 567, 377, 37, 367, 2...",0.355925
3,4,5,"[567, 350, 617, 136]","[85, 405, 423, 492, 430]",0.291633
4,4,10,"[567, 350, 617, 136]","[85, 405, 423, 492, 430, 544, 340, 603, 377, 193]",0.285011
5,4,15,"[567, 350, 617, 136]","[85, 405, 423, 492, 430, 544, 340, 603, 377, 1...",0.280221
6,8,5,"[188, 364, 412, 101, 486, 599, 140, 329]","[164, 622, 377, 234, 55]",0.364421
7,8,10,"[188, 364, 412, 101, 486, 599, 140, 329]","[164, 622, 377, 234, 55, 245, 176, 500, 340, 451]",0.35892
8,8,15,"[188, 364, 412, 101, 486, 599, 140, 329]","[164, 622, 377, 234, 55, 245, 176, 500, 340, 4...",0.355476
9,16,5,"[472, 520, 278, 388, 545, 402, 156, 39, 221, 5...","[167, 234, 164, 622, 594]",0.339588


### Análise

Para a nossa análise, consideramos N = {2, 4, 8, 16, 32, 64} e K = {5, 10, 15}, tambem vale salientar que a escolha de filmes que o usuario supostamente já assistiu e gostou foi feita de forma randomica, e o mesmo conjunto utilizado para os três possiveis tamanhos para K.

Para avaliar a semelhança calculei a média da similaridade dos cossenos, dos resultados sugestionados, portanto quando maior o valor mais similar os filmes são entre si.

Não consegui encontrar diferença significativa entre os resultados encontrados, portanto para uma escolhe de forma aleatoria, a qualidade dos resultados é estatisticamente o mesmo.

Uma futura análise, com dados de gosto real de um usuario pode levar a diferentes conclusões.