In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer

## Implementação do algoritimo Bayesian Sets e consulta com 7 queries (1 e 2)

#### Carregar e tratar os dados

In [2]:
corpus = pd.read_csv("data/movies.csv")
query_list = ['toy story', 'the lion king','alladin','beauty and the best','cinderella','little mermaid','hercules']
corpus.loc[:,"title"] = corpus.title.apply(lambda t : re.sub(r'\([^)]*\)',"", t))

corpus.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,Adventure|Children|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama|Romance
4,5,Father of the Bride Part II,Comedy


#### Criar as dtms

In [3]:
vectorizer = CountVectorizer()
vectorizer.fit(corpus.title)
X = vectorizer.transform(corpus.title)
x = vectorizer.transform(query_list)

#### Implementar o algorítimo

In [4]:
c = 2
m = np.mean(X, 0) + 0.0000000001 # somar um numero pequeno para evitar divisões por zero
N = x.shape[0]
xij = x.toarray()

alpha = c * m
beta = c * (1 - m)
alpha_t = alpha + np.sum(xij, 0)
beta_t = beta + N - np.sum(xij, 0)
nc = np.sum(np.log(alpha + beta) - np.log(alpha + beta +N) + np.log(beta_t) - np.log(beta),1)
q = np.log(alpha_t) - np.log(alpha) - np.log(beta_t) + np.log(beta)

#### Calcular o score

In [5]:
s = nc + np.sum(X.multiply(q),1)

#### "Planificar" e organizar o score pelos melhores resultados

In [6]:
s_flat = np.array(s).reshape((s.shape[0],))
c_indexes=s_flat.argsort()[::-1][:20]

#### Relacionar os resultados com o dataset original

In [7]:
result = pd.DataFrame(corpus.iloc[c_indexes]['title'])
result['score'] = s_flat[c_indexes]

In [8]:
result

Unnamed: 0,title,score
1997,"Little Mermaid, The",10.056046
360,"Lion King, The",9.516822
0,Toy Story,8.917427
15401,Toy Story 3,8.917427
3027,Toy Story 2,8.917427
7960,"Cinderella Story, A",8.917427
9398,"Lion King 1½, The",8.012687
18252,Another Cinderella Story,7.412351
4473,Best of the Best,7.381584
4474,Best of the Best 2,7.381584
