In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer

## Implementação do algoritimo Bayesian Sets e consulta com 7 queries (etapas 1 e 2)

#### Carregar e tratar os dados

In [2]:
corpus = pd.read_csv("data/movies.csv")
query_list = ['toy story', 'the lion king','alladin','beauty and the best','cinderella','little mermaid','hercules']
corpus.loc[:,"title"] = corpus.title.apply(lambda t : re.sub(r'\([^)]*\)',"", t))

corpus.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,Adventure|Children|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama|Romance
4,5,Father of the Bride Part II,Comedy


#### Criar as dtms

In [3]:
vectorizer = CountVectorizer()
vectorizer.fit(corpus.title)
X = vectorizer.transform(corpus.title)
x = vectorizer.transform(query_list)

#### Implementar o algorítimo

In [4]:
c = 2
m = np.mean(X, 0) + 0.0000000001 # somar um numero pequeno para evitar divisões por zero
N = x.shape[0]
xij = x.toarray()

alpha = c * m
beta = c * (1 - m)
alpha_t = alpha + np.sum(xij, 0)
beta_t = beta + N - np.sum(xij, 0)
nc = np.sum(np.log(alpha + beta) - np.log(alpha + beta +N) + np.log(beta_t) - np.log(beta),1)
q = np.log(alpha_t) - np.log(alpha) - np.log(beta_t) + np.log(beta)

#### Calcular o score

In [5]:
s = nc + np.sum(X.multiply(q),1)

#### "Planificar" e organizar o score pelos melhores resultados

In [6]:
s_flat = np.array(s).reshape((s.shape[0],))
c_indexes=s_flat.argsort()[::-1][:20]

#### Relacionar os resultados com o dataset original

In [7]:
result = pd.DataFrame(corpus.iloc[c_indexes]['title'])
result['score'] = s_flat[c_indexes]

In [8]:
result

Unnamed: 0,title,score
1997,"Little Mermaid, The",10.056046
360,"Lion King, The",9.516822
0,Toy Story,8.917427
15401,Toy Story 3,8.917427
3027,Toy Story 2,8.917427
7960,"Cinderella Story, A",8.917427
9398,"Lion King 1½, The",8.012687
18252,Another Cinderella Story,7.412351
4473,Best of the Best,7.381584
4474,Best of the Best 2,7.381584


## Classificador binário utilizando word2vec (etapa 3)

In [2]:
from gensim.models import Word2Vec
import random
from sklearn.utils import shuffle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import svm



In [6]:
df = pd.read_csv("data/movie_review.csv")

In [7]:
df = df[["text","tag"]]
df = shuffle(df)
df = df.reset_index(drop=True)

In [8]:
df.head()

Unnamed: 0,text,tag
0,he has become an unquestioning weapon .,pos
1,they walk in unison and they only look forward .,pos
2,and then there's the special effects .,neg
3,it's likely that the film will miss a wide-spr...,pos
4,the initially interesting plot is done even mo...,neg


In [3]:
def TreatText(data):    
    stops = set(stopwords.words("english"))  
    data['text'] = [re.sub("[^a-zA-Z]", " ",data['text'][i]) for i in range(len(data))] 
    data['text'] = [word_tokenize(data['text'][i].lower()) for i in range(len(data))] 
    data['text'] = [[w for w in data['text'][i] if w not in stops]for i in range(len(data))]
    return(data)

In [10]:
df = TreatText(df)

In [11]:
labels = np.array(df["tag"])

In [12]:
def meanVector(model,phrase):
    vocab = model.wv.vocab
    phrase = " ".join(phrase)
    phrase = [x for x in word_tokenize(phrase) if x in vocab]    
    if phrase == []:
        vetor = [0.0]*300 
    else:         
        vetor = np.mean([model[word] for word in phrase],axis=0)
    return vetor

In [13]:
def createFeatures(base,model): 
    features = [meanVector(model,base['text'][i])for i in range(len(base))]
    return features

In [14]:
model_skip = Word2Vec(df["text"], sg=1, min_count=10, size = 300, window=4, workers=8)
model_cbow = Word2Vec(df["text"], sg=0, min_count=10, size = 300, window=4, workers=8)   

In [15]:
df_skip = createFeatures(df, model_skip)
df_cbow = createFeatures(df, model_cbow)

  


In [16]:
def train_test(base_df,limit):
    X_train, X_test, y_train, y_test = train_test_split(base_df[0:limit], labels[0:limit], test_size=0.3, random_state=109)
    clf = svm.SVC(kernel="linear")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_test,y_pred   

In [23]:
sg_test,sg_pred = train_test(df_skip,5000)
cb_test,cb_pred = train_test(df_cbow,5000)

#### Resultado com skipgram

In [24]:
print(classification_report(sg_test,sg_pred))

              precision    recall  f1-score   support

         neg       0.60      0.48      0.53       730
         pos       0.59      0.70      0.64       770

   micro avg       0.59      0.59      0.59      1500
   macro avg       0.59      0.59      0.59      1500
weighted avg       0.59      0.59      0.59      1500



#### Resultado com cbow

In [25]:
print(classification_report(cb_test,cb_pred))

              precision    recall  f1-score   support

         neg       0.56      0.37      0.45       730
         pos       0.55      0.73      0.63       770

   micro avg       0.55      0.55      0.55      1500
   macro avg       0.56      0.55      0.54      1500
weighted avg       0.56      0.55      0.54      1500



##### O f1-score com skip-gram foi superior para as duas classes.

### Execução do algoritimo LDA no mesmo dataset, utilizando 2 tópicos. (etapa 4)

In [4]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [5]:
df = pd.read_csv("data/movie_review.csv")
df = df[["text","tag"]]
df = shuffle(df)
df = df.reset_index(drop=True)

In [7]:
labels = np.array(df["tag"])

In [6]:
df = TreatText(df)

In [8]:
train_text, test_text, labels_train, labels_test = train_test_split(df, labels, test_size=0.3)

In [10]:
common_dic = Dictionary(train_text.text)
common_corpus = [common_dic.doc2bow(t) for t in train_text.text]

In [11]:
test_corpus = [common_dic.doc2bow(t) for t in test_text.text]

In [12]:
lda = LdaModel(common_corpus, num_topics=2)

In [13]:
cdf = pd.DataFrame(test_text["tag"])
cdf["lda_topic"] = [max(lda[x],key=lambda item: item[1])[0] for x in test_corpus]
cdf["lda_prob"] = [max(lda[x],key=lambda item: item[1])[1] for x in test_corpus]

In [15]:
cdf.head(10)

Unnamed: 0,tag,lda_topic,lda_prob
62205,neg,0,0.728866
23761,neg,1,0.73285
5259,pos,0,0.551229
34027,pos,1,0.868772
50988,neg,0,0.736056
60517,pos,0,0.7051
2833,neg,0,0.876939
34661,pos,1,0.73764
981,pos,0,0.721695
20954,pos,0,0.669952


In [23]:
pos_0 = len(cdf[(cdf.tag == "pos") & (cdf.lda_topic == 0)])
pos_1 = len(cdf[(cdf.tag == "pos") & (cdf.lda_topic == 1)])
neg_0 = len(cdf[(cdf.tag == "neg") & (cdf.lda_topic == 0)])
neg_1 = len(cdf[(cdf.tag == "neg") & (cdf.lda_topic == 1)])
t = len(cdf)
            
print("Positivos cujo tópico encotrado foi 0: ")
print("%s (%s)" % (pos_0, pos_0 / t))
print("Positivos cujo tópico encotrado foi 1: ")
print("%s (%s)" % (pos_1, pos_1 / t))
print("Negativos cujo tópico encotrado foi 0: ")
print("%s (%s)" % (neg_0, neg_0 / t))
print("Negativos cujo tópico encotrado foi 1: ")
print("%s (%s)" % (neg_1, neg_1 / t))

Positivos cujo tópico encotrado foi 0: 
5987 (0.30835393489905233)
Positivos cujo tópico encotrado foi 1: 
3990 (0.20550061804697156)
Negativos cujo tópico encotrado foi 0: 
6026 (0.3103625875566543)
Negativos cujo tópico encotrado foi 1: 
3413 (0.1757828594973218)


### Tanto reviews rotuladas como positivas, quanto negativas tiveram maior incidência no tópico 0.

Isso indica ou que o algorítimo nao foi implementado corretamente ou que o LDA não é adequado para esta tarefa. Considerando que tanto as reviews positivas quanto as negativas são sobre o mesmo tópico, filmes, é bastante provável que o LDA não seja o algorítimo mais indicado para fazer a segmentação binária destes dados.