In [229]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import os
from clean_text import TextCleaner
import re
import nltk

In [230]:
#! pip install spacy
#! python -m spacy download pt_core_news_sm
#! python -m spacy download en
nltk.download('punkt', quiet = True)

True

# 1. Implementação do Bayesian Sets

In [231]:
def score(X, x, c = 2):
    m = X.mean(0)
    N = x.shape[0]
    alfa = c * m
    beta = c * (1 - m)
    alfa_ = alfa + x.sum(0)
    beta_ = beta + N - x.sum(0)
    nc = (np.log(alfa + beta) - np.log(alfa + beta + N) + np.log(beta_) - np.log(beta)).sum(1)
    q = np.log(alfa_) - np.log(alfa) + np.log(beta) - np.log(beta_)
    s = nc + (X * q.T)
    
    return s

def search(query, corpus, clean_text = True):    
    # clean text
    if clean_text:
        cleaner = TextCleaner(clean_regex = ".*")
        query = cleaner.clean(query)
        corpus = cleaner.clean(corpus)
    
    # create DTM
    vec = CountVectorizer().fit(corpus)
    DTM = vec.transform(corpus)
    DTM_query = vec.transform(query)
    
    # calculate scores
    s = score(DTM, DTM_query)
    
    return s

# 2. Utilizando o algoritmo e fazendo limpeza no dataset

In [233]:
movies = '~/github/data/datasets/movies.csv'

In [234]:
df = pd.read_csv(movies)

# remove release year from movie title
df['title'] = df['title'].apply(lambda x: re.sub("\\([0-9]+\\)", "", x))

s = search(['toy story', 'the lion king','alladin','beauty and the beast','cinderella','little mermaid','hercules'] , df['title'])

data = {'title': df['title'], 'value': np.asarray(s).tolist()}
df = pd.DataFrame(data)
df['value'] = df['value'].apply(lambda x: x[0])
df.sort_values('value', ascending = False).iloc[0:30, :]

Unnamed: 0,title,value
1997,"Little Mermaid, The",10.091983
23240,Beauty and the Beast,9.96377
589,Beauty and the Beast,9.96377
23241,Beauty and the Beast (Beauty and the Beasts: A...,9.944825
360,"Lion King, The",9.696858
0,Toy Story,9.044807
3027,Toy Story 2,9.044807
15401,Toy Story 3,9.044807
7960,"Cinderella Story, A",9.044807
9398,"Lion King 1½, The",8.192724


# 3. Crie um classificador utilizando word2vec

In [191]:
movie_review = '~/github/data/datasets/movie_review.csv'

In [217]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import time

df = pd.read_csv(movie_review)

stop_words = set(stopwords.words('english'))
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [218]:
def review_to_wordlist(review, remove_stopwords=False):
    # remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review)
    
    # convert to lower case and split at whitespace
    words = review_text.lower().split()
    
    # remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]

    return words

def review_to_sentences(review, tokenizer, remove_stopwords=True):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []    
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    return sentences

In [219]:
sentences = [review_to_sentences(review, tokenizer, remove_stopwords = True)[0] for review in df['text']]

In [220]:
# Set values for various word2vec parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 4   # Minimum word count                        
num_workers = 3       # Number of threads to run in parallel
context = 3           # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
model = Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count = min_word_count,
    window = context, 
    sample = downsampling)

model.init_sims(replace=True)

In [221]:
def make_feature_vec(words, model, num_features):
    feature_vec = np.zeros((num_features,),dtype="float32")  # pre-initialize (for speed)
    nwords = 0.
    index2word_set = set(model.wv.index2word)  # words known to the model

    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            feature_vec = np.add(feature_vec,model[word])
    
    feature_vec = np.divide(feature_vec, nwords)
    return feature_vec


def get_avg_feature_vecs(reviews, model, num_features):
    counter = 0
    review_feature_vecs = np.zeros((len(reviews),num_features), dtype='float32')  # pre-initialize (for speed)
    
    for review in reviews:
        review_feature_vecs[counter,:] = make_feature_vec(review, model, num_features)
        counter = counter + 1
    return review_feature_vecs

In [222]:
clean_reviews = []
for review in df['text']:
    clean_reviews.append(review_to_wordlist(review, remove_stopwords=True))
trainDataVecs = get_avg_feature_vecs(clean_reviews, model, num_features)

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


In [223]:
# removendo linhas com valores nan
not_null = ~np.isnan(trainDataVecs).any(axis=1)
X = trainDataVecs[not_null]
y = df['tag'][not_null]

In [224]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
model = RandomForestClassifier(n_estimators= 100)
t0 = time.time()
model.fit(X_train, y_train)
t1 = time.time()
print("time to fit: %.2f" % (t1 - t0))
y_pred = model.predict(X_test)
t2 = time.time()
print("time to predict: %.2f" % (t2 - t1))

time to fit: 204.60
time to predict: 1.23


In [225]:
confusion_matrix(y_test, y_pred)

array([[4853, 4562],
       [4181, 5663]], dtype=int64)

In [226]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.54      0.52      0.53      9415
         pos       0.55      0.58      0.56      9844

   micro avg       0.55      0.55      0.55     19259
   macro avg       0.55      0.55      0.55     19259
weighted avg       0.55      0.55      0.55     19259

