In [1]:
# Enable IntelliSense-style autocompletion (greedy IPython completer)
%config IPCompleter.greedy=True

In [8]:
import nltk
import string
import numpy as np
import pandas as pd
import collections
import math
import os
from bs4 import BeautifulSoup
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
import pickle

# Tokens dropped during cleaning: NLTK's English stop-word list plus every
# ASCII punctuation character. Needs the NLTK 'stopwords' corpus to be
# available locally.
english_stop_words = set(nltk.corpus.stopwords.words('english')).union(string.punctuation)


In [3]:
# Load the review dataset from the working directory; the file is decoded as
# ISO-8859-1 (Latin-1).
movies_ds = pd.read_csv("imdb_dataset.csv", encoding="ISO-8859-1")

In [4]:
def clean_markups(comment):
    """Strip HTML markup from a review and return only its text.

    Args:
        comment: Raw review text, possibly containing HTML tags.

    Returns:
        The text extracted by BeautifulSoup's ``html.parser``.
    """
    return BeautifulSoup(comment, "html.parser").get_text()

def clean_special_characters(comment):
    """Blank out bracketed spans and every non-letter character.

    Args:
        comment: Review text (a string).

    Returns:
        The text where each ``[...]`` span is replaced by a single space and
        every remaining character outside ``a-z``/``A-Z`` is replaced by a
        space (one space per character, so no collapsing happens here).
    """
    # Raw strings keep the regex backslashes literal; '\[' inside a plain
    # string is an invalid escape sequence that newer Python versions warn
    # about at compile time.
    comment = re.sub(r'\[[^]]*\]', ' ', comment)
    return re.sub(r'[^a-zA-Z]', ' ', comment)

def clean_stop_words(comment, stop_words=None):
    """Remove stop words and punctuation tokens from a tokenized review.

    Args:
        comment: Iterable of tokens (strings).
        stop_words: Optional collection of tokens to drop. When omitted, the
            module-level ``english_stop_words`` set is used, so the NLTK
            stop-word list is not reloaded on every call.

    Returns:
        A list with the tokens of ``comment`` that are not in the stop-word
        collection, in their original order.
    """
    if stop_words is None:
        # Built once at import time from the same NLTK list + punctuation.
        stop_words = english_stop_words
    return [word for word in comment if word not in stop_words]

def clean_review(comment):
    """Remove HTML markup from a review, then blank out special characters.

    Args:
        comment: Raw review text.

    Returns:
        The result of ``clean_special_characters`` applied to the output of
        ``clean_markups``.
    """
    return clean_special_characters(clean_markups(comment))

def tokenize(comment):
    """Split a review into tokens on runs of whitespace.

    Args:
        comment: Review text (a string).

    Returns:
        A list of the whitespace-separated tokens; empty for blank input.
    """
    return comment.split()

def get_corpus_text(comment):
    """Turn a raw review into a normalized, space-joined string of lemmas.

    The review is cleaned with ``clean_review``, lower-cased, tokenized,
    filtered with ``clean_stop_words`` and finally lemmatized.

    Args:
        comment: Raw review text.

    Returns:
        A single string with the surviving lemmatized tokens separated by
        one space each.
    """
    tokens = tokenize(clean_review(comment).lower())
    tokens = clean_stop_words(tokens)

    # optional: reduce each remaining token to its WordNet lemma
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

In [5]:
# Hold out 30% of the reviews for evaluation; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(movies_ds['Review'], movies_ds['Label'], test_size=0.3, random_state=42)

# Encode labels with an explicit map + integer cast. Series.replace relied on
# pandas silently downcasting the object column to int, which pandas 2.2
# deprecates; without that downcast the labels would stay object dtype.
# Assumes the labels are exactly 'pos' / 'neg': any other value maps to NaN
# and makes astype(int) raise instead of passing through unnoticed.
y_train = y_train.map({'pos': 1, 'neg': 0}).astype(int).values
y_test  = y_test.map({'pos': 1, 'neg': 0}).astype(int).values

# Normalize every review into the space-joined token string that the
# vectorizer consumes.
corpus_train = [get_corpus_text(comment) for comment in X_train]
corpus_test  = [get_corpus_text(comment) for comment in X_test]


In [6]:
# Fit the TF-IDF vocabulary/weights on the training corpus only, then apply
# the same fitted transform to the test corpus.
tfIdfv = TfidfVectorizer(norm='l2')

tfIdfv_train= tfIdfv.fit_transform(corpus_train)
tfIdfv_test = tfIdfv.transform(corpus_test)

clasificador_reg_log = LogisticRegression(random_state=0, solver='liblinear')
clasificador_reg_log.fit(tfIdfv_train,y_train)

# Predict once and reuse the result for every metric instead of re-running
# the classifier over the whole test matrix for each print.
y_pred = clasificador_reg_log.predict(tfIdfv_test)

# accuracy
print('accuracy del clasificador - version 1 : {0:.2f}'.format(accuracy_score(y_test, y_pred)))
# confusion matrix
print('matriz de confusión del clasificador - version 1: \n {0}'.format(confusion_matrix(y_test, y_pred)))
# precision 
print('precision del clasificador - version 1 : {0:.2f}'.format(precision_score(y_test, y_pred)))
# recall 
print('recall del clasificador - version 1 : {0:.2f}'.format(recall_score(y_test, y_pred)))
# f1
print('f1 del clasificador - version 1 : {0:.2f}'.format(f1_score(y_test, y_pred)))



accuracy del clasificador - version 1 : 0.90
matriz de confusión del clasificador - version 1: 
 [[6674  883]
 [ 651 6792]]
precision del clasificador - version 1 : 0.88
recall del clasificador - version 1 : 0.91
f1 del clasificador - version 1 : 0.90


In [10]:
# Persist the trained classifier to disk. The context manager closes the file
# even if pickle.dump raises, so no handle is leaked and no partially written
# file is left open.
# NOTE(review): only the classifier is saved here; the fitted TfidfVectorizer
# (tfIdfv) is needed to transform new text before predicting -- confirm it is
# persisted elsewhere.
ruta_archivo_clasificador = os.path.join("logistic_regression.pkl")
with open(ruta_archivo_clasificador, "wb") as archivo_clasificador:
    pickle.dump(clasificador_reg_log, archivo_clasificador)