### Import libraries and data preprocessing

In [1]:
import re
import unicodedata
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amard\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove accents
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize the text
    tokens = text.split()

    # Remove English stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Join the tokens back into a single string
    text = ' '.join(tokens)

    return text

In [3]:
# Load imdb data
imdb = pd.read_parquet("datasets/sample_imdb.parquet")
imdb.head()

Unnamed: 0,review,sentiment
19374,"Believe it or don't, i have my very own DVD co...",negative
39607,Spoiler Alert <br /><br />I have never seen co...,positive
27294,"Career criminal and crime boss, Abel Davos (Li...",positive
18617,It was a terrific movie! I like to watch it ag...,positive
14994,*** out of ****<br /><br />Yep! Dressed To Kil...,positive


In [4]:
# Limpiamos el texto
imdb.review = imdb.review.apply(preprocess_text)

### Modelos

In [5]:
# Importamos librerias
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

# Seleccionamos las variables de entrada y el target
X = imdb.review
y = imdb.sentiment

# Dividimos los datos en entrenamiento y testeo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
## Modelación con CountVectorizer

# Creacion del pipeline
model_count = Pipeline(steps=[
    ("count-vectorizer", CountVectorizer(max_features=3000, preprocessor=preprocess_text)),
    ("logit", LogisticRegression())
])

# Ajustamos el modelo
model_count.fit(X_train, y_train)

# Medimos el accuracy del modelo
model_count.score(X_test, y_test)

0.8505

In [11]:
## Modelación con TFIDF

from sklearn.feature_extraction.text import TfidfVectorizer

# Creacion del pipeline
model_tfidf = Pipeline(steps=[
    ("count-vectorizer", TfidfVectorizer(max_features=3000, preprocessor=preprocess_text)),
    ("logit", LogisticRegression())
])

# Ajustamos el modelo 
model_tfidf.fit(X_train, y_train)

# Medimos el accuracy del modelo
model_tfidf.score(X_test, y_test)

0.871

### Using embedings

In [16]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Dividimos los datos en entrenamiento y testeo
X = imdb.review.tolist()
y = imdb.sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

# Encoding using sentence transformer model
X_train_embed = embedding_model.encode(X_train, batch_size=512)
X_test_embed = embedding_model.encode(X_test, batch_size=512)

In [19]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_embed, y_train)

model.score(X_test_embed, y_test)

0.813