## Importación de librerías y datos 

In [3]:
#!pip install ydata_profiling pandas_profiling pyarrow fastparquet huggingface_hub ipywidgets nltk wordcloud matplotlib transformers torch > /dev/null

[31mERROR: Cannot install pandas-profiling==3.0.0, pandas-profiling==3.1.0, pandas-profiling==3.2.0 and ydata-profiling==4.7.0 because these package versions have conflicting dependencies.[0m[31m
[0m[31mERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts[0m[31m
[0m

In [1]:
from ydata_profiling import ProfileReport
import pandas as pd

In [2]:
splits = {'train': 'plain_text/train-00000-of-00001.parquet', 'test': 'plain_text/test-00000-of-00001.parquet', 'unsupervised': 'plain_text/unsupervised-00000-of-00001.parquet'}

# Cargar los datos de entrenamiento y prueba en DataFrames de pandas
train_df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits["train"])
test_df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits["test"])

## EDA

In [3]:
# Contar la cantidad de registros y columnas en cada conjunto de datos
train_count = train_df.shape
test_count = test_df.shape

print(f"Cantidad de registros y columnas en el conjunto de entrenamiento: {train_count}")
print(f"Cantidad de registros y columnas en el conjunto de prueba: {test_count}")

Cantidad de registros y columnas en el conjunto de entrenamiento: (25000, 2)
Cantidad de registros y columnas en el conjunto de prueba: (25000, 2)


La base esta particionada equitativamente entre datos de entrenamiento y testeo. Junto los conjuntos de datos para hacer el análisis exploratorio y luego particiono nuevamente en una proporción más beneficiosa para el entrenamiento del modelo.

In [4]:
df = pd.concat([train_df, test_df], axis=0)

In [5]:
# Profile Report
profile = ProfileReport(df, title="Informe Exploratorio", explorative=True)
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



A partir del profile report observé las siguientes características:
- 406 filas estan duplicadas por lo que prodeceré a eliminar duplicados.
- Prevalecen en el texto gran cantidad de stopwords que buscaré eliminar.
- El dataset se encuentra uniformemente distribuido.
- No hay missings.
- El conjunto de datos de entrenamiento y testeo se encuentra igualmente distribuido por lo que modificaré eso para que el train contenga el 80% y el test el 20%.

In [6]:
# Se eliminan filas duplicadas
df = df.drop_duplicates()

In [7]:
# Eliminar las etiquetas HTML de la columna 'text'
df['text'] = df['text'].str.replace(r'<[^>]*>', '', regex=True)

# Eliminar palabras comunes y sin valor predictivo (ejemplo con stopwords de NLTK)
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

df['text'] = df['text'].apply(remove_stopwords)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].str.replace(r'<[^>]*>', '', regex=True)
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/achula/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/achula/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(remove_stopwords)


In [8]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Combina todos los textos en una sola cadena
text = " ".join(review for review in df["text"])

# Genera la wordcloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

# Muestra la wordcloud usando matplotlib
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

  plt.show()


WordCloud sin stopwords:

In [9]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from io import BytesIO
from PIL import Image

# Combina todos los textos en una sola cadena
text = " ".join(review for review in df["text"])

# Genera la wordcloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

# Guardar la wordcloud en un objeto BytesIO
img_buffer = BytesIO()
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig(img_buffer, format='png')
plt.close()

# Mostrar la wordcloud desde el objeto BytesIO
img_buffer.seek(0)
img = Image.open(img_buffer)
img.show()

In [10]:
import sklearn.model_selection
from sklearn.model_selection import train_test_split

# Dividir el DataFrame en conjunto de entrenamiento (80%) y conjunto de prueba (20%)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Mostrar los resultados
print("Conjunto de Entrenamiento:")
print(df_train)
print("\nConjunto de Prueba:")
print(df_test)

Conjunto de Entrenamiento:
                                                    text  label
7865   big fan original book adaption simply bad . Fi...      0
4819   think cartoon one worst cartoons ever watched ...      0
10530  one poor attempt spinning old `` cons turn goo...      0
3450   Much like early horror film Boogens , devious ...      0
24471  went movie theater afternoon expecting underwh...      1
...                                                  ...    ...
11337  get amazed BAD film , world anybody could rais...      0
20091  n't see people giving film negative reviews ? ...      1
13491  `` goofs '' section film 's comment effect mis...      1
863    Actually , flick , made 1999 , pretty good pro...      0
15869  interesting documentary tells remarkable tale ...      1

[39665 rows x 2 columns]

Conjunto de Prueba:
                                                    text  label
4230   producers film sued misrepresentation copyrigh...      0
18634  plot starts interesting

## BERT

In [14]:
from transformers import BertTokenizer, BertForSequenceClassification

# Cargar el tokenizador y el modelo BERT preentrenado para clasificación de secuencias
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
import torch

train_encodings = tokenizer(df_train['text'].tolist(), truncation=True, padding=True, return_tensors='pt', max_length=128)
test_encodings = tokenizer(df_test['text'].tolist(), truncation=True, padding=True, return_tensors='pt', max_length=128)

train_labels = torch.tensor(df_train['label'].tolist())
test_labels = torch.tensor(df_test['label'].tolist())

In [17]:
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [18]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)



In [20]:
# Definir el dispositivo (GPU o CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Mover el modelo al dispositivo seleccionado
model.to(device)

train_losses = []

for epoch in range(3):  # Entrenamos el modelo por 3 épocas
    model.train()
    total_train_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in zip(['input_ids', 'attention_mask', 'labels'], batch)}
        outputs = model(**batch)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    
    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    print(f"Época {epoch + 1} completada con pérdida de entrenamiento {avg_train_loss:.4f}")

Época 1 completada con pérdida de entrenamiento 0.3218
Época 2 completada con pérdida de entrenamiento 0.1865
Época 3 completada con pérdida de entrenamiento 0.0940


In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model.eval()
total_test_loss = 0
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in zip(['input_ids', 'attention_mask', 'labels'], batch)}
        outputs = model(**batch)
        loss = outputs.loss
        total_test_loss += loss.item()
        
        predictions = torch.argmax(outputs.logits, dim=-1)
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

avg_test_loss = total_test_loss / len(test_loader)

accuracy = accuracy_score(all_labels, all_predictions)
precision = precision_score(all_labels, all_predictions)
recall = recall_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions)

print(f"Train Loss: {train_losses[-1]:.4f}")
print(f"Test Loss: {avg_test_loss:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Train Loss: 0.0940
Test Loss: 0.2929
Accuracy: 0.8939
Precision: 0.8869
Recall: 0.9020
F1 Score: 0.8944


## Modelo Preentrenado de Hugging Face

In [23]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import AdamW

In [24]:
# Cargar el tokenizador y el modelo
tokenizer = AutoTokenizer.from_pretrained("Deysi/sentiment_analysis_imbd")
model = AutoModelForSequenceClassification.from_pretrained("Deysi/sentiment_analysis_imbd")



In [25]:
# Tokenizar el conjunto de datos
train_encodings = tokenizer(df_train['text'].tolist(), truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(df_test['text'].tolist(), truncation=True, padding=True, return_tensors='pt')

In [26]:
# Convertir Etiquetas a Tensores
train_labels = torch.tensor(df_train['label'].tolist())
test_labels = torch.tensor(df_test['label'].tolist())

In [27]:
# Crear DataLoaders
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [28]:
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Evaluar el modelo en el conjunto de prueba
model.eval()

total_test_loss = 0
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_test_loss += loss.item()

        predictions = torch.argmax(outputs.logits, dim=-1)
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

avg_test_loss = total_test_loss / len(test_loader)

accuracy = accuracy_score(all_labels, all_predictions)
precision = precision_score(all_labels, all_predictions, average='weighted')
recall = recall_score(all_labels, all_predictions, average='weighted')
f1 = f1_score(all_labels, all_predictions, average='weighted')

print(f"Test Loss: {avg_test_loss:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Test Loss: 0.2398
Accuracy: 0.9136
Precision: 0.9137
Recall: 0.9136
F1 Score: 0.9136
