### Importación de librerías y lectura de datos


In [None]:
!pip install ydata-profiling

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from ydata_profiling import ProfileReport

# Pandas
import pandas as pd
pd.set_option('display.max_columns', 25) # Maximum number of columns shown when displaying a DataFrame
pd.set_option('display.max_rows', 50) # Maximum number of rows shown when displaying a DataFrame

# Numpy
import numpy as np
np.random.seed(3301) # Seed NumPy's global random number generator

# Seaborn
import seaborn as sns 

#json Files
import json

# Matplolib
%matplotlib inline
import matplotlib.pyplot as plt

# librerias para trabajar con texto
import unicodedata
import string
from nltk.stem import LancasterStemmer, WordNetLemmatizer
import re
import contractions
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
import inflect
import nltk

# Vectorización de texto
from sklearn.feature_extraction.text import TfidfVectorizer

# Librerías para pipeline y su composicion
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler

# Modelos
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

# Import SGD Classifier
from sklearn.linear_model import SGDClassifier

# Métricas
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# Optimización modelo
from sklearn.model_selection import GridSearchCV

# Guardar modelo
import joblib


# Download the NLTK resources requested by this notebook.
# Punkt makes it possible to split a text into sentences.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Load the processed dataset from disk; the CSV is decoded as ISO-8859-1
db_route = 'sample_data/processed_data.csv'
df_originales = pd.read_csv(filepath_or_buffer=db_route, encoding='ISO-8859-1')

# Report the dataset dimensions: row count first, then column count
print('Número de filas: ', len(df_originales.index))
print('Número de columnas: ', len(df_originales.columns))

# Remove the column display limit, then preview five randomly sampled rows
pd.set_option('display.max_columns', None)
df_originales.sample(n=5)

Número de filas:  5000
Número de columnas:  2


Unnamed: 0,sentimiento,tokens
2224,0,confieso emm en mi opinin es la mej novel gran...
1448,1,te gust el hombr muerto us cuadrosqu pas con e...
2870,1,est fue el mej docu que visto en mi vid acabo ...
2895,0,slith hor comedy real enough hor comedy qual o...
3039,0,desp de un largo perodo en el espacio en busc ...


In [None]:
# Drop every row containing at least one missing value.
# The DataFrame is modified in place, so later cells see the filtered frame.
df_originales.dropna(axis=0, how='any', inplace=True)

In [None]:
# Hyperparameter grid handed to GridSearchCV: every combination of
# forest size and split criterion is evaluated.
parameters = dict(
    n_estimators=[100, 120],
    criterion=["gini", "entropy"],
)

In [None]:
# Base estimator: a random forest with a fixed seed
model = RandomForestClassifier(random_state=42)

# Wrap the estimator in a grid search over `parameters`:
# 4-fold cross-validation, all available cores (n_jobs=-1), progress messages on
b_model = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    cv=4,
    n_jobs=-1,
    verbose=1,
)
b_model

In [None]:
# Separate the input column ('tokens') from the target column ('sentimiento')
X, Y = df_originales['tokens'], df_originales['sentimiento']

In [None]:
# TF-IDF features over unigrams and bigrams.
# min_df / max_df are absolute document counts here: terms seen in fewer than
# 2 or in more than 1200 documents are ignored, and at most 10000 features are kept.
# NOTE(review): this fit sees every row of X, including rows that later end up
# in the test split.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=1200,
    max_features=10000,
)
X_count = vectorizer.fit_transform(X)
print(X_count.shape)

(5000, 10000)


In [None]:
# Train/test split (85% / 15%, fixed seed).
# The split is performed on the raw text so that the TF-IDF vocabulary and IDF
# weights are learned from the training rows only; fitting the vectorizer on
# the whole corpus before splitting lets statistics from the test rows leak
# into the features used for training.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=42)

# Re-fit the vectorizer on the training text only, then apply that same fitted
# transformation to the held-out text. `vectorizer` therefore ends up in the
# state that matches the trained model.
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

In [None]:
# Run the grid search on the training split
b_model.fit(X=X_train, y=Y_train)

Fitting 4 folds for each of 4 candidates, totalling 16 fits


In [None]:
# Predict labels for the held-out split with the fitted grid search
Y_pred = b_model.predict(X=X_test)

In [None]:
# Model evaluation.
# Each metric is computed exactly once and stored in a dict, so the printed
# values and the values written to disk cannot drift apart.
# F1, precision and recall use average='weighted'.
model_json = {
    'accuracy': accuracy_score(Y_test, Y_pred),
    'f1': f1_score(Y_test, Y_pred, average='weighted'),
    'precision': precision_score(Y_test, Y_pred, average='weighted'),
    'recall': recall_score(Y_test, Y_pred, average='weighted')
}

# Report the metrics
print('Accuracy: ', model_json['accuracy'])
print('F1: ', model_json['f1'])
print('Precision: ', model_json['precision'])
print('Recall: ', model_json['recall'])

# Save the metrics as JSON in the working directory
with open('./model_metrics.json', 'w') as outfile:
    json.dump(model_json, outfile)

Accuracy:  0.7893333333333333
F1:  0.7892973754643493
Precision:  0.7896658008658008
Recall:  0.7893333333333333


In [None]:
# Persist the fitted grid-search model to the working directory
joblib.dump(value=b_model, filename='./modelo_random_forest.joblib')
# Persist the fitted vectorizer as well; both artifacts are needed to score new text
joblib.dump(value=vectorizer, filename='./vectorizer.joblib')

['./vectorizer.joblib']