In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the raw reviews from the local Parquet file into a DataFrame.
df = pd.read_parquet('0000.parquet')

In [None]:
# Keep only the columns used by the rest of the analysis.
# NOTE(review): 'lenght_review_body' is spelled this way on purpose here;
# presumably it matches the dataset's column name -- confirm before renaming.
columnas_utiles = ['id', 'review_body', 'stars', 'language', 'lenght_review_body']
df = df[columnas_utiles]

In [None]:
# Drop the rows whose 'review_body' is missing.
df = df.dropna(subset=['review_body'])

In [None]:
# Normalise the language code to its lowercase two-character prefix and keep
# only the Spanish ('es') reviews.
# `assign` builds a new frame instead of writing into the existing one, and the
# explicit `.copy()` makes the filtered result an independent frame, so the
# column assignments in the following cells do not raise
# SettingWithCopyWarning on pandas versions without Copy-on-Write.
df = df.assign(language=df['language'].str.lower().str[:2])
df = df.loc[df['language'] == 'es'].copy()

In [None]:
# First cleaning pass: lowercase and trim the review text, then remove rows
# whose normalised text repeats an earlier row.
df = (
    df.assign(review_body_clean=lambda d: d['review_body'].str.lower().str.strip())
      .drop_duplicates(subset=['review_body_clean'])
)

In [None]:
# Second, stricter cleaning pass; it overwrites 'review_body_clean':
#   1. lowercase the text
#   2. remove URLs
#   3. replace every character that is not a Spanish letter or whitespace
#      with a space
#   4. collapse whitespace runs into a single space and trim the ends
# Step 3 substitutes a space instead of deleting the character so that words
# separated only by punctuation ("bien,pero") are not fused into a single
# token ("bienpero"); step 4 then removes any extra spaces this introduces.
df['review_body_clean'] = (
    df['review_body']
    .str.lower()
    .str.replace(r'http\S+', '', regex=True)
    .str.replace(r'[^a-záéíóúñü\s]', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Rows whose cleaned text is identical are treated as duplicates.
df = df.drop_duplicates(subset=['review_body_clean'])

In [None]:
def sentimiento(stars):
    """Map a star rating to a sentiment label.

    Args:
        stars: Numeric rating of a review. May be missing (None / NaN).

    Returns:
        "NEGATIVO" for ratings of 2 or less, "NEUTRO" for exactly 3,
        "POSITIVO" for anything else, and None when the rating is missing.
    """
    # A missing rating fails both comparisons below and would otherwise be
    # labelled "POSITIVO" (NaN) or raise TypeError (None).
    if pd.isna(stars):
        return None
    if stars <= 2:
        return "NEGATIVO"
    if stars == 3:
        return "NEUTRO"
    return "POSITIVO"

# Label every review by passing its star rating through `sentimiento`.
df['sentiment'] = df['stars'].map(sentimiento)

In [None]:
# Relative frequency of each sentiment label (proportions, not raw counts).
df['sentiment'].value_counts(normalize=True)

In [None]:
# Display a random sample of 5000 rows drawn with a fixed seed (42).
# NOTE(review): the sampled frame is not assigned to any variable, so `df`
# itself is unchanged -- confirm whether a subset was meant to be kept.
df.sample(n=5000, random_state=42)

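A partir de aquí utilizamos los subconjuntos creados por Raúl (`df_500_fx.csv`, `df_5000_fx.csv` y `df_completo_fx.csv`), que sustituyen al `df` procesado en las celdas anteriores.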

In [None]:
# Load the first pre-built subset from CSV. This rebinds `df`, replacing the
# frame that was built in the cells above.
df = pd.read_csv('df_500_fx.csv')

In [None]:
# Load the second pre-built subset from CSV.
# NOTE(review): `df2` is not referenced again in the visible part of this
# notebook -- confirm it is still needed.
df2=pd.read_csv('df_5000_fx.csv')

In [None]:
# Load the third pre-built CSV (file name suggests the complete dataset).
df3=pd.read_csv('df_completo_fx.csv')

In [None]:
# Input text and target labels taken from the data loaded from df_500_fx.csv.
X_texto, y = df['review_body_clean'], df['sentiment']

In [None]:
# Input text and target labels taken from the data loaded from df_completo_fx.csv.
X_texto3, y3 = df3['review_body_clean'], df3['sentiment']

In [None]:
# Peek at the first cleaned reviews as a sanity check.
X_texto.head()

Unnamed: 0,review_body_clean
0,hola el producto q he pedido no es el q me ha ...
1,no quita nada de nada
2,está bien de precio dentro de lo normal en est...
3,mi padre está muy contento tanto que seguramen...
4,entre canción y canción se pasa mucho tiempo h...


In [None]:
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB

In [None]:
# TF-IDF features over single words (unigrams) for the small subset; min_df=2
# ignores terms that appear in fewer than two documents.
# NOTE(review): the vectorizer is fitted on the whole subset before the
# train/test split, so the vocabulary and IDF weights also see the test rows.
VectorTexto = TfidfVectorizer(min_df=2, ngram_range=(1, 1))
X_text_vec = VectorTexto.fit_transform(X_texto)

In [None]:
# Same TF-IDF configuration, now fitted on the text from df_completo_fx.csv.
# This rebinds `VectorTexto`: from here on the name refers to this vectorizer,
# which is the one `predecir_sentimiento` looks up when it is called.
VectorTexto = TfidfVectorizer(min_df=2, ngram_range=(1, 1))
X_text_vec3 = VectorTexto.fit_transform(X_texto3)

In [None]:
# 80/20 train/test split of the small subset with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_text_vec, y,
    test_size=0.2, random_state=42,
    stratify=y,  # keep the class proportions (positive / negative / neutral) in both partitions
)

In [None]:
# 80/20 train/test split of the full dataset with a fixed seed.
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X_text_vec3, y3,
    test_size=0.2, random_state=42,
    stratify=y3,  # keep the class proportions (positive / negative / neutral) in both partitions
)

In [None]:
# Complement Naive Bayes classifier; the hyper-parameters are spelled out
# explicitly so they are easy to tune.
hiperparametros = {'alpha': 1.0, 'fit_prior': True, 'class_prior': None}
NaiveModel1 = ComplementNB(**hiperparametros)

In [None]:
# Train the classifier on the small subset's training split.
# NOTE: the next cell fits this same estimator object again on the full
# dataset, which replaces what is learned here.
NaiveModel1.fit(X_train, y_train)

In [None]:
# Train the same estimator object on the full dataset's training split.
NaiveModel1.fit(X_train3, y_train3)

In [None]:
# Evaluate on the small subset's test split.
# When the notebook runs top to bottom, NaiveModel1 has already been fitted
# again on the full dataset (X_train3), whose TF-IDF vocabulary has a different
# number of features than X_test, so predicting with it here fails with a
# feature-count error. Fit an unfitted copy of the estimator (same
# hyper-parameters) on the small training split and evaluate that instead;
# NaiveModel1 itself is left untouched for the cells that follow.
modelo_muestra = clone(NaiveModel1).fit(X_train, y_train)
y_pred = modelo_muestra.predict(X_test)
accuracy1 = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negativo       0.70      0.80      0.74        40
      neutro       0.19      0.15      0.17        20
    positivo       0.66      0.62      0.64        40

    accuracy                           0.60       100
   macro avg       0.51      0.53      0.52       100
weighted avg       0.58      0.60      0.59       100



In [None]:
# Evaluate the classifier on the full dataset's test split: store the
# accuracy and print the per-class precision / recall / F1 table.
y_pred3 = NaiveModel1.predict(X_test3)
accuracy3 = accuracy_score(y_test3, y_pred3)
print(classification_report(y_test3, y_pred3))

              precision    recall  f1-score   support

    negativo       0.72      0.79      0.75     15840
      neutro       0.39      0.19      0.26      7903
    positivo       0.72      0.82      0.76     15665

    accuracy                           0.68     39408
   macro avg       0.61      0.60      0.59     39408
weighted avg       0.65      0.68      0.66     39408



In [None]:
# Build the metrics of the small-subset evaluation as a dict, print it, and
# export it as a table with one row per class / average.
# The scalar 'accuracy' entry is repeated across every column of its row.
report = classification_report(y_test, y_pred, output_dict=True)
print(report)
report_df = pd.DataFrame(report).T
report_df.to_csv('metricas_clasificacion.csv', index=True)

In [None]:
# Build the metrics of the full-dataset evaluation as a dict, print it, and
# export it as a table with one row per class / average.
report3 = classification_report(y_test3, y_pred3, output_dict=True)
print(report3)
report_df3 = pd.DataFrame(report3).T
report_df3.to_csv('metricas_clasificacion3.csv', index=True)

{'negativo': {'precision': 0.7170467502850627, 'recall': 0.7940025252525252, 'f1-score': 0.7535650089874176, 'support': 15840.0}, 'neutro': {'precision': 0.3921618852459016, 'recall': 0.19372390231557637, 'f1-score': 0.25933768103667315, 'support': 7903.0}, 'positivo': {'precision': 0.7155978623914495, 'recall': 0.8206192148100862, 'f1-score': 0.7645187189628, 'support': 15665.0}, 'accuracy': 0.6842011774259034, 'macro avg': {'precision': 0.6082688326408046, 'recall': 0.6027818807927292, 'f1-score': 0.5924738029956302, 'support': 39408.0}, 'weighted avg': {'precision': 0.6513174080891142, 'recall': 0.6842011774259034, 'f1-score': 0.6588053483593632, 'support': 39408.0}}


In [None]:
# Display the metrics table built from the small-subset evaluation.
report_df

Unnamed: 0,precision,recall,f1-score,support
negativo,0.695652,0.8,0.744186,40.0
neutro,0.1875,0.15,0.166667,20.0
positivo,0.657895,0.625,0.641026,40.0
accuracy,0.6,0.6,0.6,0.6
macro avg,0.513682,0.525,0.517293,100.0
weighted avg,0.578919,0.6,0.587418,100.0


In [None]:
def predecir_sentimiento(texto):
    """Return the sentiment label predicted for a single piece of text.

    The text is vectorized with the notebook-level ``VectorTexto`` and
    classified with ``NaiveModel1``; both names are looked up at call time,
    so whatever objects they are bound to when this runs are the ones used.

    Args:
        texto: The review text to classify.

    Returns:
        The first (and only) element of the prediction array.
    """
    vector = VectorTexto.transform([texto])
    return NaiveModel1.predict(vector)[0]

In [None]:
# Quick manual check of the classifier on a hand-written sentence; swap in one
# of the commented-out calls below to try a different example.
#predecir_sentimiento("Me gustó mucho, altamente recomendable")
#predecir_sentimiento("Sin duda, la peor compra que he realizado")
predecir_sentimiento("Todo mal, baja calidad y pobre durabilidad")

np.str_('negativo')