In [81]:
import os

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [83]:
# Load the Rotten Tomatoes critic-review dataset.
# NOTE: the default below is the original author's absolute Windows path; set the
# RT_REVIEWS_CSV environment variable to point at the CSV on any other machine.
DATA_PATH = os.environ.get(
    'RT_REVIEWS_CSV',
    r'C:\Users\andre\OneDrive - Universidad Rafael Landivar\PRIMER CICLO 2024\IA\PROYECTO\archive\rotten_tomatoes_critic_reviews.csv',
)
data = pd.read_csv(DATA_PATH)
print("Datos cargados correctamente")

Datos cargados correctamente


In [84]:
# Count the rows whose review text is missing.
missing_reviews = data['review_content'].isna().sum()
print(missing_reviews)

65806


In [85]:
# Normalize the data.
# Drop rows without review text, then encode the label as 1 ('Fresh') / 0 (anything else).
# Matching against both 'Fresh' and 1 keeps this cell idempotent: with a plain
# `x == 'Fresh'` test, re-running it on already-encoded labels would turn every label into 0.
data = (
    data
    .dropna(subset=['review_content'])
    .assign(review_type=lambda df: df['review_type'].isin(['Fresh', 1]).astype('int64'))
)
print(data.head())

  rotten_tomatoes_link      critic_name  top_critic           publisher_name  \
0            m/0814255  Andrew L. Urban       False           Urban Cinefile   
1            m/0814255    Louise Keller       False           Urban Cinefile   
2            m/0814255              NaN       False      FILMINK (Australia)   
3            m/0814255     Ben McEachen       False  Sunday Mail (Australia)   
4            m/0814255      Ethan Alter        True       Hollywood Reporter   

   review_type review_score review_date  \
0            1          NaN  2010-02-06   
1            1          NaN  2010-02-06   
2            1          NaN  2010-02-09   
3            1        3.5/5  2010-02-09   
4            0          NaN  2010-02-10   

                                      review_content  
0  A fantasy adventure that fuses Greek mythology...  
1  Uma Thurman as Medusa, the gorgon with a coiff...  
2  With a top-notch cast and dazzling special eff...  
3  Whether audiences will get behind The

In [86]:
# Re-count missing review texts now that rows without text have been dropped.
remaining_missing = data['review_content'].isna().sum()
print(remaining_missing)

0


In [87]:
# Data preparation: the review text is the model input, the review type is the target.
X, y = data['review_content'], data['review_type']

In [88]:
# Convert the reviews into a numeric representation (TF-IDF, English stop words removed).
vectorizer = TfidfVectorizer(
    stop_words='english',
)

In [89]:
# Fit the vectorizer and build the TF-IDF matrix straight from the source column.
# Reading from `data` instead of reassigning `X = f(X)` keeps this cell re-runnable:
# on a second run `X` would already be the sparse matrix rather than the raw text.
X = vectorizer.fit_transform(data['review_content'])

In [90]:
# Split the dataset into training and test sets (20% held out, fixed random seed).
# NOTE(review): the TF-IDF vectorizer is fitted on the full corpus before this split, so
# its IDF statistics include the test documents; consider splitting the raw text first.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [94]:
# modelo
class NaiveBayes:
    """Multinomial Naive Bayes classifier with additive smoothing.

    Labels must be non-negative integers (they are counted with ``np.bincount``);
    predictions are the integer class indices ``0 .. n_classes - 1``.

    Args:
        alpha: Additive smoothing constant added to every per-class feature total.
            Must be positive. The default of 1.0 is Laplace smoothing.

    Attributes set by ``fit``:
        class_counts: Number of training rows per class.
        n_classes: Number of classes (``max(y) + 1``).
        feature_counts: Array of shape (n_classes, n_features) holding the summed
            feature values of the rows belonging to each class.
        class_log_prior: Log of the fraction of training rows in each class.
        feature_log_prob: Smoothed log-probability of each feature given the class.
    """

    def __init__(self, alpha=1.0):
        if alpha <= 0:
            raise ValueError("alpha must be positive, got %r" % (alpha,))
        self.alpha = alpha

    def fit(self, X, y):
        """Estimate class priors and per-class feature log-probabilities.

        Args:
            X: 2-D feature matrix (dense NumPy array or SciPy sparse matrix) with one
                row per document.
            y: Integer class label per row; any array-like (list, ndarray, Series).

        Raises:
            ValueError: If ``X`` and ``y`` do not have the same number of rows.
        """
        # Work on a positional NumPy array so that `y == c` is always an element-wise
        # boolean mask (a plain list would compare to a single False).
        y = np.asarray(y).ravel()
        n_docs, n_features = X.shape
        if y.shape[0] != n_docs:
            raise ValueError(
                "X has %d rows but y has %d labels" % (n_docs, y.shape[0])
            )

        # Count the occurrences of each class
        self.class_counts = np.bincount(y)
        self.n_classes = len(self.class_counts)

        # Sum the feature values per class
        self.feature_counts = np.zeros((self.n_classes, n_features))
        for c in range(self.n_classes):
            # Sparse matrices return a (1, n_features) np.matrix here; flatten explicitly.
            self.feature_counts[c, :] = np.asarray(X[y == c].sum(axis=0)).ravel()

        # Compute log-probabilities.
        # A label value that never occurs gets a -inf prior (it can never be predicted);
        # silence the divide-by-zero warning that np.log(0) would otherwise emit.
        with np.errstate(divide='ignore'):
            self.class_log_prior = np.log(self.class_counts) - np.log(n_docs)
        class_totals = self.feature_counts.sum(axis=1)[:, np.newaxis]
        self.feature_log_prob = (
            np.log(self.feature_counts + self.alpha)
            - np.log(class_totals + self.alpha * n_features)
        )

    def predict(self, X):
        """Return, for each row of ``X``, the index of the class with the highest score."""
        return np.argmax(self.predict_log(X), axis=1)

    def predict_log(self, X):
        """Return the unnormalized joint log-scores, shape (n_rows, n_classes)."""
        return X @ self.feature_log_prob.T + self.class_log_prior

In [95]:
# Train the model on the training split
model = NaiveBayes()
model.fit(X_train, y_train)

In [96]:
# Evaluate the model on the test split
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.7834460141982588
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.51      0.63     76483
           1       0.77      0.94      0.85    136360

    accuracy                           0.78    212843
   macro avg       0.80      0.72      0.74    212843
weighted avg       0.79      0.78      0.77    212843



In [97]:
def classify_review(review):
    """Classify a single review text as "Fresh" or "Rotten".

    The text is vectorized with the notebook-level ``vectorizer`` and scored by the
    notebook-level ``model``.

    Args:
        review: The review text to classify.

    Returns:
        "Fresh" when the predicted label is 1, otherwise "Rotten".
    """
    features = vectorizer.transform([review])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Fresh"
    return "Rotten"

# Usage example: classify a few sample reviews and print each verdict
sample_reviews = [
    "The movie was absolutely fantastic, I loved every moment of it!",
    "The movie was terrible, I hated it and it was a waste of time.",
    "Too many long, very pauses. The Story was slow. Very dissapointed",
]
for new_review in sample_reviews:
    print("Review:", new_review)
    print("Classification:", classify_review(new_review))


Review: The movie was absolutely fantastic, I loved every moment of it!
Classification: Fresh
Review: The movie was terrible, I hated it and it was a waste of time.
Classification: Rotten
Review: Too many long, very pauses. The Story was slow. Very dissapointed
Classification: Fresh


In [98]:
# Spot-check the classifier on another sample review
new_review = "Such an original and hearfelt movie! Easily best picture of the year!"
print("Review:", new_review)
verdict = classify_review(new_review)
print("Classification:", verdict)

Review: Such an original and hearfelt movie! Easily best picture of the year!
Classification: Fresh


In [99]:
# Spot-check the classifier on another sample review
new_review = "Nicely done film. Lee is excellent as the reviews have said. There are some absolutely perfectly staged shots as well."
print("Review:", new_review)
verdict = classify_review(new_review)
print("Classification:", verdict)

Review: Nicely done film. Lee is excellent as the reviews have said. There are some absolutely perfectly staged shots as well.
Classification: Fresh


In [100]:
# Spot-check the classifier on another sample review
new_review = "Poor writing undermines an otherwise interesting story. Too many bad choices in the writing that just don't add up and definitely would not hold up to repeated viewings."
print("Review:", new_review)
verdict = classify_review(new_review)
print("Classification:", verdict)

Review: Poor writing undermines an otherwise interesting story. Too many bad choices in the writing that just don't add up and definitely would not hold up to repeated viewings.
Classification: Rotten


In [101]:
# Spot-check the classifier on another sample review
new_review = "Tried but just too slow and boring. There may have been a crumb of a good story somewhere in there, but I couldn't stay away through the second episode, always a bad sign for a new series."
print("Review:", new_review)
verdict = classify_review(new_review)
print("Classification:", verdict)

Review: Tried but just too slow and boring. There may have been a crumb of a good story somewhere in there, but I couldn't stay away through the second episode, always a bad sign for a new series.
Classification: Rotten


In [102]:
# Spot-check the classifier on another sample review
new_review = "So boring and slow. I fell asleep halfway through second episode."
print("Review:", new_review)
verdict = classify_review(new_review)
print("Classification:", verdict)

Review: So boring and slow. I fell asleep halfway through second episode.
Classification: Rotten


In [103]:
# Spot-check the classifier on another sample review
new_review = "Two episodes in, so far pretty good."
print("Review:", new_review)
verdict = classify_review(new_review)
print("Classification:", verdict)

Review: Two episodes in, so far pretty good.
Classification: Fresh


In [104]:
# Spot-check the classifier on another sample review
new_review = "I dont know why the low rating, this show is phenomenal. Its depth and the acting are superb. Excited to see where the show goes!!!"
print("Review:", new_review)
verdict = classify_review(new_review)
print("Classification:", verdict)

Review: I dont know why the low rating, this show is phenomenal. Its depth and the acting are superb. Excited to see where the show goes!!!
Classification: Fresh
