In [2]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load your dataset
df = pd.read_csv("downloads/sample_movie_reviews_200.csv")

# Simple stopwords list
basic_stopwords = set([
    "the", "a", "an", "is", "are", "was", "were", "and", "or", "in", "on", "at", 
    "for", "with", "of", "to", "from", "this", "that", "it", "as", "be", "by"
])

# Text cleaning
def clean_text(text):
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"[^\w\s]", " ", text)
    text = text.lower()
    tokens = text.split()
    tokens = [word for word in tokens if word not in basic_stopwords]
    return " ".join(tokens)

df['cleaned_review'] = df['review'].apply(clean_text)

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_review'])
y = df['rating']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    results[name] = {"MSE": mse, "R^2 Score": r2}

# Display results
results_df = pd.DataFrame(results).T.sort_values(by="R^2 Score", ascending=False)
print(results_df)

                              MSE  R^2 Score
Random Forest Regressor  1.025289   0.900457
Linear Regression        1.026637   0.900327
Ridge Regression         1.175548   0.885869
