In [None]:
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from tqdm import tqdm
import shap

In [None]:
# Read the four CSV tables from the local `ml-latest-small` folder.
# For ratings and tags only the columns named in `usecols` are loaded.
movies = pd.read_csv('ml-latest-small/movies.csv')
links = pd.read_csv('ml-latest-small/links.csv')
ratings = pd.read_csv(
    'ml-latest-small/ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
)
tags = pd.read_csv(
    'ml-latest-small/tags.csv',
    usecols=['userId', 'movieId', 'tag'],
)

In [None]:
def _join_tags(values):
    """Concatenate one group's values into a single '|'-separated string."""
    return '|'.join(list(values))


# Every column after the two key columns is collapsed with `_join_tags`, giving
# one row per (userId, movieId) pair.
g = {column: _join_tags for column in tags.columns[2:]}
tags = tags.groupby(['userId', 'movieId']).agg(g).reset_index()

# Attach the link identifiers to each movie, and pair every rating with the
# tags the same user gave the same movie (pandas' default inner join, so
# ratings without tags are dropped).
# NOTE(review): `movies` and `tags` are reassigned in place, so re-running this
# cell merges `links` a second time — restart the kernel before re-running.
movies = movies.merge(links, on=['movieId'])
reviews = ratings.merge(tags, on=['userId', 'movieId'])

In [None]:
# Print the first rows of both merged tables as a quick sanity check.
for heading, frame in (("Movies head:\n", movies), ("Reviews head:\n", reviews)):
    print(heading, frame.head())

In [None]:
# Bar chart of how many rows of `reviews` (ratings that also have tags)
# fall on each rating value.
ratings_df = (
    reviews
    .groupby(by=['rating'])
    .agg({'userId': 'count'})
    .rename(columns={'userId': 'count'})
    .reset_index()
)
plt.bar(ratings_df['rating'], ratings_df['count'], align='center', width=0.3)
plt.xticks(ratings_df['rating'])
plt.title("Rating vs Count")
plt.xlabel("Rating")
plt.ylabel("Count")
# Render explicitly so the cell shows only the figure instead of echoing the
# Text object returned by plt.ylabel; this matches the later copy of this plot.
plt.show()

In [None]:
# Rich notebook display of the `movies` table (movies.csv merged with links.csv above).
display(movies)

In [None]:
# Turn each review's tag string into a TF-IDF vector; missing tags become ''.
vectorizer = TfidfVectorizer()
tag_matrix = vectorizer.fit_transform(reviews['tag'].fillna(''))

# Features are the densified TF-IDF matrix; the target is the rating on the same row.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split in the next cell, so the IDF weights also see the held-out rows —
# confirm this is acceptable for the reported scores.
X, y = tag_matrix.toarray(), reviews['rating']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate regressors compared in the training loop below.
# Every estimator with a stochastic component gets the same fixed seed so the
# reported scores are reproducible across kernel restarts (previously only the
# random forest was seeded). LinearRegression is deterministic and takes no seed.
models = {
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'LinearRegression': LinearRegression()
}

In [None]:
# Fit each candidate on the training split, score it on the held-out split,
# and keep the metrics keyed by model name.
results = {}
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    mse_value = mean_squared_error(y_test, predictions)
    r2_value = r2_score(y_test, predictions)
    results[label] = {'MSE': mse_value, 'R²': r2_value}
    print(f'{label} - MSE: {mse_value}, R²: {r2_value}')

In [None]:
# Pick the fitted random forest as the estimator used for the example
# predictions and the SHAP explanations further down.
# Later cells refer to it as `best_model`, which was never assigned and raised
# NameError on a fresh kernel; `model` is kept as an alias for existing uses.
best_model = models['RandomForest']
model = best_model

In [None]:
def prepare_features(new_movie_tags, tfidf):
    """Vectorize one tag string with an already-fitted vectorizer.

    Args:
        new_movie_tags: Tag text for a single movie; it is wrapped in a
            one-element list before being passed to ``tfidf.transform``.
        tfidf: Object exposing ``transform(documents)`` whose result has a
            ``toarray()`` method (the notebook passes its TfidfVectorizer).

    Returns:
        Whatever ``toarray()`` yields for that single document — presumably a
        dense array with one row; confirm against the vectorizer in use.
    """
    single_document = [new_movie_tags]
    encoded = tfidf.transform(single_document)
    return encoded.toarray()

In [None]:
# Example tag strings for unseen movies, each vectorized with the fitted
# `vectorizer` through `prepare_features`.
new_movie_tags = [
    "action packed, thrilling",
    "romantic, comedy",
    "suspenseful, mysterious",
]
new_movie_features = [
    prepare_features(tag_text, vectorizer) for tag_text in new_movie_tags
]

In [None]:
# Predict a rating for each example tag string with the estimator selected
# earlier (`model`); the previously used name `best_model` was never assigned.
# The loop variable is deliberately not called `tags`, so the `tags` DataFrame
# built near the top of the notebook is no longer overwritten by a string.
for tag_text, features in zip(new_movie_tags, new_movie_features):
    predicted_rating = model.predict(features)
    print(f'Predicted Rating for "{tag_text}": {predicted_rating[0]}')

In [None]:
# Same rating-frequency bar chart as earlier in the notebook, rendered with plt.show().
rating_groups = reviews.groupby(by=['rating'])
ratings_df = (
    rating_groups
    .agg({'userId': 'count'})
    .rename(columns={'userId': 'count'})
    .reset_index()
)
rating_values, rating_counts = ratings_df['rating'], ratings_df['count']
plt.bar(rating_values, rating_counts, align='center', width=0.3)
plt.xticks(rating_values)
plt.title("Rating vs Count")
plt.xlabel("Rating")
plt.ylabel("Count")
plt.show()

In [None]:
# Build a SHAP explainer for the estimator selected earlier (`model`) and
# compute SHAP values for every held-out row. The previously used name
# `best_model` was never assigned, so this cell raised NameError on a fresh kernel.
explainer = shap.Explainer(model)
shap_values = explainer(X_test)

In [None]:
# Summary plot over the held-out rows, labelled with the TF-IDF vocabulary terms.
tfidf_feature_names = vectorizer.get_feature_names_out()
shap.summary_plot(shap_values, X_test, feature_names=tfidf_feature_names)

In [None]:
# Load the JavaScript that interactive SHAP plots need inside the notebook.
shap.initjs()
# Explain the first held-out row. force_plot expects a plain array of SHAP
# values as its second argument, so pass `.values[0]` rather than the
# Explanation slice `shap_values[0]`. Feature names are supplied so the plot
# shows vocabulary terms instead of generic feature indices.
shap.force_plot(
    explainer.expected_value,
    shap_values.values[0],
    X_test[0],
    feature_names=list(vectorizer.get_feature_names_out()),
)