In [None]:
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from tqdm import tqdm
import shap

In [None]:
# Read the four CSV tables from the ml-latest-small/ directory. For ratings and
# tags only the columns named in `usecols` are loaded.
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv(
    'ml-latest-small/ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
)
tags = pd.read_csv(
    'ml-latest-small/tags.csv',
    usecols=['userId', 'movieId', 'tag'],
)
links = pd.read_csv('ml-latest-small/links.csv')

In [None]:
def _pipe_join(values):
    """Concatenate all values of one group into a single '|'-separated string."""
    return '|'.join(list(values))


# Every column from position 2 onward is collapsed to one string per
# (userId, movieId) pair.
g = {column: _pipe_join for column in tags.columns[2:]}
tags = tags.groupby(['userId', 'movieId']).agg(g).reset_index()

# Default (inner) joins: movies gain the columns of `links`; `reviews` keeps
# only the ratings that have a matching row in the aggregated tags.
movies = movies.merge(links, on=['movieId'])
reviews = ratings.merge(tags, on=['userId', 'movieId'])

In [None]:
# Print the first rows of both merged tables, each under its own caption.
for caption, frame in (("Movies head:\n", movies), ("Reviews head:\n", reviews)):
    print(caption, frame.head())

In [None]:
# Number of rows in `reviews` for each distinct rating value
# (non-null userId entries are counted).
ratings_df = (
    reviews
    .groupby(by=['rating'])['userId']
    .count()
    .rename('count')
    .reset_index()
)

# One bar per rating value, with a tick placed on every rating.
plt.bar(
    ratings_df['rating'],
    ratings_df['count'],
    align='center',
    width=0.3,
)
plt.xticks(ratings_df['rating'])
plt.title("Rating vs Count")
plt.xlabel("Rating")
plt.ylabel("Count")

In [None]:
# Render the `movies` table (by this point it has been merged with `links`
# on movieId) using the notebook's rich display.
display(movies)

In [None]:
# Build the modelling table: every rating, left-joined with that user's tags
# for the movie and then with the movie metadata.
# NOTE: dropna() removes every row that has a missing value in ANY column, so
# ratings without a matching tag row (NaN `tag` after the left join) are
# discarded here, as are rows with any other missing field.
data = (
    ratings
    .merge(tags, on=['userId', 'movieId'], how='left')
    .merge(movies, on='movieId', how='left')
    .dropna()
)

In [None]:
def _join_tags(tag_strings):
    """Merge all tag strings of one movie into a single space-separated document."""
    return ' '.join(tag_strings)


# Replace any missing tag with an empty string, then give every row the
# combined tag text of its movie (the same document is repeated for each row
# of that movie).
data['tag'] = data['tag'].fillna('')
data['tags_combined'] = data.groupby('movieId')['tag'].transform(_join_tags)

# TF-IDF over the combined tag text, capped at 500 terms, English stop words
# removed; the result is densified to a plain 2-D array.
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X_tags = vectorizer.fit_transform(data['tags_combined']).toarray()

In [None]:
# One indicator column per genre name found in the '|'-separated `genres` strings.
genres = data['genres'].str.get_dummies(sep='|')

In [None]:
# Feature matrix: TF-IDF tag columns first, genre indicator columns after them.
X = np.concatenate([X_tags, genres.to_numpy()], axis=1)

In [None]:
# Regression target: the `rating` column of the modelling table.
y = data['rating']

In [None]:
# Hold out 20% of the rows for evaluation; the seed fixes the shuffle.
# NOTE(review): rows are split individually, and the tag features are built per
# movie, so rows of the same movie can land on both sides of the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate regressors, keyed by display name.
# Every estimator that draws random numbers during fitting is given the same
# fixed seed, so a Restart & Run All reproduces the same metrics. Previously
# only the random forest was seeded, leaving the decision tree and the
# gradient boosting model free to vary between runs.
models = {
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'LinearRegression': LinearRegression()
}

In [None]:
# Fit each candidate on the training split and report its test-set error.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(
        f"Model: {name}",
        f"Mean Squared Error: {mse}",
        f"R^2 Score: {r2}",
        sep="\n",
    )

In [None]:
# Use the fitted random forest for all interpretation steps that follow.
model = models['RandomForest']
importances = model.feature_importances_

# Column labels in the same order as the columns of X: TF-IDF terms first,
# genre indicators second.
feature_names = vectorizer.get_feature_names_out().tolist() + list(genres.columns)

feature_importances = (
    pd.Series(importances, index=feature_names)
    .sort_values(ascending=False)
)

In [None]:
# The ten largest importances (the series is already sorted in descending order).
top_features = feature_importances.iloc[:10]

In [None]:
# Show the ten highest-ranked features with their importance scores as plain text.
print(top_features)

In [None]:
# Horizontal bar chart of the top features, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 6))
top_features.plot(kind='barh', ax=ax)
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
ax.set_title('Most Important Features for Predicting Movie Ratings')
plt.show()

In [None]:
def prepare_features(new_movie_tags, new_movie_genres, tfidf, genre_columns):
    """Build the single-row feature matrix for one movie.

    The row layout is the TF-IDF tag columns followed by one 0/1 indicator per
    entry of ``genre_columns``.

    Parameters
    ----------
    new_movie_tags : str
        Free-text tags; passed to ``tfidf.transform`` as a single document.
    new_movie_genres : str
        Genre names separated by ``'|'``. Whitespace around each name is
        ignored, so ``"Action | Sci-Fi"`` is treated like ``"Action|Sci-Fi"``.
        Names that do not appear in ``genre_columns`` contribute nothing.
    tfidf : object
        Fitted vectorizer exposing ``transform(list_of_str)`` whose result has
        a ``toarray()`` method.
    genre_columns : iterable of str
        Genre indicator names, in the column order the model was trained on.

    Returns
    -------
    numpy.ndarray
        Array with one row: tag features followed by genre indicators.
    """
    tag_features = tfidf.transform([new_movie_tags]).toarray()

    # Accept both the raw tokens (exact match, as before) and their trimmed
    # forms, so stray spaces around a separator no longer zero out a genre.
    raw_tokens = new_movie_genres.split('|')
    requested = set(raw_tokens) | {token.strip() for token in raw_tokens}

    # A plain integer array is enough here; no intermediate DataFrame is needed.
    genre_features = np.array(
        [[1 if column in requested else 0 for column in genre_columns]],
        dtype=int,
    )

    return np.hstack((tag_features, genre_features))

In [None]:
# Example inputs for a single prediction: free-text tags, and genre names
# separated by '|' (the separator prepare_features splits on).
new_movie_tags = "action packed, thrilling"
new_movie_genres = "Action|Adventure|Sci-Fi"

In [None]:
# Encode the example movie with the fitted vectorizer and the training genre
# columns, then ask the currently selected model for a rating.
new_movie_features = prepare_features(
    new_movie_tags,
    new_movie_genres,
    vectorizer,
    genres.columns,
)
predicted_rating = model.predict(new_movie_features)
print(f'Predicted Rating: {predicted_rating[0]}')

In [None]:
# Build a SHAP tree explainer for the selected model and compute SHAP values
# for every row of the test split.
# NOTE(review): "interventional" is requested but no background dataset is
# passed to TreeExplainer — confirm how the installed SHAP version handles
# that combination.
explainer = shap.TreeExplainer(model, feature_perturbation="interventional")
shap_values = explainer.shap_values(X_test)

In [None]:
# X_test is a bare NumPy array, so without explicit names SHAP can only label
# the bars generically by position. Pass the column labels in the same order
# as the columns of X: TF-IDF terms first, genre indicators second.
shap.summary_plot(
    shap_values,
    X_test,
    feature_names=vectorizer.get_feature_names_out().tolist() + list(genres.columns),
    plot_type="bar",
)

In [None]:
# Load SHAP's JavaScript so the interactive force plot can render in the notebook.
shap.initjs()

# Explain the first test row. X_test is a bare NumPy array, so the column
# labels (TF-IDF terms first, genre indicators second, matching the column
# order of X) are passed explicitly to make the plot readable.
shap.force_plot(
    explainer.expected_value,
    shap_values[0],
    X_test[0],
    feature_names=vectorizer.get_feature_names_out().tolist() + list(genres.columns),
)