In [75]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [76]:
genre_cols = ['unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
              'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
              'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

In [77]:
movies = pd.read_csv('/Users/alhasan/Documents/Uneeq Interns/Movie Recommendation System/ml-100k/u.item',
                     sep='|', encoding='latin-1', header=None,
                     usecols=[0, 1] + list(range(5, 24)),
                     names=['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL'] + genre_cols)

In [78]:
ratings = pd.read_csv('/Users/alhasan/Documents/Uneeq Interns/Movie Recommendation System/ml-100k/u.data',
                      sep='\t', header=None,
                      names=['user_id', 'movie_id', 'rating', 'timestamp'])

In [79]:
data = pd.merge(ratings, movies, on='movie_id')

In [80]:
import re

def extract_year(title):
    match = re.search(r'\((\d{4})\)', title)
    return int(match.group(1)) if match else None

movies['year'] = movies['title'].apply(extract_year)

In [81]:
movie_stats = data.groupby('movie_id').agg({
    'rating': ['mean', 'count']
}).reset_index()

movie_stats.columns = ['movie_id', 'avg_rating', 'rating_count']

movies = pd.merge(movies, movie_stats, on='movie_id')

In [82]:
# Keep only useful columns
movies = movies[[
    'movie_id', 'title', 'year', 'avg_rating', 'rating_count'
] + genre_cols]

In [83]:
movies.head()

Unnamed: 0,movie_id,title,year,avg_rating,rating_count,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),1995.0,3.878319,452,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),1995.0,3.206107,131,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),1995.0,3.033333,90,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),1995.0,3.550239,209,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),1995.0,3.302326,86,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [84]:
movies.isna().sum()

movie_id        0
title           0
year            1
avg_rating      0
rating_count    0
unknown         0
Action          0
Adventure       0
Animation       0
Children's      0
Comedy          0
Crime           0
Documentary     0
Drama           0
Fantasy         0
Film-Noir       0
Horror          0
Musical         0
Mystery         0
Romance         0
Sci-Fi          0
Thriller        0
War             0
Western         0
dtype: int64

In [86]:
movies = movies.dropna()

In [87]:
movies.to_csv("your_preprocessed_movies.csv", index=False)

In [89]:
movies['like'] = (movies['avg_rating'] >= 4).astype(int)

features = ['year', 'avg_rating', 'rating_count'] + genre_cols
X = movies[features]
y = movies['like']


In [90]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       291
           1       1.00      1.00      1.00        46

    accuracy                           1.00       337
   macro avg       1.00      1.00      1.00       337
weighted avg       1.00      1.00      1.00       337



In [91]:
import joblib

joblib.dump(clf, 'movie_recommender_model.pkl')


['movie_recommender_model.pkl']