In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import gc
import joblib

movies_df = pd.read_csv("Movies.csv", encoding='latin1')
ratings_df = pd.read_csv("Ratings.csv", encoding='latin1')
users_df = pd.read_csv("Users.csv", encoding='latin1')

movies_df = movies_df.loc[:, ~movies_df.columns.str.contains('^Unnamed')]
movies_df['Year'] = movies_df['Title'].str.extract(r'\((\d{4})\)').astype('Int64')
movies_df['Category'] = movies_df['Category'].str.split('|')
movies_df = movies_df.explode('Category')

merged_df = pd.merge(ratings_df, movies_df, on='MovieID', how='left')
merged_df = pd.merge(merged_df, users_df, on='UserID', how='left')
merged_df = merged_df.dropna(subset=['Year', 'Category'])

age_map = {1: 'Under 18', 18: '18-24', 25: '25-34', 35: '35-44', 45: '45-49', 50: '50-55', 56: '56+'}
merged_df['AgeGroup'] = merged_df['Age'].map(age_map)

occupation_map = {
    0: "Not specified or other", 1: "Academician", 2: "Artist", 3: "Admin/Office work",
    4: "Grad/Higher Ed student", 5: "Customer Service/Consultant", 6: "Doctor and Medical services",
    7: "Executive and Managerial", 8: "Farmer and Agriculture", 9: "Homemaker", 10: "K-12 Student",
    11: "Lawyer", 12: "Programmer", 13: "Retired", 14: "Sales and Marketing", 15: "Scientist",
    16: "Self-Employed", 17: "Engineer and Technician", 18: "Tradesman/Craftsman", 19: "Unemployed", 20: "Writer"
}
merged_df['OccupationName'] = merged_df['Occupation'].map(occupation_map)

df = merged_df[['Occupation', 'Age', 'Category']].copy()
df = df.dropna(subset=['Category'])
df['Occupation'] = df['Occupation'].astype('category')
df['Age'] = df['Age'].astype('int16')
df['Occupation'] = df['Occupation'].cat.codes.astype('int16')

le = LabelEncoder()
df['CategoryLabel'] = le.fit_transform(df['Category'])
X = df[['Occupation', 'Age']]
y = df['CategoryLabel']

gc.collect()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42, stratify=y)

model = RandomForestClassifier(
    n_estimators=50,
    max_depth=15,
    min_samples_split=10,
    n_jobs=-1,
    random_state=42,
    verbose=1
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

joblib.dump(model, 'rf_movie_category_model.pkl')
joblib.dump(le, 'category_label_encoder.pkl')
joblib.dump(le.classes_, 'category_label_encoder_classes.pkl')

print("✅ Model, encoder, and encoder classes saved successfully!")


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   10.9s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    2.4s
[Parallel(n_jobs=12)]: Done  50 out of  50 | elapsed:    3.8s finished
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.16      0.01      0.03    116418
           1       0.00      0.00      0.00     57898
           2       0.00      0.00      0.00     20336
           3       0.00      0.00      0.00     34413
           4       0.18      0.56      0.27    173204
           5       0.00      0.00      0.00     38257
           6       0.00      0.00      0.00      3509
           7       0.19      0.50      0.28    172665
           8       0.09      0.00      0.00     16534
           9       0.00      0.00      0.00      9130
          10       0.17      0.00      0.00     35095
          11       0.00      0.00      0.00     20701
          12       0.00      0.00      0.00     18651
          13       0.00      0.00      0.00     71846
          14       0.17      0.00      0.00     66901
          15       0.12      0.00      0.00     91449
          16       0.00      0.00      0.00     30002
          17       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


    Year  MovieCount
0   1919           3
1   1920           2
2   1921           1
3   1922           2
4   1923           3
..   ...         ...
76  1996         516
77  1997         508
78  1998         548
79  1999         414
80  2000         235

[81 rows x 2 columns]


    Year  MovieCount
0   1919           3
1   1920           2
2   1921           1
3   1922           2
4   1923           3
..   ...         ...
76  1996         516
77  1997         508
78  1998         548
79  1999         414
80  2000         235

[81 rows x 2 columns]


In [22]:
# Group by Category + Age + Occupation
cat_target = merged.groupby(['Category', 'Age', 'Occupation'])['Rating'].count().reset_index(name='Likes')

# For UI model: When user enters Category → Recommend Age+Occupation
# top_likers = cat_target.sort_values(by='Likes', ascending=False).groupby('Category').head(1)

# print("\nFor each Category → Most likely Age Group + Occupation:\n")
# print(top_likers[['Category', 'Age', 'Occupation', 'Likes']])
