In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error, accuracy_score

# Load the MovieLens 100K dataset
columns = ['userId', 'itemId', 'rating', 'timestamp']
ratings = pd.read_csv('../data/ml-100k/u.data', sep='\t', names=columns)

# Load user data
u_cols = ['userId', 'age', 'gender', 'ocupation', 'zip_code']
users = pd.read_csv('../data/ml-100k/u.user', sep='|', names=u_cols)

# Load movie data
i_cols = ['itemId', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
          'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
          'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('../data/ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')

print("Ratings shape:", ratings.shape)
print("Users shape:", users.shape)
print("Items shape:", items.shape)
print("Ratings head:\n", ratings.head())
print("Users head:\n", users.head())
print("Items head:\n", items.head())

Ratings shape: (100000, 4)
Users shape: (943, 5)
Items shape: (1682, 24)
Ratings head:
    userId  itemId  rating  timestamp
0     196     242       3  881250949
1     186     302       3  891717742
2      22     377       1  878887116
3     244      51       2  880606923
4     166     346       1  886397596
Users head:
    userId  age gender   ocupation zip_code
0       1   24      M  technician    85711
1       2   53      F       other    94043
2       3   23      M      writer    32067
3       4   24      M  technician    43537
4       5   33      F       other    15213
Items head:
    itemId        movie title release date  ...  Thriller War  Western
0       1   Toy Story (1995)  01-Jan-1995  ...         0   0        0
1       2   GoldenEye (1995)  01-Jan-1995  ...         1   0        0
2       3  Four Rooms (1995)  01-Jan-1995  ...         1   0        0
3       4  Get Shorty (1995)  01-Jan-1995  ...         0   0        0
4       5     Copycat (1995)  01-Jan-1995  ...         1

In [22]:
data = pd.merge(pd.merge(ratings, users, on='userId'), items, on='itemId')
print("Merged data shape:", data.shape)
print("Merged data head:\n", data.head())
print("Len of ratings:", len(ratings))

Merged data shape: (100000, 31)
Merged data head:
    userId  itemId  rating  timestamp  ...  Sci-Fi Thriller War Western
0     196     242       3  881250949  ...       0        0   0       0
1     186     302       3  891717742  ...       0        1   0       0
2      22     377       1  878887116  ...       0        0   0       0
3     244      51       2  880606923  ...       0        0   1       1
4     166     346       1  886397596  ...       0        0   0       0

[5 rows x 31 columns]
Len of ratings: 100000


In [23]:
# Feature of each user
# Average rating for each user
user_avg_rating = ratings.groupby('userId')['rating'].mean().rename('user_avg_rating')
data = data.merge(user_avg_rating, on='userId')

# Number of ratings for each user
user_rating_count = ratings.groupby('userId')['rating'].count().rename('user_rating_count')
data = data.merge(user_rating_count, on='userId')

# Feature of each item
# Average rating for each item
item_avg_rating = ratings.groupby('itemId')['rating'].mean().rename('item_avg_rating')
data = data.merge(item_avg_rating, on='itemId')

# Number of ratings for each item
item_rating_count = ratings.groupby('itemId')['rating'].count().rename('item_rating_count')
data = data.merge(item_rating_count, on='itemId')

# Convert categorical features to numerical
data['gender_numeric'] = data['gender'].apply(lambda x: 1 if x == 'M' else 0)

# Convert occupation to one-hot encoding
occupation_dummies = pd.get_dummies(data['ocupation'], prefix='occupation')
data = pd.concat([data, occupation_dummies], axis=1)

print("Data shape after feature engineering:", data.shape)
display(data.head())

Data shape after feature engineering: (100000, 57)


Unnamed: 0,userId,itemId,rating,timestamp,age,gender,ocupation,zip_code,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,user_avg_rating,user_rating_count,item_avg_rating,item_rating_count,gender_numeric,occupation_administrator,occupation_artist,occupation_doctor,occupation_educator,occupation_engineer,occupation_entertainment,occupation_executive,occupation_healthcare,occupation_homemaker,occupation_lawyer,occupation_librarian,occupation_marketing,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer
0,196,242,3,881250949,49,M,writer,55105,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3.615385,39,3.991453,117,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
1,186,302,3,891717742,39,F,executive,0,L.A. Confidential (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?L%2EA%2E+Conf...,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,3.413043,92,4.161616,297,0,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,22,377,1,878887116,25,M,writer,40206,Heavyweights (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Heavyweights%...,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3.351562,128,2.153846,13,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
3,244,51,2,880606923,28,M,technician,80525,Legends of the Fall (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Legends%20of%...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,3.651261,238,3.45679,81,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
4,166,346,1,886397596,47,M,educator,55113,Jackie Brown (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?imdb-title-11...,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,3.55,20,3.642857,126,1,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [24]:
feature_cols = [
    'age', 'gender_numeric', 
    'user_avg_rating', 'user_rating_count', 
    'item_avg_rating', 'item_rating_count'
]

genre_cols = ['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 
              'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

occupation_cols = list(occupation_dummies.columns)

feature_cols += genre_cols + occupation_cols

X = data[feature_cols]
y = data['rating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (80000, 45)
X_test shape: (20000, 45)
y_train shape: (80000,)
y_test shape: (20000,)


In [25]:
# Regression with Random Forest
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=20, min_samples_leaf=5)
rf_regressor.fit(X_train, y_train)

y_pred_reg = rf_regressor.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred_reg))
print(f"\nBaseline - Random Forest Regressor RMSE: {rmse:.4f}")

# Compare with a simple baseline model
global_mean_prediction = np.full(y_test.shape, y_train.mean())
rmse_global_mean = np.sqrt(mean_squared_error(y_test, global_mean_prediction))
print(f"Baseline - Global Mean RMSE: {rmse_global_mean:.4f}")


Baseline - Random Forest Regressor RMSE: 0.9159
Baseline - Global Mean RMSE: 1.1239


In [28]:
y_class = (y >= 4).astype(int)

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_class, test_size=0.2, random_state=42)
print("Distribution of new target in train set")
print(y_train_c.count())

Distribution of new target in train set
80000


In [30]:
from sklearn.metrics import classification_report
print("Đang huấn luyện mô hình Random Forest Classifier...")
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, max_depth=20, min_samples_leaf=5)
rf_classifier.fit(X_train_c, y_train_c)

# Dự đoán
y_pred_class_rf = rf_classifier.predict(X_test_c)

# Đánh giá
print("\n--- Kết quả của Random Forest Classifier ---")
print(f"Accuracy: {accuracy_score(y_test_c, y_pred_class_rf):.4f}")
print("\nClassification Report:")
print(classification_report(y_test_c, y_pred_class_rf, target_names=['Không thích', 'Thích']))

Đang huấn luyện mô hình Random Forest Classifier...

--- Kết quả của Random Forest Classifier ---
Accuracy: 0.7185

Classification Report:
              precision    recall  f1-score   support

 Không thích       0.72      0.62      0.66      9010
       Thích       0.72      0.80      0.76     10990

    accuracy                           0.72     20000
   macro avg       0.72      0.71      0.71     20000
weighted avg       0.72      0.72      0.72     20000



In [31]:
print("Đang huấn luyện mô hình Gaussian Naive Bayes...")
gnb = GaussianNB()
gnb.fit(X_train_c, y_train_c)

# Dự đoán
y_pred_class_gnb = gnb.predict(X_test_c)

# Đánh giá
print("\n--- Kết quả của Gaussian Naive Bayes ---")
print(f"Accuracy: {accuracy_score(y_test_c, y_pred_class_gnb):.4f}")
print("\nClassification Report:")
print(classification_report(y_test_c, y_pred_class_gnb, target_names=['Không thích', 'Thích']))

Đang huấn luyện mô hình Gaussian Naive Bayes...

--- Kết quả của Gaussian Naive Bayes ---
Accuracy: 0.6392

Classification Report:
              precision    recall  f1-score   support

 Không thích       0.60      0.60      0.60      9010
       Thích       0.67      0.67      0.67     10990

    accuracy                           0.64     20000
   macro avg       0.64      0.64      0.64     20000
weighted avg       0.64      0.64      0.64     20000

