In [2]:
# library import
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import warnings
import scipy

In [3]:
# read the three data sets
train = pd.read_csv("A2-Data_files/TMDB_train.csv")
evaluate = pd.read_csv("A2-Data_files/TMDB_evaluate.csv")
test = pd.read_csv("A2-Data_files/TMDB_test.csv")
unlabelled = pd.read_csv("A2-Data_files/TMDB_unlabelled.csv", low_memory=False)

def process_year(year):
    if pd.isna(year):
        return 'unknown'
    if isinstance(year, str) and len(year) == 4 and year.isdigit():
        return int(year)
    match = re.match(r'(\d{4})-\d{2}-\d{2}', str(year))
    if match:
        return int(match.group(1))
    try:
        return pd.to_datetime(year, format='%d/%m/%Y').year
    except:
        return 'unknown'

unlabelled['release_year'] = unlabelled['release_year'].apply(process_year)


In [4]:
# since some languages may not exist across three data sets, concatenate all the original_language columns
from sklearn.preprocessing import LabelEncoder
all_languages = pd.concat([train["original_language"], unlabelled["original_language"], evaluate["original_language"], test["original_language"]])
# all_languages = all_languages.fillna('unknown')
label_encoder = LabelEncoder()
label_encoder.fit(all_languages)
train['original_language'] = label_encoder.transform(train['original_language'])
unlabelled['original_language'] = label_encoder.transform(unlabelled['original_language'])
evaluate['original_language'] = label_encoder.transform(evaluate['original_language'])
test['original_language'] = label_encoder.transform(test['original_language'])

# Excluded Features: ['product_of_India', 'product_of_Japan']
important_features = ['release_year', 'runtime', 'budget', 'revenue', 'adult', 
                      'original_language', 'popularity', 'genre_Action', 'genre_Adventure', 
                      'genre_Animation', 'genre_Comedy', 'genre_Crime', 'genre_Documentary', 
                      'genre_Drama', 'genre_Family', 'genre_Fantasy', 'genre_History', 
                      'genre_Horror', 'genre_Music', 'genre_Mystery', 'genre_Romance', 
                      'genre_Science Fiction', 'genre_TV Movie', 'genre_Thriller', 
                      'genre_War', 'genre_Western', 'product_of_Canada', 'product_of_France', 
                      'product_of_Germany', 'product_of_India', 'product_of_Italy', 
                      'product_of_Japan', 'product_of_Spain', 'product_of_UK', 'product_of_USA', 
                      'product_of_other_countries', 'vote_count']

X_train = train[important_features]
y_train = train['rate_category']
X_unlabelled = unlabelled[important_features]
X_evaluate = evaluate[important_features]
y_evaluate = evaluate['rate_category']
X_test = test[important_features]


In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [7]:
# record original RF behaviour
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_evaluate)
print(classification_report(y_pred, y_evaluate, digits=3))

              precision    recall  f1-score   support

           0      0.697     0.671     0.684      2271
           1      0.628     0.705     0.664      2518
           2      0.714     0.653     0.682      5593
           3      0.746     0.709     0.727      5709
           4      0.624     0.758     0.684      2299
           5      0.670     0.689     0.680      1610

    accuracy                          0.692     20000
   macro avg      0.680     0.697     0.687     20000
weighted avg      0.696     0.692     0.693     20000



In [33]:
# use GridSearchCV to do hyperparameter tuning for DT
model = DecisionTreeClassifier(random_state=90049)

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3, 4, 5]
}
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn')
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=0, scoring='accuracy')

grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-validation Score: {:.2f}".format(grid_search.best_score_))

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_evaluate)

print("Accuracy on Test Set: {:.2f}".format(accuracy_score(y_evaluate, y_pred)))
print(classification_report(y_evaluate, y_pred))


Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 6}
Best Cross-validation Score: 0.36
Accuracy on Test Set: 0.38
              precision    recall  f1-score   support

           0       0.31      0.47      0.37      2184
           1       0.40      0.14      0.20      2829
           2       0.35      0.53      0.42      5119
           3       0.45      0.42      0.44      5420
           4       0.43      0.17      0.24      2791
           5       0.35      0.39      0.37      1657

    accuracy                           0.38     20000
   macro avg       0.38      0.35      0.34     20000
weighted avg       0.39      0.38      0.36     20000



In [34]:
# original accuracy for DT
dt_original = DecisionTreeClassifier()
dt_original.fit(X_train, y_train)
y_pred = dt_original.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
print(accuracy)
y_pred = dt_original.predict(X_evaluate)
accuracy = accuracy_score(y_evaluate, y_pred)
print(accuracy)

0.99289
0.6553


In [35]:
# accuracy of DT after parameter adjustment
dt_improved = DecisionTreeClassifier(criterion='gini', max_depth=10, min_samples_leaf=1, min_samples_split=6)
dt_improved.fit(X_train, y_train)
y_pred = dt_improved.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
print(accuracy)
y_pred = dt_improved.predict(X_evaluate)
accuracy = accuracy_score(y_evaluate, y_pred)
print(accuracy)

0.39519
0.377


In [5]:
# use GridSearchCV to do hyperparameter tuning for RF
model = RandomForestClassifier(random_state=90049)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': [0.25, 0.5, 0.75, None],
    'max_depth': [15, 20, 25, None],
    'min_samples_split': [5, 10, 15, 20],
    'min_samples_leaf': [2, 3, 4, 5]
}

warnings.filterwarnings("ignore", category=UserWarning, module='sklearn')
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=1, n_jobs=8, scoring='accuracy')

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-validation Score: {:.2f}".format(grid_search.best_score_))

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_evaluate)

print(classification_report(y_evaluate, y_pred))

Fitting 5 folds for each of 768 candidates, totalling 3840 fits




Best Parameters: {'max_depth': 20, 'max_features': 0.25, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
Best Cross-validation Score: 0.40
              precision    recall  f1-score   support

           0       0.49      0.58      0.53      2184
           1       0.64      0.38      0.48      2829
           2       0.53      0.66      0.59      5119
           3       0.61      0.67      0.64      5420
           4       0.72      0.40      0.51      2791
           5       0.51      0.54      0.53      1657

    accuracy                           0.57     20000
   macro avg       0.58      0.54      0.55     20000
weighted avg       0.59      0.57      0.56     20000



In [9]:
# accuracy of RF after parameter adjustment
rf_improved = RandomForestClassifier(random_state=90049,max_depth=20,max_features=0.25,min_samples_leaf=2,min_samples_split=5,n_estimators=500,bootstrap=False)
rf_improved.fit(X_train, y_train)
y_pred = rf_improved.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
print(accuracy)
y_pred = rf_improved.predict(X_evaluate)
accuracy = accuracy_score(y_evaluate, y_pred)
print(accuracy)

0.79665
0.6017


max_depth=20,max_features='sqrt',min_samples_leaf=2,min_samples_split=5,n_estimators=200
0.66677
0.5383
max_depth=20,max_features=0.25,min_samples_leaf=2,min_samples_split=5,n_estimators=300
0.73448
0.56965
random_state=90049,max_depth=20,max_features=0.25,min_samples_leaf=2,min_samples_split=5,n_estimators=500,bootstrap=False
0.79665
0.6017
all default
0.99289
0.6913

In [47]:
# original behaviour of RF
rf_original = RandomForestClassifier(random_state=90049)
rf_original.fit(X_train, y_train)
y_pred = rf_original.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
print(accuracy)
y_pred = rf_original.predict(X_evaluate)
accuracy = accuracy_score(y_evaluate, y_pred)
print(accuracy)

0.99289
0.6913
