In [None]:
# library import
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import warnings
import scipy

In [None]:
# Read the four CSV files used by this notebook: train, evaluate, test and unlabelled.
train, evaluate, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "A2-Data_files/TMDB_train.csv",
        "A2-Data_files/TMDB_evaluate.csv",
        "A2-Data_files/TMDB_test.csv",
    )
)
# The unlabelled file is read in a single pass (low_memory=False) rather than in chunks.
unlabelled = pd.read_csv("A2-Data_files/TMDB_unlabelled.csv", low_memory=False)

def process_year(year):
    """Normalise a raw ``release_year`` value to a four-digit integer year.

    The following input shapes are recognised, tried in this order:

    * a bare four-digit year, given as text (``'1999'``) or as a number
      (``1999`` / ``1999.0``);
    * text beginning with an ISO-style ``YYYY-MM-DD`` date;
    * a ``DD/MM/YYYY`` date.

    Parameters
    ----------
    year : object
        Raw cell value (string, number or missing value).

    Returns
    -------
    int or str
        The year as an ``int``, or the sentinel string ``'unknown'`` when the
        value is missing or matches none of the formats above.
    """
    if pd.isna(year):
        return 'unknown'

    # Work on stripped text so numeric cells and padded strings are handled
    # exactly like clean strings.
    text = str(year).strip()

    # Bare year; the optional ".0" covers years that were stored as floats.
    bare_year = re.fullmatch(r'(\d{4})(?:\.0+)?', text)
    if bare_year:
        return int(bare_year.group(1))

    # Prefix match on purpose: a trailing time component is ignored.
    iso_date = re.match(r'(\d{4})-\d{2}-\d{2}', text)
    if iso_date:
        return int(iso_date.group(1))

    try:
        parsed = pd.to_datetime(text, format='%d/%m/%Y')
    except (ValueError, TypeError, OverflowError):
        # Unparseable text keeps the best-effort contract, but unrelated
        # exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
        return 'unknown'

    # Empty / NaT-like strings parse to NaT, whose .year is NaN rather than a year.
    if pd.isna(parsed):
        return 'unknown'
    return parsed.year

# Overwrite the unlabelled frame's release_year column with process_year applied
# to every value.
# NOTE(review): only the unlabelled frame is normalised here — confirm that
# train / evaluate / test already hold clean year values.
unlabelled['release_year'] = unlabelled['release_year'].apply(process_year)


In [None]:
# A language may be absent from any single frame, so the encoder is fitted on the
# original_language values of all four frames (train, unlabelled, evaluate, test).
from sklearn.preprocessing import LabelEncoder

all_languages = pd.concat([
    train["original_language"],
    unlabelled["original_language"],
    evaluate["original_language"],
    test["original_language"],
])
# NOTE(review): missing values are not filled before fitting (a
# fillna('unknown') step is disabled here) — confirm original_language has no NaNs.
label_encoder = LabelEncoder().fit(all_languages)

# Replace the language values with their integer codes, in place, in every frame.
train['original_language'] = label_encoder.transform(train['original_language'])
unlabelled['original_language'] = label_encoder.transform(unlabelled['original_language'])
evaluate['original_language'] = label_encoder.transform(evaluate['original_language'])
test['original_language'] = label_encoder.transform(test['original_language'])

# Columns fed to the models; the order below fixes the column order of every X_* frame.
# NOTE(review): an earlier comment listed 'product_of_India' and 'product_of_Japan'
# as excluded, yet both are still present in this list — confirm which is intended.
important_features = [
    # general numeric / categorical attributes
    'release_year', 'runtime', 'budget', 'revenue', 'adult',
    'original_language', 'popularity',
    # genre indicator columns
    'genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Comedy',
    'genre_Crime', 'genre_Documentary', 'genre_Drama', 'genre_Family',
    'genre_Fantasy', 'genre_History', 'genre_Horror', 'genre_Music',
    'genre_Mystery', 'genre_Romance', 'genre_Science Fiction',
    'genre_TV Movie', 'genre_Thriller', 'genre_War', 'genre_Western',
    # production-country indicator columns
    'product_of_Canada', 'product_of_France', 'product_of_Germany',
    'product_of_India', 'product_of_Italy', 'product_of_Japan',
    'product_of_Spain', 'product_of_UK', 'product_of_USA',
    'product_of_other_countries',
    # vote volume
    'vote_count',
]

# Feature matrices for each frame.
X_train = train[important_features]
X_evaluate = evaluate[important_features]
X_test = test[important_features]
X_unlabelled = unlabelled[important_features]

# Targets are read from the two frames that are used with a rate_category column here.
y_train = train['rate_category']
y_evaluate = evaluate['rate_category']


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the stored sparse bag-of-words matrices (production companies and titles)
# for the train and evaluate splits.
X_train_production_companies = scipy.sparse.load_npz(
    'A2-Data_files/TMDB_text_features_bow/train_production_companies_bow.npz'
)
X_train_title = scipy.sparse.load_npz(
    'A2-Data_files/TMDB_text_features_bow/train_title_bow.npz'
)
X_evaluate_production_companies = scipy.sparse.load_npz(
    'A2-Data_files/TMDB_text_features_bow/eval_production_companies_bow.npz'
)
X_evaluate_title = scipy.sparse.load_npz(
    'A2-Data_files/TMDB_text_features_bow/eval_title_bow.npz'
)

# Densify the text matrices and append them column-wise to the tabular features.
# NOTE(review): the *_with_text matrices are built here, but the grid search and the
# evaluation below use X_train / X_evaluate (tabular features only) — confirm whether
# the text features were meant to be part of the model input.
X_train_with_text = np.hstack([
    X_train.to_numpy(),
    X_train_production_companies.toarray(),
    X_train_title.toarray(),
])
X_evaluate_with_text = np.hstack([
    X_evaluate.to_numpy(),
    X_evaluate_production_companies.toarray(),
    X_evaluate_title.toarray(),
])

# Random forest and its hyper-parameter grid (3 * 3 * 4 * 3 * 3 = 324 combinations).
model = RandomForestClassifier(random_state=90049)
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_features=['sqrt', 0.25, 0.5],
    max_depth=[20, 25, 30, None],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[1, 3, 5],
)

# Silence UserWarnings raised from sklearn modules for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn')

# Set up the exhaustive search: 5-fold cross-validation scored by accuracy.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=0,
)

# Run the grid search on the tabular training features.
grid_search.fit(X_train, y_train)

# Report the best parameter combination and its cross-validation score.
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-validation Score: {:.2f}".format(grid_search.best_score_))

# Predict on the evaluate split (not the test split) with the best estimator found.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_evaluate)

# Print per-class precision / recall / F1 against the evaluate labels.
print(classification_report(y_evaluate, y_pred))
