In [2]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import itertools
from dataset import TMDBDataset

In [3]:
overview_cast_df = TMDBDataset(
    root = "./tmp",
    node_feature_method = "counter",
    node_feature_params = {'min_df': 0.1},
    node_feature_column_source = "keywords",
    add_additional_node_features = True,
    edge_weight_column_source = "cast",
    jaccard_distance_threshold = 0,
    graph_type = "homogenous"
)
overview_cast_df.y = np.log(overview_cast_df.y)

In [4]:
keywords_cast_df = TMDBDataset(
    root = "./tmp",
    node_feature_method = "counter",
    node_feature_params = {'min_df': 0.015},
    node_feature_column_source = "keywords",
    add_additional_node_features = True,
    edge_weight_column_source = "cast",
    jaccard_distance_threshold = 0,
    graph_type = "homogenous"
)
keywords_cast_df.y = np.log(keywords_cast_df.y)

In [5]:
df = TMDBDataset(
    root="./tmp",
    node_feature_method="counter",
    node_feature_params=0.015,
    node_feature_column_source="keywords",
    add_additional_node_features=True,
    edge_weight_column_source="cast",
    jaccard_distance_threshold=0,
    graph_type="heterogeneous",
)

In [6]:
datasets = {
    "overview": overview_cast_df,
    "keywords": keywords_cast_df,
}

In [38]:
def train_baseline_models(dataset): 
    results = []
    X_train, X_test, y_train, y_test = train_test_split(dataset.x.numpy(), dataset.y.numpy().ravel(), test_size=0.2, random_state=42)
    models = {
        "LinearRegression": LinearRegression,
        "RandomForestRegressor": RandomForestRegressor,
        "XGBRegressor": XGBRegressor,
        "MLPRegressor": MLPRegressor
    }

    params = {
        "LinearRegression": {"fit_intercept": [True, False]},
        "RandomForestRegressor": {"max_depth": [3, 5, 8, 10], "max_features": [0.5, 0.7, 0.9]},
        "XGBRegressor": {"max_depth": [3, 5, 8, 10]},
        "MLPRegressor": {"hidden_layer_sizes": [(100, 50), (100, 50, 25), (100,)]}
    }

    for name, model_class in models.items():
        model_params = params[name]
        keys, values = zip(*model_params.items())
        combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]
        for combination in combinations:
            model = model_class(**combination)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            mse_train = mean_squared_error(y_train, model.predict(X_train))
            mse_test = mean_squared_error(y_test, y_pred)
            results.append({"model": name, "mse_train": mse_train, "mse_test": mse_test, "params": combination})
    return pd.DataFrame(results)

In [39]:
overview_baseline = train_baseline_models(datasets["overview"])



In [None]:
overview_baseline.to_csv("results/overview_baseline.csv", index=False)

In [124]:
overview_baseline.sort_values(by="mse_test").head(10)

Unnamed: 0,model,mse_train,mse_test
14,XGBRegressor,0.48965,1.06616
11,RandomForestRegressor,0.468445,1.087571
12,RandomForestRegressor,0.458935,1.09561
13,RandomForestRegressor,0.436927,1.095678
9,RandomForestRegressor,0.594623,1.105106
10,RandomForestRegressor,0.574871,1.105459
0,LinearRegression,0.886038,1.106514
8,RandomForestRegressor,0.61296,1.106828
5,RandomForestRegressor,0.831309,1.117576
6,RandomForestRegressor,0.818969,1.140131


In [22]:
overview_baseline["node_features"] = "overview"

In [23]:
overview_baseline[overview_baseline["model"] == "MLPRegressor"].sort_values(by="mse_test")

Unnamed: 0,model,mse_train,mse_test,node_features
18,MLPRegressor,0.352542,1.352447,overview
20,MLPRegressor,1.209724,1.426175,overview
19,MLPRegressor,0.15244,1.65706,overview


In [None]:
keywords_baseline = train_baseline_models(datasets["keywords"])

In [129]:
keywords_baseline.to_csv("results/keywords_baseline.csv")

Unnamed: 0,model,mse_train,mse_test
14,XGBRegressor,0.48965,1.06616
13,RandomForestRegressor,0.441882,1.082193
11,RandomForestRegressor,0.472802,1.084344
8,RandomForestRegressor,0.611624,1.093165
0,LinearRegression,0.886038,1.106514
12,RandomForestRegressor,0.450131,1.108224
9,RandomForestRegressor,0.58726,1.113659
10,RandomForestRegressor,0.580117,1.118552
5,RandomForestRegressor,0.823548,1.129652
6,RandomForestRegressor,0.819434,1.133096


In [None]:
keywords_baseline.sort_values(by="mse_test").head(10)

In [24]:
keywords_baseline["node_features"] = "keywords"

In [12]:
keywords_baseline[keywords_baseline["model"] == "MLPRegressor"].sort_values(by="mse_test")

Unnamed: 0.1,Unnamed: 0,model,mse_train,mse_test
18,18,MLPRegressor,0.429362,1.316708
20,20,MLPRegressor,1.164775,1.38964
19,19,MLPRegressor,0.220019,1.516744


In [26]:
baseline_results = pd.concat([overview_baseline, keywords_baseline])

In [27]:
baseline_results.to_csv("results/baseline_results.csv", index=False)

In [8]:
baseline_results.groupby(["model", "node_features"]).agg({"mse_test": "min"})\
    .reset_index()\
    .merge(baseline_results, on=['model',"mse_test",  "node_features"])\
    .sort_values(by="mse_test")[["model", "node_features", "mse_train", "mse_test"]]

Unnamed: 0,model,node_features,mse_train,mse_test
6,XGBRegressor,keywords,0.48965,1.06616
7,XGBRegressor,overview,0.48965,1.06616
4,RandomForestRegressor,keywords,0.441882,1.082193
5,RandomForestRegressor,overview,0.468445,1.087571
0,LinearRegression,keywords,0.886038,1.106514
1,LinearRegression,overview,0.886038,1.106514
2,MLPRegressor,keywords,0.429362,1.316708
3,MLPRegressor,overview,0.352542,1.352447
