In [2]:
import pandas as pd
import pickle
from src.data_prep import load_data
from src.model import train_data, dev_data, test_data, drop_unused

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [3]:
# import data
dev = load_data("../data/raw/dev.csv")
test = load_data("../data/raw/test.csv")
train = load_data("../data/raw/train.csv")

In [4]:
level_seq_eng = pickle.load(open("level_seq_eng.pkl", "rb"))

In [5]:
# train data
train_df = train_data(level_seq_eng, train)
train_df

Unnamed: 0,user_id,max_level,user_success_rate,user_avg_duration,level_avg_reststep,total_used_help,user_help_rate,most_played_day,most_played_hour,label
2773,2774,134,0.632558,118.130233,0.189056,18,0.083721,1,0,0.0
2774,2775,116,0.738739,169.720721,0.258456,14,0.126126,1,1,0.0
2775,2776,123,0.637681,88.681159,0.186543,1,0.014493,1,10,1.0
2776,2777,164,0.506993,142.685315,0.124245,4,0.013986,2,19,0.0
2777,2778,122,0.672840,197.808642,0.299450,9,0.055556,1,12,1.0
...,...,...,...,...,...,...,...,...,...,...
10926,10927,207,0.505714,99.134286,0.166471,10,0.028571,2,17,1.0
10927,10928,48,1.000000,81.460000,0.471203,3,0.060000,1,22,1.0
10928,10929,122,0.469136,118.757202,0.117959,14,0.057613,2,0,0.0
10929,10930,39,0.948718,156.923077,0.380187,6,0.153846,2,0,1.0


In [6]:
# dev data
dev_df = dev_data(level_seq_eng, dev)
dev_df

Unnamed: 0,user_id,max_level,user_success_rate,user_avg_duration,level_avg_reststep,total_used_help,user_help_rate,most_played_day,most_played_hour,label
10931,10932,115,0.504505,185.342342,0.143857,20,0.090090,1,1,0.0
10932,10933,57,0.604651,110.255814,0.189069,4,0.046512,1,0,1.0
10933,10934,116,0.731959,110.381443,0.265445,7,0.072165,1,0,0.0
10934,10935,18,0.750000,90.550000,0.339879,2,0.100000,2,11,1.0
10935,10936,133,0.524017,145.362445,0.115141,22,0.096070,1,17,0.0
...,...,...,...,...,...,...,...,...,...,...
13584,13585,135,0.451613,109.826613,0.119008,12,0.048387,2,20,0.0
13585,13586,179,0.446945,125.144695,0.108392,16,0.051447,2,13,0.0
13586,13587,111,0.700000,109.750000,0.228793,0,0.000000,4,15,0.0
13587,13588,27,0.964286,75.642857,0.441202,0,0.000000,2,0,1.0


In [7]:
# test data
test_df = test_data(level_seq_eng, test)
test_df

Unnamed: 0,user_id,max_level,user_success_rate,user_avg_duration,level_avg_reststep,total_used_help,user_help_rate,most_played_day,most_played_hour
0,1,122,0.263291,98.379747,0.060526,8,0.020253,1,12
1,2,170,0.514768,85.189873,0.150546,20,0.084388,1,22
2,3,186,0.608696,96.917391,0.235325,14,0.060870,1,0
3,4,178,0.532710,123.682243,0.143748,8,0.074766,1,6
4,5,123,0.420168,123.756303,0.118816,20,0.084034,1,0
...,...,...,...,...,...,...,...,...,...
2768,2769,37,0.829268,80.341463,0.324696,3,0.073171,2,4
2769,2770,311,0.501217,101.158151,0.147572,18,0.043796,2,0
2770,2771,312,0.701961,95.400000,0.198157,15,0.058824,2,0
2771,2772,57,0.632184,119.908046,0.211336,1,0.011494,2,0


In [8]:
# control train, test, dev data
total_rows = train_df.shape[0] + dev_df.shape[0] + test_df.shape[0]
is_data_consistent = level_seq_eng.shape[0] == total_rows

print(f"✅ Data consistency check: {is_data_consistent}")
print(f"📊 Total rows in level_seq: {level_seq_eng.shape[0]}")
print(f"📊 Sum of train, dev, and test rows: {total_rows}")


✅ Data consistency check: True
📊 Total rows in level_seq: 13589
📊 Sum of train, dev, and test rows: 13589


In [9]:
# drop unuseful column for model
train_df = drop_unused(train_df, "user_id")
dev_df = drop_unused(dev_df, "user_id")
test_df = drop_unused(test_df, "user_id")

In [10]:
test_df

Unnamed: 0,max_level,user_success_rate,user_avg_duration,level_avg_reststep,total_used_help,user_help_rate,most_played_day,most_played_hour
0,122,0.263291,98.379747,0.060526,8,0.020253,1,12
1,170,0.514768,85.189873,0.150546,20,0.084388,1,22
2,186,0.608696,96.917391,0.235325,14,0.060870,1,0
3,178,0.532710,123.682243,0.143748,8,0.074766,1,6
4,123,0.420168,123.756303,0.118816,20,0.084034,1,0
...,...,...,...,...,...,...,...,...
2768,37,0.829268,80.341463,0.324696,3,0.073171,2,4
2769,311,0.501217,101.158151,0.147572,18,0.043796,2,0
2770,312,0.701961,95.400000,0.198157,15,0.058824,2,0
2771,57,0.632184,119.908046,0.211336,1,0.011494,2,0


In [11]:
# Define train label and features
y_train = train_df["label"]
X_train = train_df.drop(columns=["label"])

# Define test data
X_test = dev_df.drop(columns=["label"])
y_test = dev_df["label"]

In [12]:
# Create RandomForest model
rf = RandomForestClassifier()

# fit model
rf.fit(X_train, y_train)

In [13]:
# predict 
y_pred = rf.predict(X_test)

In [14]:
# calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.710308502633559


In [15]:
print(rf.feature_importances_)

[0.1649474  0.15590296 0.15438482 0.19930198 0.0692513  0.1162057
 0.042634   0.09737184]


In [17]:
print(rf._parameter_constraints)

{'n_estimators': [<sklearn.utils._param_validation.Interval object at 0x000001E1CEA37510>], 'bootstrap': ['boolean'], 'oob_score': ['boolean', <built-in function callable>], 'n_jobs': [<class 'numbers.Integral'>, None], 'random_state': ['random_state'], 'verbose': ['verbose'], 'warm_start': ['boolean'], 'max_samples': [None, <sklearn.utils._param_validation.Interval object at 0x000001E1CEB5A490>, <sklearn.utils._param_validation.Interval object at 0x000001E1CEB5A550>], 'max_depth': [<sklearn.utils._param_validation.Interval object at 0x000001E1CEB2DED0>, None], 'min_samples_split': [<sklearn.utils._param_validation.Interval object at 0x000001E1CE8B8BD0>, <sklearn.utils._param_validation.Interval object at 0x000001E1CE8B8C50>], 'min_samples_leaf': [<sklearn.utils._param_validation.Interval object at 0x000001E1CEB2EB50>, <sklearn.utils._param_validation.Interval object at 0x000001E1CE8B8B10>], 'min_weight_fraction_leaf': [<sklearn.utils._param_validation.Interval object at 0x000001E1CE82