In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


In [3]:
# Load the CSV into `df`; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="data/allHorizonData_cut.csv")

# data setup

Base feature set
- features: `gameLength` and `uc` (one-hot encoded)
- target: `c5`

In [4]:

# Keep only the columns the base model needs and discard incomplete rows.
# Dropping NaNs across these three columns also removes rows whose `c5`
# (the target) is missing, so no separate filter on `c5` is required.
df_filtered = df.loc[:, ["gameLength", "uc", "c5"]].dropna()

# Features: every column except the target, with gameLength and uc
# expanded into one indicator column per observed level.
X = pd.get_dummies(df_filtered.drop(columns=["c5"]), columns=["gameLength", "uc"])
# Target: the `c5` column.
y = df_filtered.loc[:, "c5"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Preview the first five rows of the base feature matrix.
print(X.head(n=5))

   gameLength_5  gameLength_10   uc_1   uc_2   uc_3
0          True          False  False  False   True
1         False           True  False  False   True
2         False           True  False   True  False
3         False           True  False   True  False
4         False           True  False   True  False


In [6]:
# Preview the first five target values.
print(y.head(n=5))

0    2
1    1
2    1
3    1
4    2
Name: c5, dtype: int64


Extended feature set
- adds `r1`–`r4` (kept numeric) and `c1`–`c4` (one-hot encoded) to the base features
- target is still `c5`

In [7]:
# Columns for the extended model: the base features, r1-r4, c1-c4,
# plus the target `c5` (listed last).
extended_features = [
    'gameLength', 'uc',
    'r1', 'r2', 'r3', 'r4',
    'c1', 'c2', 'c3', 'c4',
    'c5',
]
# Rows with a missing value in any of these columns are discarded.
df_extended = df.loc[:, extended_features].dropna()

# One-hot encode the categorical columns; r1-r4 are not listed in `columns`
# and therefore pass through unchanged.
X_ext = pd.get_dummies(
    df_extended.drop(columns=["c5"]),
    columns=["gameLength", "uc", "c1", "c2", "c3", "c4"],
)
# NOTE: this rebinds `y` (and, on the next statement, `y_train` / `y_test`),
# which an earlier cell also assigns from the base data set.
y = df_extended.loc[:, "c5"]

# Same 80/20 split and seed as the base feature set.
X_train_ext, X_test_ext, y_train, y_test = train_test_split(
    X_ext,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Preview the first five rows of the extended feature matrix.
print(X_ext.head(n=5))

   r1  r2  r3  r4  gameLength_5  gameLength_10   uc_1   uc_2   uc_3   c1_1  \
0  66  80  29  75          True          False  False  False   True  False   
1  69  50  51  64         False           True  False  False   True  False   
2  31  43  26  36         False           True  False   True  False  False   
3  65  77  52  73         False           True  False   True  False   True   
4  70  19  43  41         False           True  False   True  False  False   

    c1_2   c2_1   c2_2   c3_1   c3_2   c4_1   c4_2  
0   True  False   True   True  False  False   True  
1   True  False   True   True  False  False   True  
2   True   True  False  False   True   True  False  
3  False  False   True  False   True   True  False  
4   True   True  False  False   True   True  False  


# vanilla logistic regression

In [9]:
# Logistic regression with scikit-learn's default settings, fitted on
# `X_train` (the base feature split). `fit` returns the estimator itself.
log_model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and print the per-class precision/recall/F1 table.
y_pred_log = log_model.predict(X_test)
report_log = classification_report(y_true=y_test, y_pred=y_pred_log)
print(report_log)

              precision    recall  f1-score   support

           1       0.53      0.53      0.53      1941
           2       0.52      0.53      0.53      1899

    accuracy                           0.53      3840
   macro avg       0.53      0.53      0.53      3840
weighted avg       0.53      0.53      0.53      3840



In [10]:
# Logistic regression on the extended features, with the solver's iteration
# cap raised to 1000.
log_model_ext = LogisticRegression(max_iter=1000).fit(X_train_ext, y_train)

# Predict the held-out rows and print the per-class precision/recall/F1 table.
y_pred_ext = log_model_ext.predict(X_test_ext)
report_ext = classification_report(y_true=y_test, y_pred=y_pred_ext)
print(report_ext)

              precision    recall  f1-score   support

           1       0.55      0.49      0.52      1941
           2       0.53      0.58      0.55      1899

    accuracy                           0.54      3840
   macro avg       0.54      0.54      0.54      3840
weighted avg       0.54      0.54      0.54      3840



# decision tree

In [11]:
from sklearn.tree import DecisionTreeClassifier

In [12]:
# Depth-limited decision tree (at most 5 levels, fixed seed) fitted on
# `X_train` (the base feature split).
dtree_model = DecisionTreeClassifier(
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)

# Predict the held-out rows and print the per-class precision/recall/F1 table.
y_pred_dtree_base = dtree_model.predict(X_test)
report_dtree_base = classification_report(y_true=y_test, y_pred=y_pred_dtree_base)
print(report_dtree_base)

              precision    recall  f1-score   support

           1       0.52      0.64      0.58      1941
           2       0.52      0.40      0.45      1899

    accuracy                           0.52      3840
   macro avg       0.52      0.52      0.51      3840
weighted avg       0.52      0.52      0.51      3840



In [13]:

# Depth-limited decision tree (at most 5 levels, fixed seed) fitted on
# the extended feature split.
dtree_ext_model = DecisionTreeClassifier(
    max_depth=5,
    random_state=42,
).fit(X_train_ext, y_train)

# Predict the held-out rows and print the per-class precision/recall/F1 table.
y_pred_dtree_ext = dtree_ext_model.predict(X_test_ext)
report_dtree_ext = classification_report(y_true=y_test, y_pred=y_pred_dtree_ext)
print(report_dtree_ext)

              precision    recall  f1-score   support

           1       0.72      0.68      0.70      1941
           2       0.69      0.73      0.71      1899

    accuracy                           0.70      3840
   macro avg       0.70      0.70      0.70      3840
weighted avg       0.70      0.70      0.70      3840



# random forest

In [14]:
from sklearn.ensemble import RandomForestClassifier


In [15]:

# Random forest of 100 trees, each at most 5 levels deep, with a fixed seed;
# fitted on `X_train` (the base feature split).
rf_base_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)

# Predict the held-out rows and print the per-class precision/recall/F1 table.
y_pred_rf_base = rf_base_model.predict(X_test)
report_rf_base = classification_report(y_true=y_test, y_pred=y_pred_rf_base)
print(report_rf_base)

              precision    recall  f1-score   support

           1       0.54      0.40      0.46      1941
           2       0.52      0.65      0.58      1899

    accuracy                           0.53      3840
   macro avg       0.53      0.53      0.52      3840
weighted avg       0.53      0.53      0.52      3840



In [16]:

# Random forest of 100 trees, each at most 5 levels deep, with a fixed seed;
# fitted on the extended feature split.
rf_ext_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
).fit(X_train_ext, y_train)

# Predict the held-out rows and print the per-class precision/recall/F1 table.
y_pred_rf_ext = rf_ext_model.predict(X_test_ext)
report_rf_ext = classification_report(y_true=y_test, y_pred=y_pred_rf_ext)
print(report_rf_ext)

              precision    recall  f1-score   support

           1       0.73      0.71      0.72      1941
           2       0.71      0.74      0.72      1899

    accuracy                           0.72      3840
   macro avg       0.72      0.72      0.72      3840
weighted avg       0.72      0.72      0.72      3840



# MLP

In [20]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

In [30]:
# Multi-layer perceptron with a single hidden layer of 32 units, capped at
# 300 training iterations, with a fixed seed; fitted on the extended feature split.
mlp_etx = MLPClassifier(
    hidden_layer_sizes=(32,),
    max_iter=300,
    random_state=42,
).fit(X_train_ext, y_train)

# Predict the held-out rows and print the per-class precision/recall/F1 table.
y_pred_mlp_ext = mlp_etx.predict(X_test_ext)
report_mlp_ext = classification_report(y_true=y_test, y_pred=y_pred_mlp_ext)
print(report_mlp_ext)

              precision    recall  f1-score   support

           1       0.82      0.76      0.79      1941
           2       0.77      0.83      0.80      1899

    accuracy                           0.79      3840
   macro avg       0.80      0.79      0.79      3840
weighted avg       0.80      0.79      0.79      3840



# Split groups by game length (horizon 1 vs. horizon 6)

In [17]:
# Row masks over df_extended: gameLength == 5 is labelled "h1" and
# gameLength == 10 is labelled "h6" throughout this notebook.
is_h1 = df_extended["gameLength"] == 5
is_h6 = df_extended["gameLength"] == 10

# Features and labels for each group.
# The labels are read directly from df_extended rather than from the global
# `y`: `y` is assigned by both the base and the extended data-setup cells, so
# its contents depend on which of them ran last. Taking `c5` from df_extended
# guarantees the labels share an index with the masks and with X_ext (which
# is built from df_extended), regardless of cell execution order.
X_h1 = X_ext.loc[is_h1]
y_h1 = df_extended.loc[is_h1, "c5"]

X_h6 = X_ext.loc[is_h6]
y_h6 = df_extended.loc[is_h6, "c5"]

# Independent 80/20 splits per group, using the same seed as the pooled splits.
X_train_h1, X_test_h1, y_train_h1, y_test_h1 = train_test_split(X_h1, y_h1, test_size=0.2, random_state=42)
X_train_h6, X_test_h6, y_train_h6, y_test_h6 = train_test_split(X_h6, y_h6, test_size=0.2, random_state=42)


## RF

In [27]:
# Random forest restricted to the horizon-1 rows (100 trees, depth <= 5, fixed seed).
rf_h1 = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
).fit(X_train_h1, y_train_h1)

# Evaluate on the horizon-1 hold-out rows and print the classification table.
report_rf_h1 = classification_report(
    y_true=y_test_h1,
    y_pred=rf_h1.predict(X_test_h1),
)
print(report_rf_h1)

              precision    recall  f1-score   support

           1       0.77      0.85      0.80       946
           2       0.84      0.75      0.79       974

    accuracy                           0.80      1920
   macro avg       0.80      0.80      0.80      1920
weighted avg       0.80      0.80      0.80      1920



In [26]:
# Random forest restricted to the horizon-6 rows (100 trees, depth <= 5, fixed seed).
rf_h6 = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
).fit(X_train_h6, y_train_h6)

# Evaluate on the horizon-6 hold-out rows and print the classification table.
report_rf_h6 = classification_report(
    y_true=y_test_h6,
    y_pred=rf_h6.predict(X_test_h6),
)
print(report_rf_h6)

              precision    recall  f1-score   support

           1       0.70      0.67      0.69       922
           2       0.71      0.73      0.72       998

    accuracy                           0.71      1920
   macro avg       0.71      0.70      0.70      1920
weighted avg       0.71      0.71      0.71      1920



## MLP

In [28]:
# MLP restricted to the horizon-1 rows: one hidden layer of 32 units,
# at most 300 training iterations, fixed seed.
mlp_h1 = MLPClassifier(
    hidden_layer_sizes=(32,),
    max_iter=300,
    random_state=42,
).fit(X_train_h1, y_train_h1)

# Evaluate on the horizon-1 hold-out rows and print the classification table.
mlp_report_h1 = classification_report(
    y_true=y_test_h1,
    y_pred=mlp_h1.predict(X_test_h1),
)
print(mlp_report_h1)

              precision    recall  f1-score   support

           1       0.82      0.87      0.85       946
           2       0.87      0.82      0.84       974

    accuracy                           0.84      1920
   macro avg       0.85      0.84      0.84      1920
weighted avg       0.85      0.84      0.84      1920



In [29]:
# MLP restricted to the horizon-6 rows: one hidden layer of 32 units,
# at most 300 training iterations, fixed seed.
mlp_h6 = MLPClassifier(
    hidden_layer_sizes=(32,),
    max_iter=300,
    random_state=42,
).fit(X_train_h6, y_train_h6)

# Evaluate on the horizon-6 hold-out rows and print the classification table.
mlp_report_h6 = classification_report(
    y_true=y_test_h6,
    y_pred=mlp_h6.predict(X_test_h6),
)
print(mlp_report_h6)

              precision    recall  f1-score   support

           1       0.74      0.63      0.68       922
           2       0.70      0.79      0.74       998

    accuracy                           0.72      1920
   macro avg       0.72      0.71      0.71      1920
weighted avg       0.72      0.72      0.71      1920

