In [None]:
import pandas as pd
import numpy as np
import altair as alt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Download the player data and prepare it for plotting in one chained pass.
url = 'https://drive.google.com/uc?export=download&id=1Mw9vW0hjTJwRWx0bDXiSpYsO3gKogaPz'
df = (
    pd.read_csv(url)
    # Identifier columns are not used anywhere in the plot below.
    .drop(columns=["individualId", "organizationName", "hashedEmail", "name"])
    .astype({"experience": "category", "gender": "category"})
    # Keep only rows where every column of interest is present.
    .dropna(subset=["experience", "gender", "age", "played_hours", "subscribe"])
    # Shift hours by 1 so that players with 0 hours can be drawn on the
    # log-scaled y axis used below (log of 0 is undefined).
    .assign(played_hours=lambda frame: frame["played_hours"] + 1)
)

# Encodings are named separately so the chart definition stays short.
x_axis = alt.X("age", title="Player Age")
y_axis = alt.Y("played_hours", title="Total Hours Played").scale(type="log")
point_color = alt.Color("subscribe:N", title="Subscribed")

scatter = (
    alt.Chart(df)
    .mark_circle(opacity=0.7)
    .encode(x=x_axis, y=y_axis, color=point_color)
    .properties(title="Figure 1. Age vs Played Hours by Subscription Status")
)
scatter

In [None]:
# Load data
# Re-reading the CSV yields the raw, unshifted played_hours (the plotting cell
# adds 1 to that column), but it also throws away the missing-value filtering
# done there, so that filtering is re-applied below for the modelling columns.
url = 'https://drive.google.com/uc?export=download&id=1Mw9vW0hjTJwRWx0bDXiSpYsO3gKogaPz'
df = pd.read_csv(url)

# Set the global NumPy seed; the split itself is made reproducible by
# random_state=1 below.
np.random.seed(1)

# Keep only rows where both predictors and the label are present.
# StandardScaler passes NaN through unchanged and KNeighborsClassifier does not
# accept NaN input, so missing values would otherwise break the fit.
# The filtered rows go into a separate frame so `df` stays the raw table.
model_df = df.dropna(subset=["age", "played_hours", "subscribe"])

# Select features and target
X = model_df[["age", "played_hours"]]
y = model_df["subscribe"]

# Train-test split: 80% train / 20% test, stratified on the label so both
# parts keep the same subscriber proportion.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, train_size=0.80, stratify=y, random_state=1
)

# Standardize both predictors, then classify with k-nearest neighbours.
# Scaling lives inside the pipeline so it is refit on each CV training fold.
preprocess = make_column_transformer((StandardScaler(), ["age", "played_hours"]))
knn = KNeighborsClassifier()
pipe = make_pipeline(preprocess, knn)

# Candidate values of k: the odd numbers 1, 3, ..., 49.
# "kneighborsclassifier" is the step name make_pipeline derives from the class.
param_grid = {"kneighborsclassifier__n_neighbors": range(1, 51, 2)}

# 5-fold cross-validated search over k on the training data only.
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5).fit(train_X, train_y)

# Report the selected k.
print("Best k:", grid.best_params_)

# Score the tuned model on the held-out test rows.
test_pred = grid.predict(test_X)

# Print each metric in turn, comparing true labels with the predictions.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    print(label, metric(test_y, test_pred))

# Cross-tabulate actual (rows) against predicted (columns) labels.
confusion = pd.crosstab(
    test_y, test_pred, rownames=["Actual"], colnames=["Predicted"]
)
print("\nConfusion Matrix:")
print(confusion)



Interpreting the Metrics and Confusion Matrix

Accuracy: 0.75

The model classifies 75% of the players in the test set correctly.

Precision: 0.7436 (When the model predicts a player will subscribe, how often is that correct?)

When KNN predicts “yes, they will subscribe”, it is right about 3 out of 4 times.


Recall: 1.0, i.e. 100% (Of the players who actually subscribed, what fraction did the model identify?)

Every actual subscriber was correctly predicted as “subscribe”.

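No false negatives. Note, however, that perfect recall combined with a precision (0.74) almost equal to the accuracy (0.75) means the model predicts “subscribe” for nearly every test player, so most non-subscribers are misclassified as subscribers (false positives).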
