In [157]:
### Run this cell before continuing.
import altair as alt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Simplify working with large datasets in Altair
alt.data_transformers.enable('vegafusion')

# Output dataframes instead of arrays
set_config(transform_output="pandas")

In [158]:
url = "https://raw.githubusercontent.com/agallagh/DSCI-Project/refs/heads/main/players.csv"
players_data = pd.read_csv(url)
players_data

Unnamed: 0,experience,subscribe,hashedEmail,played_hours,name,gender,age,individualId,organizationName
0,Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6...,30.3,Morgan,Male,9,,
1,Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,3.8,Christian,Male,17,,
2,Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,0.0,Blake,Male,17,,
3,Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,0.7,Flora,Female,21,,
4,Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,0.1,Kylie,Male,21,,
...,...,...,...,...,...,...,...,...,...
191,Amateur,True,b6e9e593b9ec51c5e335457341c324c34a2239531e1890...,0.0,Bailey,Female,17,,
192,Veteran,False,71453e425f07d10da4fa2b349c83e73ccdf0fb3312f778...,0.3,Pascal,Male,22,,
193,Amateur,False,d572f391d452b76ea2d7e5e53a3d38bfd7499c7399db29...,0.0,Dylan,Prefer not to say,17,,
194,Amateur,False,f19e136ddde68f365afc860c725ccff54307dedd13968e...,2.3,Harlow,Male,17,,


In [159]:
# tidying the data by dropping the unecessary columns

tidy_players = players_data[["experience", "played_hours", "age"]]
tidy_players

# filtering data in age column for our demographic (age less than 60)

filtered_age_df = tidy_players[tidy_players["age"] < 60]
filtered_age_df

Unnamed: 0,experience,played_hours,age
0,Pro,30.3,9
1,Veteran,3.8,17
2,Veteran,0.0,17
3,Amateur,0.7,21
4,Regular,0.1,21
...,...,...,...
190,Amateur,0.0,20
191,Amateur,0.0,17
192,Veteran,0.3,22
193,Amateur,0.0,17


In [160]:
#filtering played hours to under 100 to remove outliers

filtered_hrs_df = filtered_age_df[filtered_age_df["played_hours"] < 100]

In [161]:
np.random.seed(5000) 

#Train model

# split the train/test data: 75% train, 25% test
player_train, player_test = train_test_split(filtered_hrs_df, train_size = 0.75, random_state = 5000)

# make X/y objects
X_train = player_train[['played_hours',	'age']]
y_train = player_train['experience']

# make X/y testing objects
X_test = player_test[['played_hours',	'age']]
y_test = player_test['experience']


#make a preprocessor for the hours played and age columns
player_preprocessor = make_column_transformer(
    (StandardScaler(), ['played_hours',	'age']),
     remainder='passthrough',
    verbose_feature_names_out = False
)

#fit the data to the 

#knn_spec = KNeighborsClassifier(n_neighbors = n, random_seed = 2000)

In [162]:
knn_spec = KNeighborsClassifier()

param_grid = {
    "n_neighbors": range(2, 16, 1),
}

knn_tune_grid = GridSearchCV(
    knn_spec, param_grid, return_train_score=True, n_jobs=-1, cv=5
)

knn_model_grid = knn_tune_grid.fit(X_train, y_train)

accuracies_grid = pd.DataFrame(knn_model_grid.cv_results_)

cross_val_plot = alt.Chart(accuracies_grid).mark_line(point=True).encode(
    x=alt.X("param_n_neighbors").title("Number of neighbours").scale(zero=False),
    y=alt.Y("mean_test_score").title("Mean test score").scale(zero=False)
)

cross_val_plot

In [163]:
#explain which is best, confirm with 
knn_tune_grid.best_params_['n_neighbors']

5

In [164]:
#testing accuracy
model_accuracy = knn_model_grid.score(X_test, y_test)
model_accuracy

0.3125

In [165]:
np.random.seed(3131)

# # #something else
knn_best = KNeighborsClassifier(knn_tune_grid.best_params_['n_neighbors'])

player_preprocessor = make_column_transformer(
    (StandardScaler(), ['played_hours',	'age']),
     remainder='passthrough',
    verbose_feature_names_out = False
)

players_test_fit = make_pipeline(player_preprocessor, knn_best).fit(X_test, y_test)


player_test_predictions = player_test.assign(
    predicted=players_test_fit.predict(X_test)
)

player_test_predictions

Unnamed: 0,experience,played_hours,age,predicted
34,Beginner,0.6,26,Amateur
98,Amateur,0.0,17,Amateur
2,Veteran,0.0,17,Amateur
25,Regular,0.6,28,Amateur
184,Pro,1.7,17,Pro
148,Veteran,0.0,18,Amateur
103,Beginner,2.0,27,Amateur
1,Veteran,3.8,17,Pro
182,Pro,0.2,17,Amateur
94,Beginner,0.8,22,Amateur


In [200]:
predicted_plot = alt.Chart(player_test_predictions).mark_point().encode(
    x= alt.X('age').title("Age of Player").scale(zero=False),
    y= alt.Y('played_hours').title("Number of Hours Played").scale(zero=False),
    color=alt.Color('predicted:N').title("Predicted Experience Level")
)

predicted_plot

In [235]:
np.random.seed(5000)

num_rows=20
var1 = np.random.randint(1,31, num_rows)
var2 = np.random.randint(1,31, num_rows)

df=pd.DataFrame({
    "played_hours":var1,
    "age":var2
})

df2 = pd.DataFrame(df)

#filtered for higher played hours (above 5hrs)
filtered_points = df2[df2['played_hours'] > 5]
filtered_points

Unnamed: 0,played_hours,age
0,28,29
4,30,13
6,15,13
9,28,17
10,22,27
11,27,21
12,20,2
14,18,8
15,15,1
16,11,8


In [236]:
#fitting new points to the model
player_prediction_all = players_test_fit.predict(filtered_points)

points_predicted = pd.DataFrame(player_prediction_all)

#adding played hours column back into dataframe

points_predicted_cols = points_predicted.assign(played_hours = filtered_points['played_hours'])

#dropping NaN points and renaming columns
high_hours_predictions = points_predicted_cols.dropna().rename(columns = {0:"predicted_experience"})

#finding the mean playing time of each predicted experience level 
high_hours_predictions.groupby('predicted_experience').agg("mean")

high_hours_predictions.sort_values('predicted_experience')

Unnamed: 0,predicted_experience,played_hours
0,Amateur,28.0
4,Amateur,30.0
10,Amateur,22.0
6,Pro,15.0
9,Pro,28.0


In [237]:
high_hours_predictions.groupby('predicted_experience').agg("mean")

Unnamed: 0_level_0,played_hours
predicted_experience,Unnamed: 1_level_1
Amateur,26.666667
Pro,21.5


In [199]:
test_plot = alt.Chart(player_test).mark_point().encode(
    x= alt.X('age').title("Age of Player").scale(zero=False),
    y= alt.Y('played_hours').title("Number of Hours Played").scale(zero=False),
    color=alt.Color('experience:N').title("Experience Level")
)

test_plot