In [1]:
import pandas as pd
import altair as alt 
import numpy as np
from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score, precision_score
alt.data_transformers.enable('vegafusion')
set_config(transform_output="pandas")
np.random.seed(2000)

# Predicting Players with what Kind of Experience Level would Contribute the Most Playing Data to our Game

## Introduction

- provide some relevant background information on the topic so that someone unfamiliar with it will be prepared to understand the rest of your report?

|Column Name|	Data Type|	Description|
|-----------|------------|------------------------------|
|experience	|Object| Cateorigical varaible; player’s experience level ('Pro', 'Veteran', 'Amateur', 'Regular', 'Beginner')|
|subscribe	|Boolean|Indicates if the player is subscribed to certain features or services.|
|hashedEmail|Object	|Unique hashed email for each player (used for identification)|
|played_hours|Float64|Total hours played by the player.	Contains values from 0 to 223.1|
|name	|Object	|Player’s name,	196 unique values|
|gender	|Object	|Player’s gender (e.g., Male, Female), has 7 unique values|
|age	|Int64|	Player’s age; range from 8 to 99|
|individualId	|Float64	|Unique identifier for each player, **no recorded values**|
|organizationName|	Float64	|Name of the organization the player is affiliated with, **no recorded values**|



- The Question we tried to answer with our model is which "kinds" of players are most likely to contribute a large amount of data to target those players in our recruiting efforts.

## Methods & Results

In [2]:
#import/load data

url = "https://docs.google.com/spreadsheets/d/"
sheetId = '1I3LWCl5-1ZsDIEe168lmFMKynXqypCO3UL7RogvcT34'
players = pd.read_csv(f"https://docs.google.com/spreadsheets/d/{sheetId}/export?format=csv")
players.head()

Unnamed: 0,experience,subscribe,hashedEmail,played_hours,name,gender,age,individualId,organizationName
0,Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6...,30.3,Morgan,Male,9,,
1,Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,3.8,Christian,Male,17,,
2,Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,0.0,Blake,Male,17,,
3,Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,0.7,Flora,Female,21,,
4,Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,0.1,Kylie,Male,21,,


In [3]:
#data wrangling 

# add additional age columns for feature engineering
players_more = players.assign(squared_age = players_wrangled['age']**2,
                              log_age = np.log(players_wrangled['age']),
                              sin_age = np.sin(players_wrangled['age']))

# select rows
players_wrangled = players_more[["played_hours", "age", "squared_age", "log_age", "sin_age", "experience"]]
players_wrangled.head()

NameError: name 'players_wrangled' is not defined

### Summary statistics 

In [None]:
players_wrangled['experience'].value_counts()

In [None]:
played_hours_range = (players_wrangled['played_hours'].min(), players_wrangled['played_hours'].max())
played_hours_range

In [None]:
age_range = (players_wrangled['age'].min(), players_wrangled['age'].max())
age_range

### Exploratory Data Analysis Visualization 

In [None]:
players_mean = players_wrangled.groupby("experience").mean().reset_index()
players_mean 

players_plot = alt.Chart(players_wrangled).mark_bar().encode(
    y=alt.Y("experience").title("experience"), 
    x=alt.X("mean(played_hours):Q").title("Mean Played Hours"),  # Aggregate played_hours to show the mean
    color=alt.Color("experience:N").title("Experience Level")  
)


In [None]:
players_plot = alt.Chart(players_wrangled).mark_circle().encode(
    x=alt.X("age").title("Age (years)"),
    y=alt.Y("played_hours").title("Played Hours"),
    color=alt.Color("experience").title("Experience Level")
)

players_plot

**Figure 1: Scatter plot showing the relationship between age and hours played color-coded to show the experience level of the players**

In [None]:
players_plot = alt.Chart(players_wrangled).mark_circle().encode(
    x=alt.X("experience").title("Experience"),
    y=alt.Y("played_hours").title("Played Hours"),
    color=alt.Color("experience").title("Experience Level")
)

players_plot

### Data Analysis

In [None]:
#players_train, players_test = train_test_split(players_wrangled, test_size=0.25, stratify = players_wrangled["gender"], random_state = 123)
players_train, players_test = train_test_split(players_wrangled_more, test_size=0.25, stratify = players_wrangled["experience"], random_state = 123)

#players_preprocessor = make_column_transformer(
#    (StandardScaler(), ["age", "played_hours"]),
#    verbose_feature_names_out=False
#)


#pre processor
players_preprocessor = make_column_transformer(
    (StandardScaler(), ["age", "age_squared", "log_age", "sin_age", "played_hours", "subscribe_encode"]),
)


players_scaled = players_preprocessor.fit_transform(players_train)
players_scaled


# players_plot_scaled = alt.Chart(players_scaled).mark_circle().encode(
#     x=alt.X("standardscaler__age").title("Age in Years (Standardized)").scale(zero = False),
#     y=alt.Y("standardscaler__played_hours").title("Played Hours(Standardized)").scale(zero = False),
#    # color=alt.Color("experience").title("Experience Level")
# )

# players_plot_scaled

In [None]:
X_train = players_train[["age", "age_squared", "log_age", "sin_age", "played_hours", "subscribe_encode"]]
y_train = players_train["experience"]

#knn = KNeighborsClassifier(n_neighbors=3)

#knn_pipeline = make_pipeline(players_preprocessor, knn)
#knn_pipeline.fit(X_train, y_train)

#knn_pipeline


# players_test["predicted"] = knn_pipeline.predict(players_test[["age", "played_hours"]])
# players_test.head()

In [None]:
#knn_pipeline.score(
#    players_test[["age", "played_hours"]],
#    players_test["experience"]
#)

## accuracy

In [None]:
knn = KNeighborsClassifier()
players_tune_pipe = make_pipeline(players_preprocessor, knn)



param_grid = {
    "kneighborsclassifier__n_neighbors": range(1, 18, 1),
}


In [None]:
players_tune_grid = GridSearchCV(
    estimator=players_tune_pipe,
    param_grid=param_grid,
    cv=5
)

players_tune_grid.fit(X_train, y_train)

accuracies_grid = pd.DataFrame(players_tune_grid.cv_results_)

In [None]:
accuracies_grid["sem_test_score"] = accuracies_grid["std_test_score"] / 10**(1/2)
accuracies_grid = (
    accuracies_grid[[
        "param_kneighborsclassifier__n_neighbors",
        "mean_test_score",
        "sem_test_score"
    ]]
    .rename(columns={"param_kneighborsclassifier__n_neighbors": "n_neighbors"})
)

accuracies_grid.reset_index()

In [None]:
accuracies_grid = pd.DataFrame(players_tune_grid.cv_results_)

In [None]:
accuracies_grid["sem_test_score"] = accuracies_grid["std_test_score"] / 10**(1/2)
accuracies_grid = (
    accuracies_grid[[
        "param_kneighborsclassifier__n_neighbors",
        "mean_test_score",
        "sem_test_score"
    ]]
    .rename(columns={"param_kneighborsclassifier__n_neighbors": "n_neighbors"})
)
accuracies_grid

In [None]:
accuracy_vs_k = alt.Chart(accuracies_grid).mark_line(point=True).encode(
    x=alt.X("n_neighbors").title("Neighbors"),
    y=alt.Y("mean_test_score")
        .scale(zero=False)
        .title("Accuracy estimate")
)

accuracy_vs_k

**Figure 3: Plot showing the relationship K Neighnors and accuracy value we would acheive in our model**

In [None]:
players_tune_grid.best_params_

In [None]:
# players_test = players_test.assign(
#     predicted = players_tune_pipe.predict(players_test[["age", "played_hours"]])
# ).reset_index()

players_test["predicted"] = players_tune_grid.predict(
    players_test[["age", "age_squared", "log_age", "sin_age", "played_hours", "subscribe_encode"]]
)

players_test.head()

In [None]:
players_prediction_accuracy = players_tune_grid.score(
    players_test[["age", "age_squared", "log_age", "sin_age", "played_hours", "subscribe_encode"]],
    players_test["experience"]
)

players_prediction_accuracy

In [None]:
players_mat = pd.crosstab(
    players_test["gender"],  # True labels
    players_test["predicted"],  # Predicted labels
)

players_mat

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

X_test = players_test[["age", "age_squared", "log_age", "sin_age", "played_hours", "subscribe_encode"]]
y_test = players_test["experience"]

players_tune_pipe.fit(X_test, y_test)


ConfusionMatrixDisplay.from_estimator(
    players_tune_pipe,  # We are directly passing the pipeline and let sklearn do the predictions for us
    X_test,
    y_test
)

**Figure 4: Confusion matrix showing the performance of the classification model, with true positive, false positive, true negative, and false negative values**

In [None]:
test_data = pd.DataFrame({
    "age": np.random.randint(0, 60, 5),  # Random ages between 18 and 60
    "age_squared": np.random.randint(0, 2000, 5),
    "log_age": np.random.randint(0, 5, 5),
    "sin_age": np.random.randint(-1, 1, 5),
    "played_hours": np.random.randint(0, 200, 5),  # Random played hours between 100 and 2000
    "subscribe_encode": np.random.choice([0, 1], 5),  # Random subscription status (0 or 1)
    
})

# Predict gender (since the model was trained to predict gender)
players_tune_grid.predict(test_data)

In [None]:
#test_data1 = pd.DataFrame({
#    "age": [23],  # Random ages between 18 and 60
#    "played_hours": [150],  # Random played hours between 100 and 2000
#    "subscribe_encode": [0],  # Random subscription status (0 or 1)
#})

# Predict gender (since the model was trained to predict gender)
#players_tune_grid.predict(test_data1)

**Discussion**

In [None]:
hi