# Imports 

In [190]:
import altair as alt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.model_selection import GridSearchCV, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_validate

np.random.seed(31)

# import the K-NN regression model
from sklearn.neighbors import KNeighborsRegressor

# Exploratory Data Analysis and Visualization

In [191]:
#the loaded dataset below: 

players = pd.read_csv("data/players.csv")
players.head()

Unnamed: 0,experience,subscribe,hashedEmail,played_hours,name,gender,age,individualId,organizationName
0,Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6...,30.3,Morgan,Male,9,,
1,Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,3.8,Christian,Male,17,,
2,Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,0.0,Blake,Male,17,,
3,Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,0.7,Flora,Female,21,,
4,Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,0.1,Kylie,Male,21,,


In [192]:
# data wrangling 

# change experience labels to numerical values 
players["experience_encode"] = players["experience"].replace({
    "Beginner" : 1,
    "Regular" : 2, 
    "Amateur" : 3, 
    "Veteran" : 4, 
    "Pro": 5,
})

# select three rows
players_wrangled = players[["experience", "experience_encode","played_hours"]]
players_wrangled.head() 

  players["experience_encode"] = players["experience"].replace({


Unnamed: 0,experience,experience_encode,played_hours
0,Pro,5,30.3
1,Veteran,4,3.8
2,Veteran,4,0.0
3,Amateur,3,0.7
4,Regular,2,0.1


## Summary of the data set

In [193]:
# summary of data set
players_wrangled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   experience         196 non-null    object 
 1   experience_encode  196 non-null    int64  
 2   played_hours       196 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 4.7+ KB


## do we need to say 

In [194]:
players_descriptive = players_wrangled.groupby("experience")["played_hours"].agg(["mean", "max", "min"]).reset_index()
players_descriptive

Unnamed: 0,experience,mean,max,min
0,Amateur,6.01746,150.0,0.0
1,Beginner,1.248571,23.7,0.0
2,Pro,2.6,30.3,0.0
3,Regular,18.208333,223.1,0.0
4,Veteran,0.647917,12.5,0.0


## what is here

In [195]:
exploratory_plot = alt.Chart(players_descriptive, title="Mean Played Hours vs. Experience").mark_bar().encode(
    x = alt.X("experience").title("Experience Level"), 
    y = alt.Y("mean").title("Mean Played Hours"), 
    color = alt.Color("experience:N").title("Experience Level"), 
)

exploratory_plot

## figure cap

In [196]:
# exploratory data visualization - SCATTER PLOT

exploratory_plot = alt.Chart(players_wrangled, title="Played Hours vs. Experience").mark_circle().encode(
    x = alt.X("experience").title("Experience Level (1-5)"), 
    y = alt.Y("played_hours").title("Played Hours"), 
    color = alt.Color("experience:N").title("Experience Level"), 
)

exploratory_plot

## figure cap

# Data Analysis 

## KNN regression

In [197]:
players_training, players_testing = train_test_split(
    players_wrangled,
    test_size=0.25,
    random_state=2000, 
)

X_train = players_training[["experience_encode"]]  
y_train = players_training["played_hours"] 

X_test = players_testing[["experience_encode"]] 
y_test = players_testing["played_hours"]

In [198]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler


players_preprocessor = make_column_transformer(StandardScaler(), ["experience_encode"])

players_pipe = make_pipeline(
    players_preprocessor, 
    KNeighborsRegressor()
)

players_cv = pd.DataFrame(
    cross_validate(
        estimator = players_pipe,
        cv = 5, 
        X = X_train, 
        y = y_train,
        scoring="neg_root_mean_squared_error",
        return_train_score=True,
    )
)

players_cv

TypeError: 'StandardScaler' object is not iterable

In [199]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
import pandas as pd

# Correct usage of make_column_transformer
players_preprocessor = make_column_transformer(
    (StandardScaler(), ["experience_encode"])  # Apply StandardScaler to 'experience_encode' column
)

# Create the pipeline
players_pipe = make_pipeline(
    players_preprocessor, 
    KNeighborsRegressor(),
)

# Cross-validation (if you want to see the output of cross-validation)
players_cv = pd.DataFrame(
    cross_validate(
        estimator=players_pipe,
        X=X_train,  # Make sure X is defined as your feature matrix
        y=y_train,  # Make sure y is defined as your target variable
        cv=5,  # Specify the number of cross-validation folds
        return_train_score=True
    )
)
players_cv

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.003824,0.002726,-0.05227,-0.029854
1,0.003066,0.001994,-0.041736,-0.038456
2,0.003064,0.002028,-6.665354,-0.037698
3,0.002984,0.001959,-0.062968,-0.055593
4,0.00297,0.001953,0.110312,-0.019997


In [200]:
param_grid = {"n_neighbors": range(1,100,1)}   # why is it kneighborsregressor__n_neighbors this time 

players_tuned = GridSearchCV(estimator = KNeighborsRegressor(), 
                              param_grid = param_grid,
                              cv = 5, 
                              n_jobs = -1, 
                              scoring="neg_root_mean_squared_error")

players_results = pd.DataFrame(players_tuned.fit(X_train, y_train).cv_results_) 

players_results

  _data = np.array(data, dtype=dtype, copy=copy,


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001364,0.000157,0.001392,0.000135,1,{'n_neighbors': 1},-9.486622,-32.560605,-2.617843,-42.940861,-11.552653,-19.831717,15.283576,1
1,0.001186,0.000014,0.001286,0.000127,2,{'n_neighbors': 2},-9.401742,-32.547271,-15.800868,-44.388767,-14.025774,-23.232884,13.157777,99
2,0.001171,0.000009,0.001210,0.000009,3,{'n_neighbors': 3},-9.445703,-32.545865,-10.708363,-43.341240,-11.425606,-21.493355,13.873065,97
3,0.001208,0.000071,0.001584,0.000708,4,{'n_neighbors': 4},-9.440520,-32.551276,-8.235070,-42.976134,-10.723147,-20.785229,14.271163,72
4,0.001178,0.000021,0.001231,0.000008,5,{'n_neighbors': 5},-9.358608,-32.581639,-6.855589,-42.851246,-10.565428,-20.442502,14.522562,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,0.001033,0.000007,0.001479,0.000012,95,{'n_neighbors': 95},-9.573079,-31.927716,-6.489324,-42.137262,-11.047636,-20.235003,14.166556,8
95,0.001032,0.000004,0.001485,0.000005,96,{'n_neighbors': 96},-9.883836,-31.911813,-6.427837,-42.141384,-11.174493,-20.307873,14.114448,14
96,0.001030,0.000002,0.001486,0.000008,97,{'n_neighbors': 97},-9.858665,-31.914775,-6.370851,-42.141252,-11.466693,-20.350447,14.092555,21
97,0.001032,0.000004,0.001489,0.000012,98,{'n_neighbors': 98},-9.836866,-31.919345,-6.307389,-42.147790,-11.454509,-20.333180,14.112708,17


In [201]:
best_param = players_tuned.best_params_
best_param

players_min = best_param
players_min

{'n_neighbors': 1}

In [202]:
best_model = -players_tuned.best_score_  #negative!!
best_model 

players_best_RMSPE = best_model
players_best_RMSPE

np.float64(19.831717009461762)

In [203]:
players_prediction = players_tuned.predict(X_test)
players_summary = mean_squared_error(
    y_true= y_test, 
    y_pred= players_prediction)**(1/2)

players_summary  # RMSPE on test data

np.float64(38.13550758513003)

In [204]:
players_preds = players_training.assign(
    predictions= players_tuned.predict(X_train)
)

players_plot = alt.Chart(players_preds).mark_circle(opacity = 0.4).encode(
    x = alt.X("experience").title("experience level"), 
    y = alt.Y("played_hours").title("playing time")
) + alt.Chart(players_preds).mark_line(color='black').encode(
    x='experience',
    y='predictions'
)


players_plot

In [205]:
predictions_beginner= players_tuned.predict(1)



ValueError: Expected 2D array, got scalar array instead:
array=1.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [206]:
# Example new observation for a 'Beginner'
new_observation = np.array([[1, 0.5]])  # Adjust based on your feature columns
# Or use a DataFrame
new_observation_df = pd.DataFrame([[1]], columns=['experience'])  # Replace 'some_other_feature' with actual feature names

# Predict using the trained model
predictions_beginner = players_tuned.predict(new_observation) 



ValueError: X has 2 features, but KNeighborsRegressor is expecting 1 features as input.