<a href="https://www.kaggle.com/code/ocanaydin/player-value-pred?scriptVersionId=113972860" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    paths = [os.path.join(folder, file_name) for file_name in files]
    for path in paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the player dataset from the Kaggle input directory and preview the first rows
players_csv = "../input/most-expensive-footballers-2021/players.csv"
df = pd.read_csv(players_csv)
df.head()

**CHECK FOR MISSING (NaN) VALUES**

In [None]:
"""It returns false so we can say that there is no nan data."""
# True if at least one cell anywhere in the frame is missing, False otherwise
df.isna().to_numpy().any()

In [None]:
"""We can drop Unnamed columns because it is meanless."""
# "Unnamed: 0" is treated as meaningless for this analysis, so it is removed.
# Rebinding df (instead of inplace=True) together with errors="ignore" keeps the
# cell re-runnable: executing it a second time no longer raises KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Print summary statistics of the market value column
market_value = df["Markey Value In Millions(£)"]
print(market_value.describe())

In [None]:
# Total market value per country, largest first, with Country restored as a regular column
value_col = "Markey Value In Millions(£)"
country_totals = df.groupby("Country")[[value_col]].sum()
df_country = country_totals.sort_values(by=value_col, ascending=False).reset_index()
df_country

**PREPROCESSING OF DATA**

In [None]:
"""Convert categorical features to numerical except player names."""
from sklearn.preprocessing import LabelEncoder

# Text columns that are replaced by integer codes (the player name is left as-is).
categorical_features = ["Position", "Country", "Club"]

# Fit one encoder per column and keep each of them, so the integer codes can be
# mapped back to the original labels later via
# label_encoders[column].inverse_transform(...). A single shared encoder only
# remembers the classes of the last column it was fitted on.
label_encoders = {}
for key in categorical_features:
    LE = LabelEncoder()
    df[key] = LE.fit_transform(df[key])
    label_encoders[key] = LE

In [None]:
# Preview the frame again now that the categorical columns hold integer codes
df.head()

**GET CORRELATION MATRIX FOR FEATURE SELECTION**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

TARGET_COLUMN = "Markey Value In Millions(£)"

# The player name is a text identifier, so it is left out of the correlation matrix.
# drop() returns a new frame, leaving df itself untouched.
df_copy = df.drop(columns=["Name"])

corr_mat = df_copy.corr()
k = len(df_copy.columns)

# Order the columns by their correlation with the target, strongest first.
cols = corr_mat.nlargest(k, TARGET_COLUMN)[TARGET_COLUMN].index

# Reuse the matrix computed above (only reordered) instead of recomputing the
# same coefficients with np.corrcoef.
cm = corr_mat.loc[cols, cols].to_numpy()

sns.set(font_scale=1)
# Draw on an explicit axes so the figure can be titled and reused.
fig, ax = plt.subplots(figsize=(20, 10))
heat_map = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt=".2f", annot_kws={"size": 12},
                       yticklabels=cols.values, xticklabels=cols.values, ax=ax)
ax.set_title("Feature correlations, ordered by correlation with market value")
plt.show()

**STANDARDIZATION OF DATA**

In [None]:
from sklearn.preprocessing import StandardScaler

TARGET_COLUMN = "Markey Value In Millions(£)"

# The player name is an identifier and the market value is the prediction target,
# so neither of them is standardized; keep them aside for later cells.
player_names = df["Name"]
player_values = df[TARGET_COLUMN]

# Build the feature frame without mutating df. The previous in-place drop made
# this cell fail with KeyError when it was executed a second time.
features = df.drop(columns=["Name", TARGET_COLUMN])

# Standardize every feature column.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split, so test rows influence the scaling statistics. Fit it on the training
# rows only if a scale-sensitive model is ever used here.
SS = StandardScaler()
scaled_x = SS.fit_transform(features)

# Back to a DataFrame; the original index is preserved so that X stays aligned
# with player_names and player_values.
X = pd.DataFrame(scaled_x, columns=features.columns, index=features.index)
X.head()

**SPLIT DATA INTO TRAIN AND TEST SETS**

In [None]:
from sklearn.model_selection import train_test_split

# Target vector: the market values set aside earlier
Y = player_values

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible
split = train_test_split(X, Y, test_size=0.25, random_state=43)
X_train, X_test, Y_train, Y_test = split

# Show the shapes of the four parts of the split
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

**FIT MODEL**

In [None]:
import xgboost

In [None]:
# Hyper-parameters of the gradient-boosted tree regressor
xgb_params = {
    "booster": "gbtree",
    "n_estimators": 50,
    "learning_rate": 0.08,
    "max_depth": 10,
    "min_child_weight": 1.5,
    "gamma": 0.3,
    "reg_lambda": 2.1,
}
xgb_regressor = xgboost.XGBRegressor(**xgb_params)

# Train on the training split; the fitted estimator is the cell's displayed output
xgb_regressor.fit(X_train, Y_train)

**PREDICTION**

In [None]:
# Predict market values for the held-out rows and reuse the test-set index so the
# predictions line up with Y_test
predicted_values = xgb_regressor.predict(X_test)
preds = pd.DataFrame({"preds": predicted_values}, index=Y_test.index)

In [None]:
# Side-by-side comparison: player name, actual market value and predicted value,
# all aligned on the test-set index
test_idx = Y_test.index
table = pd.concat([player_names.loc[test_idx], Y_test, preds], axis=1)

In [None]:
# Display the comparison table (player name, actual value, predicted value) for the test rows
table

In [None]:
from sklearn.metrics import mean_squared_error
# Mean squared error between the actual and the predicted market values of the test rows
actual = table["Markey Value In Millions(£)"]
predicted = table["preds"]
mse_error = mean_squared_error(actual, predicted)
print(f"MSE Error : {mse_error}")