In [None]:
#Importing the necessary packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the raw player table into the notebook-level `players` frame.
# NOTE(review): the path is absolute (filesystem root) — confirm it matches where the CSV actually lives.
players = pd.read_csv('/players_20.csv')

In [None]:
# Report the row count and the (rows, columns) shape of the loaded frame.
row_count = len(players)
frame_shape = players.shape
print(" Dataset Length :: ", row_count)
print(" Dataset Shape :: ", frame_shape)

In [None]:
# Print every column name on its own line.
for column_name in players.columns:
    print(column_name)

In [None]:
# Print pandas' per-column summary (dtype and non-null count) for the raw frame.
players.info()

In [None]:
# Preview the first five rows. The label is printed on its own line so the
# frame's header row stays aligned with the data rows beneath it.
print(" Dataset :: ", players.head(), sep="\n")

In [None]:
# Drop the columns listed in `useless_column`; the rest of the frame is kept.

useless_column = [
    'dob', 'sofifa_id', 'player_url', 'long_name', 'body_type',
    'real_face', 'nation_position', 'loaned_from',
    'nation_jersey_number', 'player_traits',
]
players = players.drop(columns=useless_column)

In [None]:
# Derive `main_position` as the first entry of the comma-separated
# `player_positions` string, then preview it next to a few identifying columns.

first_listed_position = players['player_positions'].str.split(',').str[0]
players['main_position'] = first_listed_position
selected_columns = ['short_name', 'club', 'main_position', 'overall']
print(players[selected_columns].head())

In [None]:
# Combine weight and height into one body-mass index column: kg / m^2.

height_m = players['height_cm'] / 100
players['bmi'] = players['weight_kg'] / height_m ** 2
selected_columns = ['short_name', 'club', 'main_position', 'overall', 'bmi']
print(players[selected_columns].head())

In [None]:
# Per-position rating columns that are converted to int64 (with nulls filled)
# in the next cell; shown here in their raw form first.

columns = [
    'ls', 'st', 'rs',
    'lw', 'lf', 'cf', 'rf', 'rw',
    'lam', 'cam', 'ram',
    'lm', 'lcm', 'cm', 'rcm', 'rm',
    'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
    'lb', 'lcb', 'cb', 'rcb', 'rb',
]
players[columns].head()

In [None]:
# Keep only the text before the first '+' in each rating string, then
# zero-fill missing entries and cast every rating column to int.
for position in columns:
    players[position] = players[position].str.split('+', n=1).str[0]
players[columns] = players[columns].fillna(0).astype(int)

players[columns]

In [None]:
# function to map object-type column to int64-type column

def column_mapping(column_name, frame=None):
    """Replace a column's values with integer codes, in place.

    Every distinct value in the column is assigned the position of its first
    appearance (0, 1, 2, ...), and the column is overwritten with those codes.

    Parameters
    ----------
    column_name : str
        Name of the column to encode.
    frame : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``players`` frame is
        used, which is the original behaviour.

    Returns
    -------
    dict
        The ``value -> code`` mapping that was applied, so the original labels
        can be recovered later.
    """
    target = players if frame is None else frame
    # Named `codes` rather than `map` so the builtin is not shadowed.
    codes = {value: code for code, value in enumerate(target[column_name].unique())}
    target[column_name] = target[column_name].map(codes)
    return codes

In [None]:
# Print the code -> position table, then integer-encode `main_position` in place.

unique_values = players['main_position'].unique()
for code, position in enumerate(unique_values):
    print(code, '\t', position)
column_mapping('main_position')
print(players[['short_name', 'main_position']].head())

In [None]:
# Collect every column whose dtype is still 'object' and list the names.

object_columns = players.select_dtypes(include='object').columns
for column_name in object_columns:
    print(column_name)

In [None]:
# Integer-encode each remaining object-typed column in place (their text
# values are replaced by codes from here on), then preview the result.
for text_column in object_columns:
    column_mapping(text_column)
print(players[object_columns].head())

In [None]:
# Fill missing values in the six headline stats with each column's own median.
# The mid-cell `players[columns]` expression was removed: only a cell's last
# expression is displayed, so it had no effect.

columns = ['dribbling', 'defending', 'physic', 'passing', 'shooting', 'pace']
players[columns] = players[columns].fillna(players[columns].median())
players[columns]

In [None]:
# Zero-fill whatever is still missing, then show the per-column null counts.
players = players.fillna(value=0)
players.isna().sum()

EXPLORATORY DATA ANALYSIS

In [None]:
# To identify the most valuable position

# Mean market value per main position, in millions of euros, sorted ascending.
players_grouped = (players.groupby('main_position')['value_eur'].mean() / 1e6).sort_values()
# `main_position` holds integer codes by this point; `unique_values` was
# captured before the encoding and lists the position names in code order, so
# it is used to put readable names back on the axis.
players_grouped.index = [unique_values[int(code)] for code in players_grouped.index]
players_grouped.plot(kind='barh', figsize=(12, 8))
plt.title("Average Player Value in Each Position")
plt.xlabel("Average Value, M euro")
plt.ylabel("Position")
# Call show(); a bare `plt.show` only echoes the function object.
plt.show()

In [None]:
# Age Dependence

# Mean market value per age, in millions of euros.
players_grouped_age = players.groupby('age')['value_eur'].mean() / 1e6
players_grouped_age.plot(grid=True, figsize=(12, 8))
plt.title('Average Player Value by Age')
plt.ylabel(' Average Value, M Euro ')
plt.xlabel('Age')
# Call show(); a bare `plt.show` only echoes the function object.
plt.show()

In [None]:
# Top 10 most expensive national teams (grouped by mean of top 25 players)

# Take up to 25 highest-valued players per nationality; nationalities with
# fewer than 25 players are averaged over however many they have.
players_country = players.groupby('nationality')['value_eur'].nlargest(25).reset_index(level=1, drop=True)
players_country = players_country.groupby('nationality').mean() / 1e6
players_country_top10 = players_country.sort_values().tail(10)
# NOTE(review): `nationality` appears to be integer-encoded by an earlier cell,
# so the axis shows codes rather than country names — confirm and decode if so.
players_country_top10.plot(kind='barh', figsize=(12, 8))
plt.title("Top 10 National Teams by Average Value of TOP25 Players")
plt.xlabel("Average value of TOP25 players, M euro")
# Call show(); a bare `plt.show` only echoes the function object.
plt.show()

In [None]:
# Overall Rating VS Value in Euros

import seaborn as sns
# Scatter of overall rating against market value; colour encodes age and
# marker size encodes BMI.
sns.relplot(
    data=players,
    x='overall',
    y='value_eur',
    hue='age',
    palette='viridis',
    size="bmi",
    sizes=(15, 200),
    aspect=2,
)
plt.title('Overall Rating vs Value in Euros', fontsize=20)
plt.xlabel('Overall Rating')
plt.ylabel('Value in Euros')
plt.show()

In [None]:
# Potential Rating VS Wage in Euros

# Scatter of potential rating against wage; colour encodes age and marker
# size encodes BMI.
sns.relplot(
    data=players,
    x='potential',
    y='wage_eur',
    hue='age',
    palette='viridis',
    size="bmi",
    sizes=(15, 200),
    aspect=2,
)
plt.title('Potential Rating vs Wage in Euros', fontsize=20)
plt.xlabel('Potential')
plt.ylabel('Wage in Euros')
plt.show()

In [None]:
# Count of Preferred Foot

fig, ax = plt.subplots(dpi=125)
sns.countplot(
    data=players,
    x='preferred_foot',
    hue='preferred_foot',
    palette='Blues',
    dodge=False,
    legend=False,
    ax=ax,
)
ax.set_xlabel('Preferred Foot Players')
ax.set_ylabel('Count')
ax.set_title('Count of Preferred Foot')
# value_counts() sorts by descending frequency, so this unpacking requires
# exactly two categories and treats the more frequent one as right-footed.
# NOTE(review): confirm that assumption holds for this dataset.
foot_counts = players['preferred_foot'].value_counts()
Right, Left = foot_counts
print('Left Preferred', Left)
print('Right Preferred', Right)
plt.show()

In [None]:
# Count of International Reputation of Top 100 Players

# Uses the first 100 rows of the frame.
# NOTE(review): "top 100" presumes the rows are ordered best-first — confirm.
fig, ax = plt.subplots(dpi=125)
sns.countplot(
    data=players.head(100),
    x='international_reputation',
    hue='international_reputation',
    palette='Blues',
    dodge=False,
    legend=False,
    ax=ax,
)
ax.set_xlabel('International Reputation')
ax.set_ylabel('Count')
ax.set_title('Count of International Reputation of Top 100 Players')
plt.show()

In [None]:
# Weight VS Pace

# Scatter with a fitted regression line of pace against body weight.
x = players['weight_kg']
y = players['pace']

fig, ax = plt.subplots(figsize=(12, 6), dpi=125)
sns.regplot(x=x, y=y, color='orange', ax=ax)
ax.set_title('Weight vs Pace', fontsize=20)
ax.set_xlabel('Weight')
ax.set_ylabel('Pace')
plt.show()

In [None]:
# BMI vs Pace

# Scatter with a fitted regression line of pace against the derived BMI column.
x = players['bmi']
y = players['pace']

fig, ax = plt.subplots(figsize=(12, 6), dpi=125)
sns.regplot(x=x, y=y, color='skyblue', ax=ax)
ax.set_title('BMI vs Pace', fontsize=20)
ax.set_xlabel('BMI')
ax.set_ylabel('Pace')
plt.show()

In [None]:
# Number of players per club among the first 20 rows of the frame.
# NOTE(review): `club` appears to be integer-encoded by an earlier cell, so the
# tick labels are codes rather than club names — confirm.
fig, ax = plt.subplots(figsize=(14, 7), dpi=150)
sns.countplot(
    data=players.head(20),
    x='club',
    hue='club',
    palette='CMRmap',
    dodge=False,
    legend=False,
    ax=ax,
)
ax.set_xlabel('Club')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
ax.set_title('Which Club has Max players in Top 20?', fontsize=20)
plt.show()

In [None]:
# Distribution of player wages in 20 bins. The axis label and title name the
# wage because `wage_eur` is the column being plotted; switch the column to
# `value_eur` instead if market value was the intended subject.
plt.hist(players['wage_eur'], bins=20, edgecolor='black')
plt.xlabel("Player Wage Ranges")
plt.ylabel("Frequency")
plt.title("Histogram of Player Wage Ranges")
# Call show(); a bare `plt.show` only echoes the function object.
plt.show()

In [None]:
# Columns to inspect; the order is preserved so the `.info()` listing below
# keeps its layout.
selected_columns = [
    # profile and market value
    "age", "height_cm", "weight_kg", "overall", "potential", "value_eur",
    "main_position", "preferred_foot", "international_reputation",
    "weak_foot", "skill_moves", "release_clause_eur",
    # headline stats
    "pace", "shooting", "passing", "dribbling", "defending", "physic",
    # gk_*
    "gk_diving", "gk_handling", "gk_kicking", "gk_reflexes", "gk_speed",
    "gk_positioning",
    # attacking_*
    "attacking_crossing", "attacking_finishing", "attacking_heading_accuracy",
    "attacking_short_passing", "attacking_volleys",
    # skill_*
    "skill_dribbling", "skill_curve", "skill_fk_accuracy", "skill_long_passing",
    "skill_ball_control",
    # movement_*
    "movement_acceleration", "movement_sprint_speed", "movement_agility",
    "movement_reactions", "movement_balance",
    # power_*
    "power_shot_power", "power_jumping", "power_stamina", "power_strength",
    "power_long_shots",
    # mentality_*
    "mentality_aggression", "mentality_interceptions", "mentality_positioning",
    "mentality_vision", "mentality_penalties", "mentality_composure",
    # defending_*
    "defending_marking", "defending_standing_tackle", "defending_sliding_tackle",
    # goalkeeping_*
    "goalkeeping_diving", "goalkeeping_handling", "goalkeeping_kicking",
    "goalkeeping_positioning", "goalkeeping_reflexes",
]
players[selected_columns].info()

    MACHINE LEARNING MODEL

In [None]:
from sklearn.model_selection import train_test_split

# Predictor columns for the value model; the target is `value_eur`.
features = [
    'wage_eur',
    'overall',
    'release_clause_eur',
    'skill_long_passing',
    'international_reputation',
]

# Drop rows with any missing predictor and keep the target aligned to the
# surviving index.
X = players[features].dropna()
y = players.loc[X.index, 'value_eur']

# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth-limited regression tree fitted on the training split, then used to
# predict the held-out rows.
tree_settings = dict(
    criterion="squared_error",
    max_depth=8,
    min_samples_split=10,
    random_state=42,
)
model = DecisionTreeRegressor(**tree_settings)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Error metrics on the held-out split, plus the fitted tree's size.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

report_lines = [
    f'Mean Absolute Error: {mae:.2f}',
    f'Root Mean Squared Error (RMSE): {rmse:.2f}',
    f'R-squared (R²) Score: {r2:.2f}',
    f'Tree Depth: {model.get_depth()}',
    f'Number of Leaves: {model.get_n_leaves()}',
]
print('\n'.join(report_lines))

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation on the training split. Running it on the held-out
# test split would refit the estimator on test rows and score tiny folds, so
# the held-out data would stop being an independent check.
cv_scores = cross_val_score(model, X_train, y_train, cv=10)
cv_scores