In [207]:
#download the packages to use
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

In [208]:
#load the trainings and test data

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [209]:
train.shape


(6895, 70)

In [210]:
# Identify features with missing values
features_with_missing_values = train.columns[train.isnull().any()].tolist()

# Display the count of missing values for each feature
missing_values_count = train[features_with_missing_values].isnull().sum()

# Print the results
print("Features with missing values:")
print(features_with_missing_values)

print("\nCount of missing values for each feature:")
print(missing_values_count)

Features with missing values:
['value_eur', 'club_loaned_from', 'club_joined', 'nation_jersey_number', 'release_clause_eur', 'player_tags', 'player_traits', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'goalkeeping_speed']

Count of missing values for each feature:
value_eur                  2
club_loaned_from        6544
club_joined              351
nation_jersey_number    6482
release_clause_eur       353
player_tags             6124
player_traits           2589
pace                     631
shooting                 631
passing                  631
dribbling                631
defending                631
physic                   631
goalkeeping_speed       6264
dtype: int64


In [211]:
test.shape

(767, 69)

In [212]:
# Identify features with missing values
features_with_missing_values = test.columns[test.isnull().any()].tolist()

# Display the count of missing values for each feature
missing_values_count = test[features_with_missing_values].isnull().sum()

# Print the results
print("Features with missing values:")
print(features_with_missing_values)

print("\nCount of missing values for each feature:")
print(missing_values_count)

Features with missing values:
['value_eur', 'club_loaned_from', 'club_joined', 'nation_jersey_number', 'release_clause_eur', 'player_tags', 'player_traits', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'goalkeeping_speed']

Count of missing values for each feature:
value_eur                 3
club_loaned_from        722
club_joined              45
nation_jersey_number    716
release_clause_eur       48
player_tags             682
player_traits           285
pace                     70
shooting                 70
passing                  70
dribbling                70
defending                70
physic                   70
goalkeeping_speed       697
dtype: int64


In [213]:
#create a new column called test_train to identify after which data belongs to test and train data
test['test_train']=0
train['test_train']=1


In [214]:
# concat train data with test data to have more observations, clean it all together and improve accuracy
all_data = pd.concat([train,test])

In [215]:
pd.set_option('display.max_columns', None)

# 1.Data Preprocessing and Quality Assurance: 

In [216]:
all_data.head()

Unnamed: 0,id,short_name,overall,potential,value_eur,wage_eur,birthday_date,height_cm,weight_kg,club_name,league_name,league_level,club_jersey_number,club_loaned_from,club_joined,club_contract_valid_until,nationality_name,nation_jersey_number,preferred_foot,weak_foot,skill_moves,international_reputation,work_rate,body_type,real_face,release_clause_eur,player_tags,player_traits,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed,position,test_train
0,216302,E. García,71,71,1400000.0,10000,1989-12-28,176,73,Club Atlético de San Luis,Mexican Liga MX,1,29,,2021-07-12,2026,Mexico,,Right,5,3,1,Medium/High,Normal (170-185),No,2400000.0,,Early Crosser,70.0,52.0,60.0,70.0,67.0,66.0,64,40,56,65,41,68,64,58,58,68,71,70,87,61,86,67,77,86,58,66,56,79,53,48,58,66,65,66,65,14,11,12,12,12,,LB,1
1,237867,D. Cancola,65,71,1000000.0,2000,1996-10-23,183,73,Ross County FC,Scottish Premiership,1,4,,2021-07-23,2022,Austria,,Right,3,2,1,Medium/Medium,Lean (170-185),No,2000000.0,,,65.0,38.0,58.0,60.0,63.0,67.0,50,31,59,64,31,57,36,41,64,65,64,65,57,62,67,49,62,71,66,42,66,66,48,62,46,59,65,61,58,10,13,7,6,11,,LDM,1
2,253472,E. Kahl,65,77,1600000.0,2000,2001-09-27,178,69,Aarhus GF,Danish Superliga,1,19,,2021-07-24,2026,Sweden,,Left,3,3,1,High/Medium,Normal (170-185),No,2300000.0,,Speed Dribbler (AI),79.0,35.0,58.0,66.0,59.0,64.0,59,30,53,60,37,65,56,35,55,64,82,77,71,62,75,50,65,67,61,28,69,62,58,60,31,65,60,58,59,10,10,8,10,11,,LWB,1
3,223994,S. Mugoša,72,72,2300000.0,5000,1992-02-26,188,81,Incheon United FC,Korean K League 1,1,9,,2018-02-08,2023,Montenegro,,Right,3,3,1,High/Medium,Lean (185+),No,2900000.0,,Finesse Shot,64.0,74.0,51.0,68.0,24.0,76.0,32,78,75,59,70,65,48,53,45,74,61,66,64,67,53,71,71,75,87,69,52,18,76,60,75,75,16,22,19,16,15,13,8,9,,LS,1
4,251635,A. Țigănașu,65,65,525000.0,3000,1990-06-12,179,74,FC Botoşani,Romanian Liga I,1,30,,2019-07-01,2022,Romania,,Left,2,2,1,Medium/High,Normal (170-185),No,709000.0,,Solid Player,74.0,53.0,59.0,53.0,61.0,69.0,64,38,49,63,46,43,53,58,55,56,71,76,70,60,71,72,80,78,70,67,53,63,60,52,52,58,64,61,58,12,5,11,12,15,,LB,1


In [217]:
all_data.shape

(7662, 71)

In [218]:
all_data.isna().sum()

id                            0
short_name                    0
overall                       0
potential                     0
value_eur                     5
                           ... 
goalkeeping_positioning       0
goalkeeping_reflexes          0
goalkeeping_speed          6961
position                    767
test_train                    0
Length: 71, dtype: int64

In [219]:
# Identify features with missing values
features_with_missing_values = all_data.columns[all_data.isnull().any()].tolist()

# Display the count of missing values for each feature
missing_values_count = all_data[features_with_missing_values].isnull().sum()

# Print the results
print("Features with missing values:")
print(features_with_missing_values)

print("\nCount of missing values for each feature:")
print(missing_values_count)

Features with missing values:
['value_eur', 'club_loaned_from', 'club_joined', 'nation_jersey_number', 'release_clause_eur', 'player_tags', 'player_traits', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'goalkeeping_speed', 'position']

Count of missing values for each feature:
value_eur                  5
club_loaned_from        7266
club_joined              396
nation_jersey_number    7198
release_clause_eur       401
player_tags             6806
player_traits           2874
pace                     701
shooting                 701
passing                  701
dribbling                701
defending                701
physic                   701
goalkeeping_speed       6961
position                 767
dtype: int64


In [220]:
#eliminate all with over 80% missing data
missing_percentages = all_data.isnull().sum() / len(all_data) * 100
threshold = 80
columns_to_drop = missing_percentages[missing_percentages >= threshold].index
all_data = all_data.drop(columns_to_drop, axis=1)
all_data.isna().sum()

id                           0
short_name                   0
overall                      0
potential                    0
value_eur                    5
                          ... 
goalkeeping_kicking          0
goalkeeping_positioning      0
goalkeeping_reflexes         0
position                   767
test_train                   0
Length: 67, dtype: int64

In [221]:
#Also drop the ones I don't think apport any value
columns1 = ['short_name', 'club_name', 'league_name', 'nationality_name']
all_data = all_data.drop(columns1, axis=1)

In [222]:
# Identify features with missing values
features_with_missing_values = all_data.columns[all_data.isnull().any()].tolist()

# Display the count of missing values for each feature
missing_values_count = all_data[features_with_missing_values].isnull().sum()

# Print the results
print("Features with missing values:")
print(features_with_missing_values)

print("\nCount of missing values for each feature:")
print(missing_values_count)

Features with missing values:
['value_eur', 'club_joined', 'release_clause_eur', 'player_traits', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'position']

Count of missing values for each feature:
value_eur                5
club_joined            396
release_clause_eur     401
player_traits         2874
pace                   701
shooting               701
passing                701
dribbling              701
defending              701
physic                 701
position               767
dtype: int64


In [223]:
all_data.shape

(7662, 63)

In [224]:
all_data[features_with_missing_values].head()


Unnamed: 0,value_eur,club_joined,release_clause_eur,player_traits,pace,shooting,passing,dribbling,defending,physic,position
0,1400000.0,2021-07-12,2400000.0,Early Crosser,70.0,52.0,60.0,70.0,67.0,66.0,LB
1,1000000.0,2021-07-23,2000000.0,,65.0,38.0,58.0,60.0,63.0,67.0,LDM
2,1600000.0,2021-07-24,2300000.0,Speed Dribbler (AI),79.0,35.0,58.0,66.0,59.0,64.0,LWB
3,2300000.0,2018-02-08,2900000.0,Finesse Shot,64.0,74.0,51.0,68.0,24.0,76.0,LS
4,525000.0,2019-07-01,709000.0,Solid Player,74.0,53.0,59.0,53.0,61.0,69.0,LB


In [225]:
unique_values_count = all_data['player_traits'].nunique()
print(unique_values_count)

825


In [226]:
all_data.describe()

Unnamed: 0,id,overall,potential,value_eur,wage_eur,height_cm,weight_kg,league_level,club_jersey_number,club_contract_valid_until,weak_foot,skill_moves,international_reputation,release_clause_eur,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,test_train
count,7662.0,7662.0,7662.0,7657.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7261.0,6961.0,6961.0,6961.0,6961.0,6961.0,6961.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0,7662.0
mean,223616.285435,68.8426,72.048029,4331453.0,12030.253198,181.484338,75.552467,1.361655,16.52532,2022.763508,2.996346,2.451188,1.147481,8187938.0,68.568596,54.30125,60.085045,64.577791,55.609395,68.346215,52.883973,48.222527,55.069956,62.348865,44.775516,58.376273,50.381754,45.283738,56.815583,61.784652,65.905116,66.03811,65.287001,65.020752,64.850692,60.519838,67.255416,67.909554,68.229313,49.838684,59.815714,51.50496,53.130645,56.939441,49.895589,61.901592,51.349517,52.304098,49.800052,15.903289,15.672409,15.54568,15.813234,16.011877,0.899896
std,26710.529813,5.951663,6.074939,10450190.0,24385.866372,6.860952,7.042419,0.761018,15.487419,1.20703,0.685301,0.780693,0.470143,20446560.0,11.708722,14.354508,9.574193,9.348321,15.624753,8.509289,17.850296,19.569581,16.905886,13.281468,17.879807,17.897941,18.364759,17.835926,14.14999,15.398347,14.501858,14.401695,14.472439,7.930761,14.235664,13.076355,12.171742,15.096912,11.786433,19.300841,16.27264,20.123523,19.313219,13.559154,15.668827,10.886174,19.65836,20.918691,20.638862,17.466642,16.767293,16.319634,17.191808,17.877237,0.300159
min,41.0,48.0,53.0,25000.0,500.0,156.0,53.0,1.0,1.0,2021.0,1.0,1.0,1.0,39000.0,28.0,18.0,25.0,29.0,16.0,31.0,7.0,2.0,5.0,7.0,4.0,4.0,7.0,6.0,9.0,10.0,15.0,15.0,19.0,32.0,19.0,20.0,27.0,14.0,25.0,4.0,11.0,6.0,2.0,10.0,8.0,12.0,4.0,5.0,6.0,2.0,2.0,2.0,2.0,2.0,0.0
25%,207642.25,65.0,68.0,725000.0,2000.0,177.0,70.0,1.0,6.0,2022.0,3.0,2.0,1.0,1100000.0,62.0,43.0,54.0,59.0,43.0,63.0,42.0,33.0,48.0,58.0,31.0,53.0,37.0,32.0,50.0,58.0,58.0,59.0,57.0,60.0,56.0,51.0,60.0,63.0,61.0,35.0,51.0,33.0,42.0,48.0,40.0,56.0,35.0,34.0,31.0,8.0,8.0,8.0,8.0,8.0,1.0
50%,228152.5,68.0,72.0,1400000.0,4000.0,182.0,75.0,1.0,13.0,2023.0,3.0,2.0,1.0,2300000.0,70.0,57.0,60.0,65.0,61.0,69.0,58.0,52.0,58.0,65.0,46.0,63.0,53.0,44.0,60.0,65.0,68.0,69.0,67.0,65.0,67.0,62.0,69.0,71.0,69.0,55.0,63.0,59.0,59.0,59.0,50.0,63.0,58.0,61.0,58.0,11.0,11.0,11.0,11.0,11.0,1.0
75%,242513.5,72.0,76.0,3000000.0,12000.0,186.0,80.0,1.0,22.0,2024.0,3.0,3.0,1.0,5400000.0,77.0,65.0,67.0,71.0,67.0,74.0,66.0,64.0,67.0,70.0,59.0,70.0,64.0,60.0,66.0,71.0,76.0,76.0,75.0,70.0,75.0,70.0,76.0,77.0,76.0,65.0,72.0,67.0,67.0,67.0,62.0,69.0,66.0,68.0,66.0,14.0,14.0,14.0,14.0,14.0,1.0
max,264481.0,93.0,95.0,194000000.0,350000.0,203.0,103.0,5.0,99.0,2031.0,5.0,5.0,5.0,373500000.0,97.0,94.0,93.0,95.0,91.0,90.0,94.0,95.0,93.0,94.0,90.0,96.0,94.0,94.0,93.0,96.0,97.0,97.0,96.0,94.0,95.0,94.0,95.0,97.0,95.0,94.0,95.0,91.0,96.0,95.0,93.0,96.0,92.0,93.0,92.0,91.0,92.0,93.0,92.0,90.0,1.0


In [227]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7662 entries, 0 to 766
Data columns (total 63 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           7662 non-null   int64  
 1   overall                      7662 non-null   int64  
 2   potential                    7662 non-null   int64  
 3   value_eur                    7657 non-null   float64
 4   wage_eur                     7662 non-null   int64  
 5   birthday_date                7662 non-null   object 
 6   height_cm                    7662 non-null   int64  
 7   weight_kg                    7662 non-null   int64  
 8   league_level                 7662 non-null   int64  
 9   club_jersey_number           7662 non-null   int64  
 10  club_joined                  7266 non-null   object 
 11  club_contract_valid_until    7662 non-null   int64  
 12  preferred_foot               7662 non-null   object 
 13  weak_foot          

In [228]:
# Calculate the frequency of each trait
trait_frequency = all_data['player_traits'].value_counts(normalize=True)

# Map the traits to their frequencies
all_data['player_traits_encoded'] = all_data['player_traits'].map(trait_frequency)

# Print the result
print(all_data[['player_traits', 'player_traits_encoded']].head())

         player_traits  player_traits_encoded
0        Early Crosser               0.021303
1                  NaN                    NaN
2  Speed Dribbler (AI)               0.064536
3         Finesse Shot               0.013993
4         Solid Player               0.020677


In [229]:
#encoding dates

# Convert 'birthdate' to a datetime object
all_data['club_joined'] = pd.to_datetime(all_data['club_joined'])
all_data['birthday_date'] = pd.to_datetime(all_data['birthday_date'])

# Define the reference date (e.g., the current date)
reference_date = datetime.now()

# Calculate the age by subtracting the birthdate from the reference date
all_data['yearinclub'] = (reference_date - all_data['club_joined']).dt.days // 365
all_data['age'] = (reference_date - all_data['birthday_date']).dt.days // 365

all_data['years_until_contract_expires'] =  all_data.club_contract_valid_until - datetime.today().year

# Print the result
print(all_data[['yearinclub', 'age', 'years_until_contract_expires']].head())


   yearinclub  age  years_until_contract_expires
0         2.0   33                             3
1         2.0   27                            -1
2         2.0   22                             3
3         5.0   31                             0
4         4.0   33                            -1


In [230]:
def extract_bodytype(bodytype_string):
    return bodytype_string.split()[0]

# Apply the function to the 'bodytype' column
all_data['body_type'] = all_data['body_type'].apply(extract_bodytype)

print(all_data['body_type'].unique())



['Normal' 'Lean' 'Stocky' 'Unique']


In [231]:
#work_rate
print(all_data['work_rate'].unique())


['Medium/High' 'Medium/Medium' 'High/Medium' 'High/Low' 'High/High'
 'Medium/Low' 'Low/Medium' 'Low/High' 'Low/Low']


In [232]:
encoder = LabelEncoder()
all_data['body_type_encoded'] = encoder.fit_transform(all_data['body_type'])
all_data['work_rate_encoded'] = encoder.fit_transform(all_data['work_rate'])
all_data['preferred_foot_encoded'] = encoder.fit_transform(all_data['preferred_foot'])
all_data['real_face_encoded'] = encoder.fit_transform(all_data['real_face'])

In [233]:
# Identify features with missing values
features_with_missing_values = all_data.columns[all_data.isnull().any()].tolist()

# Display the count of missing values for each feature
missing_values_count = all_data[features_with_missing_values].isnull().sum()

# Print the results
print("Features with missing values:")
print(features_with_missing_values)

print("\nCount of missing values for each feature:")
print(missing_values_count)

Features with missing values:
['value_eur', 'club_joined', 'release_clause_eur', 'player_traits', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'position', 'player_traits_encoded', 'yearinclub']

Count of missing values for each feature:
value_eur                   5
club_joined               396
release_clause_eur        401
player_traits            2874
pace                      701
shooting                  701
passing                   701
dribbling                 701
defending                 701
physic                    701
position                  767
player_traits_encoded    2874
yearinclub                396
dtype: int64


In [234]:
#we use the KNNimputer to imputate the missing values in the numeric variables
def impute(data,columns):
    # Create KNNImputer object with k=8
    imputer = KNNImputer(n_neighbors=8)
    # Perform KNN imputation
    data[columns] = imputer.fit_transform(data[columns])
    return data[columns]

imputer_columns=['value_eur', 'release_clause_eur', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'player_traits_encoded', 'yearinclub']

impute(all_data,imputer_columns)
all_data.isna().sum()

id                              0
overall                         0
potential                       0
value_eur                       0
wage_eur                        0
                               ..
years_until_contract_expires    0
body_type_encoded               0
work_rate_encoded               0
preferred_foot_encoded          0
real_face_encoded               0
Length: 71, dtype: int64

In [235]:
all_data.head()

Unnamed: 0,id,overall,potential,value_eur,wage_eur,birthday_date,height_cm,weight_kg,league_level,club_jersey_number,club_joined,club_contract_valid_until,preferred_foot,weak_foot,skill_moves,international_reputation,work_rate,body_type,real_face,release_clause_eur,player_traits,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,position,test_train,player_traits_encoded,yearinclub,age,years_until_contract_expires,body_type_encoded,work_rate_encoded,preferred_foot_encoded,real_face_encoded
0,216302,71,71,1400000.0,10000,1989-12-28,176,73,1,29,2021-07-12,2026,Right,5,3,1,Medium/High,Normal,No,2400000.0,Early Crosser,70.0,52.0,60.0,70.0,67.0,66.0,64,40,56,65,41,68,64,58,58,68,71,70,87,61,86,67,77,86,58,66,56,79,53,48,58,66,65,66,65,14,11,12,12,12,LB,1,0.021303,2.0,33,3,1,6,1,0
1,237867,65,71,1000000.0,2000,1996-10-23,183,73,1,4,2021-07-23,2022,Right,3,2,1,Medium/Medium,Lean,No,2000000.0,,65.0,38.0,58.0,60.0,63.0,67.0,50,31,59,64,31,57,36,41,64,65,64,65,57,62,67,49,62,71,66,42,66,66,48,62,46,59,65,61,58,10,13,7,6,11,LDM,1,0.021982,2.0,27,-1,0,8,1,0
2,253472,65,77,1600000.0,2000,2001-09-27,178,69,1,19,2021-07-24,2026,Left,3,3,1,High/Medium,Normal,No,2300000.0,Speed Dribbler (AI),79.0,35.0,58.0,66.0,59.0,64.0,59,30,53,60,37,65,56,35,55,64,82,77,71,62,75,50,65,67,61,28,69,62,58,60,31,65,60,58,59,10,10,8,10,11,LWB,1,0.064536,2.0,22,3,1,2,0,0
3,223994,72,72,2300000.0,5000,1992-02-26,188,81,1,9,2018-02-08,2023,Right,3,3,1,High/Medium,Lean,No,2900000.0,Finesse Shot,64.0,74.0,51.0,68.0,24.0,76.0,32,78,75,59,70,65,48,53,45,74,61,66,64,67,53,71,71,75,87,69,52,18,76,60,75,75,16,22,19,16,15,13,8,9,LS,1,0.013993,5.0,31,0,0,2,1,0
4,251635,65,65,525000.0,3000,1990-06-12,179,74,1,30,2019-07-01,2022,Left,2,2,1,Medium/High,Normal,No,709000.0,Solid Player,74.0,53.0,59.0,53.0,61.0,69.0,64,38,49,63,46,43,53,58,55,56,71,76,70,60,71,72,80,78,70,67,53,63,60,52,52,58,64,61,58,12,5,11,12,15,LB,1,0.020677,4.0,33,-1,1,6,0,0


In [236]:
#work_rate
print(all_data['international_reputation'].unique())

[1 2 3 4 5]


In [237]:
# Identify numerical columns (excluding the target column)
nocolumns = ['position']
numerical_columns = all_data.select_dtypes(include=['number']).columns.tolist()
numerical_columns = [col for col in numerical_columns if col != nocolumns]

# Normalize numerical columns using Min-Max scaling
scaler = MinMaxScaler()
all_data[numerical_columns] = scaler.fit_transform(all_data[numerical_columns])

# Print the result
print(all_data.head())



         id   overall  potential  value_eur  wage_eur birthday_date  \
0  0.817807  0.511111   0.428571   0.007089  0.027182    1989-12-28   
1  0.899357  0.377778   0.428571   0.005026  0.004292    1996-10-23   
2  0.958369  0.377778   0.571429   0.008120  0.004292    2001-09-27   
3  0.846895  0.533333   0.452381   0.011728  0.012876    1992-02-26   
4  0.951422  0.377778   0.285714   0.002578  0.007153    1990-06-12   

   height_cm  weight_kg  league_level  club_jersey_number club_joined  \
0   0.425532       0.40           0.0            0.285714  2021-07-12   
1   0.574468       0.40           0.0            0.030612  2021-07-23   
2   0.468085       0.32           0.0            0.183673  2021-07-24   
3   0.680851       0.56           0.0            0.081633  2018-02-08   
4   0.489362       0.42           0.0            0.295918  2019-07-01   

   club_contract_valid_until preferred_foot  weak_foot  skill_moves  \
0                        0.5          Right       1.00         

In [238]:
#we start by normalizing the numerical data
def normalize_data(data):
    min_value = min(data)
    max_value = max(data)
    normalized_data = []

    for value in data:
        normalized_value = (value - min_value) / (max_value - min_value)
        normalized_data.append(normalized_value)

    return normalized_data

all_data['overall'] = normalize_data(all_data['overall'])
all_data['potential'] = normalize_data(all_data['potential'])
all_data['value_eur'] = normalize_data(all_data['value_eur'])
all_data['wage_eur'] = normalize_data(all_data['wage_eur'])
all_data['height_cm'] = normalize_data(all_data['height_cm'])
all_data['weight_kg'] = normalize_data(all_data['weight_kg'])
all_data['league_level'] = normalize_data(all_data['league_level'])
all_data['club_jersey_number'] = normalize_data(all_data['club_jersey_number'])
all_data['weak_foot'] = normalize_data(all_data['weak_foot'])
all_data['skill_moves'] = normalize_data(all_data['skill_moves'])
all_data['international_reputation'] = normalize_data(all_data['international_reputation'])
all_data['release_clause_eur'] = normalize_data(all_data['release_clause_eur'])
all_data['pace'] = normalize_data(all_data['pace'])
all_data['shooting'] = normalize_data(all_data['shooting'])
all_data['passing'] = normalize_data(all_data['passing'])
all_data['dribbling'] = normalize_data(all_data['dribbling'])
all_data['defending'] = normalize_data(all_data['defending'])
all_data['physic'] = normalize_data(all_data['physic'])
display(all_data)

Unnamed: 0,id,overall,potential,value_eur,wage_eur,birthday_date,height_cm,weight_kg,league_level,club_jersey_number,club_joined,club_contract_valid_until,preferred_foot,weak_foot,skill_moves,international_reputation,work_rate,body_type,real_face,release_clause_eur,player_traits,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,position,test_train,player_traits_encoded,yearinclub,age,years_until_contract_expires,body_type_encoded,work_rate_encoded,preferred_foot_encoded,real_face_encoded
0,0.817807,0.511111,0.428571,0.007089,0.027182,1989-12-28,0.425532,0.40,0.00,0.285714,2021-07-12,0.5,Right,1.00,0.50,0.0,Medium/High,Normal,No,0.006322,Early Crosser,0.608696,0.447368,0.514706,0.621212,0.680000,0.593220,0.655172,0.408602,0.579545,0.666667,0.430233,0.695652,0.655172,0.590909,0.583333,0.674419,0.682927,0.670732,0.883117,0.467742,0.881579,0.635135,0.735294,0.867470,0.471429,0.688889,0.535714,0.858824,0.542553,0.447059,0.588235,0.642857,0.693182,0.693182,0.686047,0.134831,0.100000,0.109890,0.111111,0.113636,LB,1.0,0.327922,0.000000,0.538462,0.5,0.333333,0.750,1.0,0.0
1,0.899357,0.377778,0.428571,0.005026,0.004292,1996-10-23,0.574468,0.40,0.00,0.030612,2021-07-23,0.1,Right,0.50,0.25,0.0,Medium/Medium,Lean,No,0.005251,,0.536232,0.263158,0.485294,0.469697,0.626667,0.610169,0.494253,0.311828,0.613636,0.655172,0.313953,0.576087,0.333333,0.397727,0.654762,0.639535,0.597561,0.609756,0.493506,0.483871,0.631579,0.391892,0.514706,0.686747,0.585714,0.422222,0.654762,0.705882,0.489362,0.611765,0.447059,0.559524,0.693182,0.636364,0.604651,0.089888,0.122222,0.054945,0.044444,0.102273,LDM,1.0,0.338474,0.000000,0.307692,0.1,0.000000,1.000,1.0,0.0
2,0.958369,0.377778,0.571429,0.008120,0.004292,2001-09-27,0.468085,0.32,0.00,0.183673,2021-07-24,0.5,Left,0.50,0.50,0.0,High/Medium,Normal,No,0.006054,Speed Dribbler (AI),0.739130,0.223684,0.485294,0.560606,0.573333,0.559322,0.597701,0.301075,0.545455,0.609195,0.383721,0.663043,0.563218,0.329545,0.547619,0.627907,0.817073,0.756098,0.675325,0.483871,0.736842,0.405405,0.558824,0.638554,0.514286,0.266667,0.690476,0.658824,0.595745,0.588235,0.270588,0.630952,0.636364,0.602273,0.616279,0.089888,0.088889,0.065934,0.088889,0.102273,LWB,1.0,1.000000,0.000000,0.115385,0.5,0.333333,0.250,0.0,0.0
3,0.846895,0.533333,0.452381,0.011728,0.012876,1992-02-26,0.680851,0.56,0.00,0.081633,2018-02-08,0.2,Right,0.50,0.50,0.0,High/Medium,Lean,No,0.007661,Finesse Shot,0.521739,0.736842,0.382353,0.590909,0.106667,0.762712,0.287356,0.817204,0.795455,0.597701,0.767442,0.663043,0.471264,0.534091,0.428571,0.744186,0.560976,0.621951,0.584416,0.564516,0.447368,0.689189,0.647059,0.734940,0.885714,0.722222,0.488095,0.141176,0.787234,0.588235,0.788235,0.750000,0.136364,0.193182,0.151163,0.157303,0.144444,0.120879,0.066667,0.079545,LS,1.0,0.214286,0.157895,0.461538,0.2,0.000000,0.250,1.0,0.0
4,0.951422,0.377778,0.285714,0.002578,0.007153,1990-06-12,0.489362,0.42,0.00,0.295918,2019-07-01,0.1,Left,0.25,0.25,0.0,Medium/High,Normal,No,0.001794,Solid Player,0.666667,0.460526,0.500000,0.363636,0.600000,0.644068,0.655172,0.387097,0.500000,0.643678,0.488372,0.423913,0.528736,0.590909,0.547619,0.534884,0.682927,0.743902,0.662338,0.451613,0.684211,0.702703,0.779412,0.771084,0.642857,0.700000,0.500000,0.670588,0.617021,0.494118,0.517647,0.547619,0.681818,0.636364,0.604651,0.112360,0.033333,0.098901,0.111111,0.147727,LB,1.0,0.318182,0.105263,0.538462,0.1,0.333333,0.750,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,0.777341,0.533333,0.452381,0.011728,0.047210,1992-01-10,0.191489,0.24,0.00,0.020408,2021-07-17,0.4,Left,0.50,0.50,0.0,Medium/Low,Lean,Yes,0.010071,Speed Dribbler (AI),0.782609,0.631579,0.632353,0.696970,0.240000,0.152542,0.701149,0.655914,0.477273,0.724138,0.627907,0.782609,0.724138,0.590909,0.619048,0.720930,0.865854,0.768293,0.870130,0.532258,0.894737,0.702703,0.558824,0.433735,0.085714,0.688889,0.380952,0.341176,0.755319,0.717647,0.729412,0.619048,0.295455,0.318182,0.302326,0.123596,0.111111,0.109890,0.088889,0.079545,,0.0,1.000000,0.000000,0.461538,0.4,0.000000,0.875,0.0,1.0
763,0.845980,0.444444,0.500000,0.008635,0.004292,1996-04-17,0.510638,0.34,0.00,0.010204,2019-01-01,0.1,Right,0.25,0.25,0.0,Low/High,Stocky,No,0.006857,,0.579710,0.342105,0.367647,0.303030,0.653333,0.847458,0.517241,0.397849,0.750000,0.597701,0.488372,0.326087,0.321839,0.261364,0.523810,0.581395,0.621951,0.658537,0.597403,0.500000,0.500000,0.432432,0.941176,0.771084,0.857143,0.455556,0.702381,0.705882,0.489362,0.364706,0.341176,0.535714,0.681818,0.681818,0.662791,0.067416,0.088889,0.065934,0.144444,0.079545,,0.0,0.188312,0.105263,0.307692,0.1,0.666667,0.375,1.0,0.0
764,0.727609,0.377778,0.285714,0.003480,0.015737,1990-10-18,0.617021,0.54,0.75,0.081633,2021-07-01,0.2,Right,0.75,0.50,0.0,Low/Low,Stocky,No,0.003109,Power Header,0.536232,0.605263,0.397059,0.484848,0.160000,0.898305,0.379310,0.677419,0.681818,0.586207,0.662791,0.597826,0.586207,0.386364,0.523810,0.604651,0.621951,0.609756,0.584416,0.500000,0.618421,0.648649,0.823529,0.650602,0.928571,0.622222,0.904762,0.200000,0.670213,0.541176,0.564706,0.583333,0.204545,0.238636,0.220930,0.101124,0.100000,0.087912,0.055556,0.090909,,0.0,0.626623,0.000000,0.538462,0.2,0.666667,0.500,1.0,0.0
765,0.727292,0.733333,0.666667,0.064312,0.050072,1988-02-12,0.574468,0.46,0.00,0.295918,2020-09-29,0.2,Right,0.50,0.25,0.5,Medium/High,Normal,Yes,0.066837,"Solid Player, Dives Into Tackles (AI), Leaders...",0.507246,0.513158,0.544118,0.454545,0.893333,0.847458,0.517241,0.559140,0.840909,0.758621,0.616279,0.510870,0.494253,0.375000,0.726190,0.651163,0.548780,0.609756,0.506494,0.758065,0.513158,0.662162,0.897059,0.734940,0.785714,0.577778,0.940476,0.917647,0.489362,0.505882,0.435294,0.797619,0.909091,0.886364,0.860465,0.112360,0.033333,0.065934,0.100000,0.113636,,0.0,0.003247,0.052632,0.615385,0.2,0.333333,0.750,1.0,1.0


In [239]:
#we deconcatenate again the test data and train data from the all_data
train_data = all_data.loc[all_data.test_train == 1]
test_data = all_data.loc[all_data.test_train == 0]

In [240]:
#We check that it deconcatenated properly and we are working with the correct dataframes
train_data.shape,test_data.shape

((6895, 71), (767, 71))

In [241]:
all_columns = all_data.columns.tolist()
print("All column names:", all_columns)

All column names: ['id', 'overall', 'potential', 'value_eur', 'wage_eur', 'birthday_date', 'height_cm', 'weight_kg', 'league_level', 'club_jersey_number', 'club_joined', 'club_contract_valid_until', 'preferred_foot', 'weak_foot', 'skill_moves', 'international_reputation', 'work_rate', 'body_type', 'real_face', 'release_clause_eur', 'player_traits', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance', 'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots', 'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties', 'mentality_composure', 'defending_mar

In [242]:
from sklearn.model_selection import train_test_split
columns_for_prediction = ['overall', 'potential', 'value_eur', 'wage_eur', 'height_cm', 'weight_kg', 'league_level', 'club_jersey_number', 'weak_foot', 'skill_moves', 'international_reputation', 'release_clause_eur', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance', 'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots', 'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties', 'mentality_composure', 'defending_marking_awareness', 'defending_standing_tackle', 'defending_sliding_tackle', 'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking', 'goalkeeping_positioning', 'goalkeeping_reflexes', 'player_traits_encoded', 'yearinclub', 'age', 'years_until_contract_expires', 'body_type_encoded', 'work_rate_encoded', 'preferred_foot_encoded', 'real_face_encoded']
X = train_data[columns_for_prediction] # Features
y = train_data['position']  # Target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [243]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Train a RandomForestClassifier to get feature importances
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Get feature importances
feature_importances = clf.feature_importances_

# Create a DataFrame with feature names and their importances
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print the sorted feature importances
print("Feature importances:")
print(feature_importance_df)

# Select the top k features based on importance
k = 10  # You can choose your desired number of top features
selected_features = feature_importance_df.head(k)['Feature'].tolist()

# Print the selected features
print(f"\nTop {k} selected features:")
print(selected_features)

Feature importances:
                         Feature  Importance
18            attacking_crossing    0.035169
20    attacking_heading_accuracy    0.031265
58        preferred_foot_encoded    0.030226
40         mentality_positioning    0.027728
19           attacking_finishing    0.025853
46      defending_sliding_tackle    0.024372
51          goalkeeping_reflexes    0.023202
39       mentality_interceptions    0.022867
16                     defending    0.022842
36                power_strength    0.022430
48          goalkeeping_handling    0.022097
45     defending_standing_tackle    0.022012
23               skill_dribbling    0.021682
44   defending_marking_awareness    0.021282
41              mentality_vision    0.020020
47            goalkeeping_diving    0.019549
12                          pace    0.019118
22             attacking_volleys    0.019038
29         movement_sprint_speed    0.018937
37              power_long_shots    0.018762
26            skill_long_passing  

In [252]:
best_columns = ['attacking_crossing', 'attacking_heading_accuracy', 'preferred_foot_encoded', 'mentality_positioning', 'attacking_finishing', 'defending_sliding_tackle', 'goalkeeping_reflexes', 'mentality_interceptions', 'defending', 'power_strength']
target = ['position']
X = train_data[best_columns] # Features
y = train_data[target]  # Target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [253]:
#Random Forest and Gradient Boosting,
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, LeaveOneOut



# Step 3: Create the StratifiedKFold cross-validator
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Step 4: Create a Logistic Regression model
logistic = LogisticRegression()
randomforest = RandomForestClassifier()

import time

# Function to evaluate model and measure time
def evaluate_model_with_time(model, X, y, cv):
    start_time = time.time()
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)
    end_time = time.time()
    elapsed_time = end_time - start_time
    return scores.mean(), elapsed_time

# Initialize the models
logistic_regression = LogisticRegression(max_iter=1000)
random_forest = RandomForestClassifier()

# Create a dictionary to store the results and time spent
results_linear = {}
results_rf = {}
times_linear = {}
times_rf = {}

# Method 1: No cross-validation
results_linear['No CV'], times_linear['No CV'] = evaluate_model_with_time(logistic_regression, X, y, cv=None)
results_rf['No CV'], times_rf['No CV'] = evaluate_model_with_time(random_forest, X, y, cv=None)





  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


In [254]:
# Method 4: Leave-One-Out cross-validation (LOOCV)
loo = LeaveOneOut()
results_linear['LOOCV'], times_linear['LOOCV'] = evaluate_model_with_time(logistic_regression, X, y, cv=loo)
results_rf['LOOCV'], times_rf['LOOCV'] = evaluate_model_with_time(random_forest, X, y, cv=loo)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


KeyboardInterrupt: 

In [255]:
# Method 2: k-fold cross-validation (e.g., k=5)
kf = KFold(n_splits=5)
results_linear['K-fold'], times_linear['K-fold'] = evaluate_model_with_time(logistic_regression, X, y, cv=kf)
results_rf['K-fold'], times_rf['K-fold'] = evaluate_model_with_time(random_forest, X, y, cv=kf)

# Method 3: Stratified k-fold cross-validation (e.g., k=5)
skf = StratifiedKFold(n_splits=5)
results_linear['Stratified'], times_linear['Stratified'] = evaluate_model_with_time(logistic_regression, X, y, cv=skf)
results_rf['Stratified'], times_rf['Stratified'] = evaluate_model_with_time(random_forest, X, y, cv=skf)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


In [256]:
# Convert the results and times to DataFrames for easier comparison
results_ln = pd.DataFrame(results_linear, index=['Linear Regression'])
results_rf = pd.DataFrame(results_rf, index=['Random Forest'])
times_ln = pd.DataFrame(times_linear, index=['Linear Regression'])
times_rf = pd.DataFrame(times_rf, index=['Random Forest'])

# Display the accuracy and time results
print("Accuracy Results:")
display(pd.concat([results_ln, results_rf]))
print("\nTime Spent:")
display(pd.concat([times_ln, times_rf]))

Accuracy Results:


Unnamed: 0,No CV,K-fold,Stratified
Linear Regression,0.440174,0.439739,0.440174
Random Forest,0.417114,0.415373,0.42335



Time Spent:


Unnamed: 0,No CV,K-fold,Stratified
Linear Regression,11.597974,11.508308,11.282876
Random Forest,28.12045,27.062092,26.364193


In [257]:
kaggle_predictions = random_forest.predict(test_data[target])

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
# Produce .csv for kaggle testing
test_predictions_submit = pd.DataFrame({"id": test_df["id"], "position": test_predictions})
test_predictions_submit.to_csv("test_predictions_submit.csv", index = False)

NameError: name 'test_df' is not defined