In [1]:
# Import dependencies
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import pickle

In [2]:
# Read the raw FIFA 22 player table from the project's Resources folder
path = "Resources/players_fifa22.csv"
df = pd.read_csv(filepath_or_buffer=path)

In [3]:
# df.columns

In [4]:
# Keep only rows whose best position is something other than goalkeeper
df = df.loc[df["BestPosition"].ne("GK")]

In [5]:
# Map the textual levels onto an ordinal scale (Low=0, Medium=1, High=2).
# The mapping is applied to every column of the frame.
df = df.replace({"High": 2, "Medium": 1, "Low": 0})

In [6]:
# Columns excluded from modelling, assembled group by group (original order kept)
columns_to_drop = (
    ["ID", "Name", "FullName", "PhotoUrl", "Nationality"]
    + ["Potential", "Growth", "TotalStats", "BaseStats"]
    + ["Positions", "BestPosition"]
    # "OnLoad" is kept exactly as spelled; drop() needs an exact header match
    + ["Club", "ClubPosition", "ContractUntil", "ClubNumber", "ClubJoined", "OnLoad"]
    + ["NationalTeam", "NationalPosition", "NationalNumber"]
    + ["IntReputation", "PreferredFoot"]
    # Goalkeeper skill columns
    + [f"GK{skill}" for skill in ("Diving", "Handling", "Kicking", "Positioning", "Reflexes")]
    # Per-position rating columns
    + [f"{position}Rating" for position in (
        "ST", "LW", "LF", "CF", "RF", "RW",
        "CAM", "LM", "CM", "RM", "LWB",
        "CDM", "RWB", "LB", "CB", "RB",
        "GK",
    )]
)

# Remove all of them in a single call
df = df.drop(columns=columns_to_drop)

In [7]:
# df.columns

In [8]:
# # Create Correlation Matrix
# name = df.corr()
# plt.subplots(figsize=(100,100))
# sn.heatmap(name, annot=True)  
# plt.savefig("CorrelationHeatMap.png")
# plt.show()

In [9]:
# Drop highly correlated columns.
# Each label appears once (the original list named "StandingTackle" twice).
drop_columns = ["Finishing", "Dribbling", "Acceleration", "SprintSpeed",
                "StandingTackle", "Marking", "SlidingTackle", "Reactions"]

# Build the modelling frame: remove those columns, then discard every row
# that still contains a missing value.
new_df = df.drop(columns=drop_columns).dropna()
# Save new json
# new_df.to_json('Resources/FIFA_df_final.json')

In [10]:
# One-hot encode any remaining categorical columns via pandas get_dummies
new_df2 = pd.get_dummies(data=new_df)

In [11]:
# The four columns to predict; everything else becomes the feature matrix
target_columns = ["Overall", "ValueEUR", "WageEUR", "ReleaseClause"]

# y -> Overall, y1 -> ValueEUR, y2 -> WageEUR, y3 -> ReleaseClause
y, y1, y2, y3 = (new_df2[column] for column in target_columns)
X = new_df2.drop(columns=target_columns)
X.head()

Unnamed: 0,Age,Height,Weight,WeakFoot,SkillMoves,AttackingWorkRate,DefensiveWorkRate,PaceTotal,ShootingTotal,PassingTotal,...,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure
0,34,170,72,4,4,1,0,85,92,91,...,68,72,69,94,44,40,93,95,75,96
1,32,185,81,4,4,2,1,78,92,79,...,85,76,86,87,81,49,95,81,90,88
2,36,187,83,4,5,2,0,87,94,80,...,95,77,77,93,63,29,95,76,88,95
3,22,182,73,4,5,2,0,97,88,80,...,78,88,77,82,62,38,92,82,79,88
5,30,181,70,5,4,2,2,76,86,93,...,63,89,74,91,76,66,88,94,83,89


In [26]:
# Predicting Overall rating
# Hold out 10% of the rows for testing. The fixed seed makes the split, and
# therefore the reported score, repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# Fit the scaler on the training rows only, then apply the same transform
# to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Fit a random forest regressor (seeded so the trees are deterministic)
# and report its score on the held-out rows.
clr0 = RandomForestRegressor(max_depth=5, random_state=42)
clr0.fit(X_train_scaled, y_train)
clr0.score(X_test_scaled, y_test)

0.8917676501619792

In [13]:
# Pair each importance in clr0 with its column name and list them, largest first
sorted(
    zip(clr0.feature_importances_, X_train.columns),
    reverse=True,
)

[(0.43943733240261634, 'Composure'),
 (0.2269442315586971, 'BallControl'),
 (0.16864431666952012, 'DefendingTotal'),
 (0.029122298652828064, 'Positioning'),
 (0.027734741337987156, 'DribblingTotal'),
 (0.01938895838272466, 'Crossing'),
 (0.0157034200073617, 'ShootingTotal'),
 (0.009946882667326342, 'Stamina'),
 (0.00918244814934244, 'PhysicalityTotal'),
 (0.007996915057832074, 'PaceTotal'),
 (0.007017841871648413, 'Interceptions'),
 (0.006781795101841874, 'ShortPassing'),
 (0.004648967503498373, 'HeadingAccuracy'),
 (0.004119555565734683, 'PassingTotal'),
 (0.0025099743908691408, 'Age'),
 (0.0023277876209870105, 'Aggression'),
 (0.0018368279551623796, 'Vision'),
 (0.0017212933052086058, 'ShotPower'),
 (0.001589161647799064, 'LongShots'),
 (0.001474456115459279, 'LongPassing'),
 (0.0014478237340171222, 'Strength'),
 (0.0013456540614200042, 'Agility'),
 (0.0012964968832046037, 'Jumping'),
 (0.0010710131636754662, 'Balance'),
 (0.001056915868976611, 'Volleys'),
 (0.001048707145386165, 'Pe

In [14]:
# Correlation heatmap of every feature column together with the target y
fig, ax = plt.subplots(figsize=(15, 15))
features_and_target = pd.concat([X, y], axis=1)
sn.heatmap(features_and_target.corr(), ax=ax)

In [18]:
# Predicting Values in Euros
# Hold out 10% of the rows for testing. The fixed seed makes the split, and
# therefore the reported score, repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.1, random_state=42)
# Fit the scaler on the training rows only, then apply the same transform
# to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Fit a random forest regressor (seeded so the trees are deterministic)
# and report its score on the held-out rows.
clr1 = RandomForestRegressor(max_depth=8, random_state=42)
clr1.fit(X_train_scaled, y_train)
clr1.score(X_test_scaled, y_test)

0.8604566703149253

In [16]:
# Pair each importance in clr1 with its column name and list them, largest first
sorted(
    zip(clr1.feature_importances_, X_train.columns),
    reverse=True,
)

[(0.46595393126508966, 'BallControl'),
 (0.12741643773891814, 'DefendingTotal'),
 (0.0562599677926727, 'Positioning'),
 (0.049912475282610946, 'Age'),
 (0.04347106123146096, 'DribblingTotal'),
 (0.03543798023547755, 'Stamina'),
 (0.031329158301559835, 'ShootingTotal'),
 (0.026935299100582878, 'ShortPassing'),
 (0.023997399914976625, 'PaceTotal'),
 (0.014968164814815053, 'Interceptions'),
 (0.013173013310417127, 'HeadingAccuracy'),
 (0.010298378762413312, 'PhysicalityTotal'),
 (0.009668945149749386, 'Composure'),
 (0.009157865284589178, 'Vision'),
 (0.007978585108828692, 'Crossing'),
 (0.007161010797652807, 'Volleys'),
 (0.006623853728043848, 'LongPassing'),
 (0.005067415534489828, 'Penalties'),
 (0.0050347080753539875, 'Aggression'),
 (0.00488332337414837, 'Strength'),
 (0.0048505892251492965, 'FKAccuracy'),
 (0.004769365887473061, 'PassingTotal'),
 (0.0046112710299682155, 'ShotPower'),
 (0.004531330913357827, 'LongShots'),
 (0.004338752501691801, 'Weight'),
 (0.004063655668428947, 'Ju

In [22]:
# Predicting Wages in Euros
# Hold out 10% of the rows for testing. The fixed seed makes the split, and
# therefore the reported score, repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y2, test_size=0.1, random_state=42)
# Fit the scaler on the training rows only, then apply the same transform
# to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Fit a random forest regressor (seeded so the trees are deterministic)
# and report its score on the held-out rows.
clr2 = RandomForestRegressor(max_depth=5, random_state=42)
clr2.fit(X_train_scaled, y_train)
clr2.score(X_test_scaled, y_test)

0.6997200979802116

In [18]:
# Pair each importance in clr2 with its column name and list them, largest first
sorted(
    zip(clr2.feature_importances_, X_train.columns),
    reverse=True,
)

[(0.42563291022687477, 'BallControl'),
 (0.1599899154499234, 'DefendingTotal'),
 (0.03878574169779269, 'Positioning'),
 (0.037285055884760374, 'Composure'),
 (0.032979217268954454, 'ShortPassing'),
 (0.03270492023502084, 'DribblingTotal'),
 (0.02132394219128646, 'ShotPower'),
 (0.020057920981013257, 'Interceptions'),
 (0.017032985615452204, 'ShootingTotal'),
 (0.01572696021921087, 'PaceTotal'),
 (0.01444732855357998, 'HeadingAccuracy'),
 (0.014311662424204225, 'Jumping'),
 (0.014134962760443726, 'Crossing'),
 (0.013124524695591331, 'Age'),
 (0.01254988921448507, 'Stamina'),
 (0.011678523091323976, 'Aggression'),
 (0.009979224381912613, 'FKAccuracy'),
 (0.009149085661491608, 'LongPassing'),
 (0.008894502935912305, 'Penalties'),
 (0.00862014823202936, 'PhysicalityTotal'),
 (0.008198458057442235, 'Weight'),
 (0.00819188221783418, 'Height'),
 (0.00811099581149969, 'LongShots'),
 (0.007922876523693613, 'Agility'),
 (0.00767319755756804, 'Volleys'),
 (0.0072357043662660036, 'PassingTotal'),


In [29]:
# Predicting Release Clause in Euros
# Hold out 10% of the rows for testing. The fixed seed makes the split, and
# therefore the reported score, repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y3, test_size=0.1, random_state=42)
# Fit the scaler on the training rows only, then apply the same transform
# to both the training and the test rows. (The original also created an
# unfitted StandardScaler() that was overwritten on the next line; removed.)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Fit a random forest regressor (seeded so the trees are deterministic)
# and report its score on the held-out rows.
clr3 = RandomForestRegressor(max_depth=10, random_state=42)
clr3.fit(X_train_scaled, y_train)
clr3.score(X_test_scaled, y_test)

0.8135263628482414

In [20]:
# Pair each importance in clr3 with its column name and list them, largest first
sorted(
    zip(clr3.feature_importances_, X_train.columns),
    reverse=True,
)

[(0.43462537343388874, 'BallControl'),
 (0.12730224071114, 'DefendingTotal'),
 (0.06108750883186848, 'DribblingTotal'),
 (0.05148270048478309, 'Age'),
 (0.03943650988889615, 'Positioning'),
 (0.034295148569400254, 'PaceTotal'),
 (0.0314092885802561, 'Stamina'),
 (0.026162231712030866, 'Interceptions'),
 (0.024707954613635962, 'ShortPassing'),
 (0.022826444261003654, 'ShootingTotal'),
 (0.013471374672352926, 'Composure'),
 (0.011178199147955896, 'Vision'),
 (0.010502212048453856, 'HeadingAccuracy'),
 (0.010054841144596, 'Crossing'),
 (0.008922234602098933, 'PhysicalityTotal'),
 (0.008506175626769958, 'Volleys'),
 (0.007703978025285065, 'PassingTotal'),
 (0.007689298223817646, 'Aggression'),
 (0.007444875747171801, 'LongPassing'),
 (0.006644278392341153, 'ShotPower'),
 (0.006409533497839361, 'LongShots'),
 (0.006046209772351085, 'Strength'),
 (0.00594601867493355, 'Penalties'),
 (0.0054154696279435175, 'Jumping'),
 (0.004945292333663546, 'Height'),
 (0.004833801296548928, 'FKAccuracy'),


In [27]:
# Save Models and scaler using pickle
# (Currently disabled: every line below is commented out.)
# NOTE(review): `scaler` is rebound by each training cell, so the object
# pickled here would be whichever scaler was fit most recently, not
# necessarily the one each model was trained with. Confirm before re-enabling.
# NOTE(review): the open(...) handles below are never explicitly closed;
# consider `with open(...) as f:` if this cell is re-enabled.
# pickle.dump(scaler, open("models/scaler.sav", "wb"))
# pickle.dump(clr0, open("models/clr0.sav", "wb"))
# pickle.dump(clr1, open("models/clr1.sav", "wb"))
# pickle.dump(clr2, open("models/clr2.sav", "wb"))
# pickle.dump(clr3, open("models/clr3.sav", "wb"))