<a href="https://colab.research.google.com/github/arjunpathy/fifa_wage_predictor/blob/main/1982435.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Global Variables

In [None]:
# Raw players CSV on the mounted Google Drive. Written as an absolute path so it
# lines up with the '/content/gdrive' mount point used by drive.mount() no matter
# what the current working directory is.
GDRIVE_FILE_URL =  '/content/gdrive/My Drive/players_22.csv'
# Cleaned CSV written by dataCleanupAndFromat() and re-read by the later cells.
FILE_NAME = "CleanedDataSet.csv"
# Position group to model; the notebook has feature lists for these two values.
FILTER_PLAYER_POSITION = "DEFENDER" # "DEFENDER" "ATTACKER"

# Requirements


In [None]:
!pip install pyspark


import numpy as np
import pandas as pd
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.util import MLUtils

from matplotlib import pyplot as plt

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 33 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 44.3 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=e0c839838ccbf0c2c9a5c133311c18980a317216563569a9adf77d65c9eeab4a
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [None]:
# Build (or reuse) a local Spark session. getOrCreate() keeps this cell safe to
# re-run: constructing SparkContext('local') a second time in the same kernel
# fails because only one SparkContext may be active at once.
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext

# Finding the k Best Input Features with scikit-learn's SelectKBest

In [None]:
def kBestInputFeatures(k, position, data):
    """Print the k candidate features that score highest against ``wage_eur``.

    Rows of ``data`` whose ``position`` column equals ``position`` are converted
    to pandas and scored with scikit-learn's ``SelectKBest`` using the
    ``f_regression`` score function.

    Args:
        k: Number of features to keep.
        position: Value of the ``position`` column to analyse
            (e.g. "DEFENDER" or "ATTACKER").
        data: Spark DataFrame containing the candidate columns listed below
            plus ``position`` and ``wage_eur``.

    Returns:
        None. The shape of the reduced feature matrix and the names of the
        selected features are printed.
    """
    from sklearn.feature_selection import SelectKBest, f_regression

    # Filter on the caller-supplied position. The previous version filtered on
    # the notebook-level FILTER_PLAYER_POSITION, so the argument only affected
    # the printed label, not the rows being scored.
    frame = data.filter(data.position == position).toPandas()

    # NOTE(review): "defending_standing_tackle" is kept by dataCleanupAndFromat
    # but is not scored here -- confirm whether that omission is intended.
    cols =["age","overall" ,"potential","pace","shooting","passing","dribbling","defending","physic",
           "attacking_crossing","attacking_finishing","attacking_heading_accuracy","attacking_short_passing",
           "attacking_volleys","skill_dribbling","skill_curve","skill_fk_accuracy","skill_long_passing",
           "skill_ball_control","movement_acceleration","movement_sprint_speed","movement_agility",
           "movement_reactions","movement_balance","power_shot_power","power_jumping","power_stamina",
           "power_strength","power_long_shots","mentality_aggression","mentality_interceptions",
           "mentality_positioning","mentality_vision","mentality_penalties","mentality_composure",
           "defending_marking_awareness","defending_sliding_tackle",
           "goalkeeping_diving","goalkeeping_handling","goalkeeping_kicking","goalkeeping_positioning",
           "goalkeeping_reflexes"]

    X1 = frame[cols]
    # 1-D target: f_regression expects a vector, not a single-column frame.
    y1 = frame['wage_eur']

    select = SelectKBest(score_func=f_regression, k = k )
    z = select.fit_transform(X1, y1)

    print("After selecting best "+ str(k) +" features:", z.shape)

    # Boolean mask over the candidate columns (named so it does not shadow the
    # builtin ``filter``).
    mask = select.get_support()
    features = np.array(X1.columns)

    print("\nSelected best " ,k,'features for ',position)
    print(features[mask])

# Data Cleanup

In [None]:
def iqr_capping(df,factor,spark):
    """Cap ``wage_eur`` at the IQR whiskers and return the result as a Spark DataFrame.

    Values above ``Q3 + factor * IQR`` are replaced by that upper whisker and
    values below ``Q1 - factor * IQR`` by the lower whisker. A box plot of the
    capped column is drawn as a side effect.

    Args:
        df: pandas DataFrame with a numeric ``wage_eur`` column. It is not
            modified; the capping is applied to a copy.
        factor: Whisker length as a multiple of the inter-quartile range.
        spark: Object exposing ``createDataFrame`` (the active SparkSession).

    Returns:
        ``spark.createDataFrame`` applied to the capped copy of ``df``.
    """
    q1 = df['wage_eur'].quantile(0.25)
    q3 = df['wage_eur'].quantile(0.75)
    iqr = q3 - q1
    upper_whisker = q3 + (factor * iqr)
    lower_whisker = q1 - (factor * iqr)

    # Work on a copy so the caller's frame keeps its original wages.
    capped = df.copy()
    # Cast to float first: the whiskers are floats, and this keeps the column
    # dtype the same whether or not any value actually gets clipped.
    capped['wage_eur'] = capped['wage_eur'].astype('float64').clip(
        lower=lower_whisker, upper=upper_whisker)
    capped.boxplot(['wage_eur'])
    return spark.createDataFrame(capped)

def dataCleanupAndFromat(filename, output_path=None):
    """Reduce the raw players CSV to the modelling columns and write a cleaned CSV.

    The raw file is read with pandas, a coarse ``position`` label is derived
    from the first entry of ``player_positions``, only the columns listed below
    are kept, rows containing NaN or +/-inf are dropped, and the result is
    written as CSV.

    Args:
        filename: Path of the raw players CSV.
        output_path: Destination of the cleaned CSV. Defaults to the
            notebook-level ``FILE_NAME`` when omitted.

    Returns:
        None. The destination path is printed once the file is written.
    """
    cols =['sofifa_id','short_name','club_name','nationality_name','age',"overall" ,"potential","pace","shooting","passing","dribbling","defending","physic",
    "attacking_crossing","attacking_finishing","attacking_heading_accuracy","attacking_short_passing",
    "attacking_volleys","skill_dribbling","skill_curve","skill_fk_accuracy","skill_long_passing",
    "skill_ball_control","movement_acceleration","movement_sprint_speed","movement_agility",
    "movement_reactions","movement_balance","power_shot_power","power_jumping","power_stamina",
    "power_strength","power_long_shots","mentality_aggression","mentality_interceptions",
    "mentality_positioning","mentality_vision","mentality_penalties","mentality_composure",
    "defending_marking_awareness","defending_standing_tackle","defending_sliding_tackle",
    "goalkeeping_diving","goalkeeping_handling","goalkeeping_kicking","goalkeeping_positioning",
    "goalkeeping_reflexes",'wage_eur']

    target = FILE_NAME if output_path is None else output_path

    df  = pd.read_csv(filename)

    defense = ["LWB","RWB","CDM","LB","RB","CB"]
    attack = ['CF',"CM",'ST',"CAM",'LW','RM','RW','LM']

    # The first listed position decides the group; anything that is neither an
    # attacking nor a defending code falls through to "GOAL KEEPER".
    primary = (df["player_positions"]
               .str.replace(" ", "", regex=False)
               .str.split(",")
               .str[0])
    position = pd.Series("GOAL KEEPER", index=df.index, dtype="object")
    position[primary.isin(defense)] = "DEFENDER"
    position[primary.isin(attack)] = "ATTACKER"
    # A missing player_positions value previously crashed the row loop; leave
    # the label empty instead so the dropna() below discards that row.
    position[primary.isna()] = np.nan

    df['position'] = position
    df = df[cols + ["position"]]
    df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

    # index=False: the pandas row index is not data, and writing it produced an
    # extra unnamed first column in the cleaned file.
    df.to_csv(target, sep=',', encoding='utf-8', index=False)
    print("Formatted Data Written to the file : " , target)

# Hyperparameter Tuning for Random Forest

In [None]:

def bestRFParameters(train_features, train_label, estimators, depth):
    """Grid-search a scikit-learn random forest and print the outcome.

    A ``RandomForestRegressor`` is tuned with 5-fold ``GridSearchCV`` over every
    combination of ``estimators`` (``n_estimators``) and ``depth``
    (``max_depth``). The best combination is printed first, followed by the
    mean and standard deviation of the test score for each combination.

    Args:
        train_features: Feature matrix passed to ``GridSearchCV.fit``.
        train_label: Target; flattened with ``.values.ravel()`` before fitting.
        estimators: Candidate values for ``n_estimators``.
        depth: Candidate values for ``max_depth``.

    Returns:
        None. Everything is reported through ``print``.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GridSearchCV

    search_space = {"n_estimators": estimators, "max_depth": depth}
    grid_search = GridSearchCV(RandomForestRegressor(), search_space, cv = 5)
    grid_search.fit(train_features, train_label.values.ravel())

    print(f'Best parameters are: {grid_search.best_params_}')
    print("\n")

    outcome = grid_search.cv_results_
    per_candidate = zip(outcome['mean_test_score'],
                        outcome['std_test_score'],
                        outcome['params'])
    for mean_score, score_spread, candidate in per_candidate:
        print(f'{round(mean_score,3)} + or -{round(score_spread,3)} for the {candidate}')

# NOTE: this cell reads FILE_NAME, which is only written by dataCleanupAndFromat()
# in the "Importing Data" section further down. On a fresh kernel run that
# section first; the AnalysisException recorded below this cell is presumably
# the missing-file error from running the cells in document order.
data = spark.read.csv(FILE_NAME,inferSchema=True, header=True)
data = iqr_capping(data.toPandas(),1.5,spark)

if FILTER_PLAYER_POSITION == "DEFENDER" :
    cols = ['overall', 'defending', 'movement_reactions', 'mentality_interceptions','defending_sliding_tackle','wage_eur'] #DEFENDER
else :   
    cols = ['overall' ,'dribbling', 'skill_dribbling' , 'skill_ball_control','movement_reactions','wage_eur'] #ATTACKER

data = data.filter(data.position == FILTER_PLAYER_POSITION)
data = data.select(cols)

# Predictors must not contain the target: with 'wage_eur' among the inputs the
# grid search was scoring a forest that could read the answer directly.
feature_cols = [name for name in cols if name != 'wage_eur']

# Fixed seed so the tuning split is the same on every run.
(train, test) = data.randomSplit([0.8, 0.2], seed=42)
assembler=VectorAssembler().setInputCols(feature_cols).setOutputCol('features')
train_a = assembler.transform(train)
train_b = train_a.select("features",train_a.wage_eur.alias('label'))


X_train = train_a.select(feature_cols).toPandas()
y_train = train_b.select("label").toPandas()



bestRFParameters(X_train, y_train, [21,22,23,24,25],[2, 4, 6, 8]) # Takes some time...


AnalysisException: ignored

# Display Results

In [None]:
def getResult (model,lib) : 
    """Evaluate predictions with Spark's RegressionEvaluator on four metrics.

    Args:
        model: Object handed to ``RegressionEvaluator.evaluate``. The callers in
            this notebook pass the DataFrame returned by ``model.transform``,
            not a fitted model.
        lib: Short tag stored under the "lib" key to identify the result row.

    Returns:
        dict with the keys "lib", "r2", "mae", "mse" and "rmse", in that order.
    """
    evaluator = RegressionEvaluator()
    scores = {"lib": lib}
    # One evaluate() call per metric, selected through the metricName param.
    for metric in ("r2", "mae", "mse", "rmse"):
        scores[metric] = evaluator.evaluate(model, {evaluator.metricName: metric})
    return scores

# Importing Data

In [None]:
from google.colab import drive 
# Mount Google Drive so the raw CSV referenced by GDRIVE_FILE_URL is readable.
drive.mount('/content/gdrive')

# Write the cleaned dataset to FILE_NAME; the cells that call
# spark.read.csv(FILE_NAME, ...) depend on this file existing.
dataCleanupAndFromat(GDRIVE_FILE_URL)


In [None]:
# Load the cleaned CSV and cap wage outliers at 1.5 * IQR (iqr_capping also
# draws a box plot of the capped wages).
data = spark.read.csv(FILE_NAME,inferSchema=True, header=True)
data = iqr_capping(data.toPandas(),1.5,spark)

# Subset of columns handed to the correlation heatmap cell as `mapData`.
useful_columns =['sofifa_id','short_name','club_name','age','nationality_name','overall','potential' ,'dribbling', 'attacking_crossing','skill_dribbling', 'skill_ball_control',
 'movement_reactions' ,'defending', 'defending_standing_tackle', 'defending_sliding_tackle','mentality_interceptions','position','wage_eur']
mapData = data.select(useful_columns)



# Correlation Heatmap for Some of the Input Features

In [None]:
import seaborn as sns
plt.figure(figsize=(15,8))
# Correlate the numeric columns only. mapData also carries string columns
# (names, club, nationality, position); pandas >= 2.0 raises on those instead of
# silently skipping them, so select the numeric ones explicitly.
cor = mapData.toPandas().select_dtypes(include='number').corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()


# Data Handling


In [None]:

# Print the 5 best-scoring input features for the configured position group.
kBestInputFeatures(5, FILTER_PLAYER_POSITION, data )


In [None]:
# Keep only the players of the configured position group.
data = data.filter(data.position == FILTER_PLAYER_POSITION)

# Hard-coded predictor list per position group, with the 'wage_eur' target last.
cols = (
    ['overall', 'defending', 'movement_reactions', 'mentality_interceptions','defending_sliding_tackle','wage_eur'] #DEFENDER
    if FILTER_PLAYER_POSITION == "DEFENDER"
    else ['overall' ,'dribbling', 'skill_dribbling' , 'skill_ball_control','movement_reactions','wage_eur'] #ATTACKER
)

data2 = data.select(cols)
# (row count, column count) of the position-filtered frame before column selection.
print((data.count(), len(data.columns)))

In [None]:
# Fixed seed so the train/test split, and therefore every metric reported
# below, is the same on each run.
(train, test) = data2.randomSplit([0.8, 0.2], seed=42)
# Rebuild the feature list instead of cols.remove('wage_eur'): the in-place
# remove raised ValueError whenever this cell was executed a second time.
cols = [name for name in cols if name != 'wage_eur']
assembler=VectorAssembler().setInputCols(cols).setOutputCol('features')
train_a = assembler.transform(train)
train_b = train_a.select("features",train_a.wage_eur.alias('label'))
# train_b.show(truncate=False)
test_a =  assembler.transform(test)
test_b = test_a.select('features', test_a.wage_eur.alias('label'))

# Collects one metrics dict per model; re-running this cell resets it.
results = []


# Decision Tree Regression

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(maxDepth=5)

model = dt.fit(train_b)
test_dt = model.transform(test_b)
# test_dt.show(truncate=False)

results.append(getResult(test_dt,'dt'))

# Fetch labels and predictions with a single collect() so both lists come from
# the same Spark job and are aligned row by row.
rows = test_dt.select('label', 'prediction').collect()
wage = [int(row.label) for row in rows]
prediction = [int(row.prediction) for row in rows]

plt.plot(wage, label='Actual wage')
plt.plot(prediction, label='Predicted wage')
plt.title('Decision Tree Regression')
plt.xlabel('Test row')
plt.ylabel('wage_eur')
plt.legend()
plt.show()

# Random Forest Regression

In [None]:

from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(numTrees = 20, maxDepth= 5)
model = rf.fit(train_b)
test_rf = model.transform(test_b)
# test_rf.show(truncate=False)

results.append(getResult(test_rf,'rf'))

# Fetch labels and predictions with a single collect() so both lists come from
# the same Spark job and are aligned row by row.
rows = test_rf.select('label', 'prediction').collect()
wage = [int(row.label) for row in rows]
prediction = [int(row.prediction) for row in rows]

plt.plot(wage, label='Actual wage')
plt.plot(prediction, label='Predicted wage')
plt.title('Random Forest Regression')
plt.xlabel('Test row')
plt.ylabel('wage_eur')
plt.legend()
plt.show()

# Isotonic Regression

In [None]:

from pyspark.ml.regression import IsotonicRegression
# NOTE(review): IsotonicRegression fits on one feature of the vector
# (featureIndex, default 0) -- confirm that using only the first assembled
# column is intended.
ir = IsotonicRegression()
# test_b from the split/assemble cell is reused as-is; re-assembling the same
# `test` frame here produced an identical DataFrame.
model = ir.fit(train_b)
test_ir = model.transform(test_b)
# test_ir.show(truncate=False)
results.append(getResult(test_ir,'ir'))

# Fetch labels and predictions with a single collect() so both lists come from
# the same Spark job and are aligned row by row.
rows = test_ir.select('label', 'prediction').collect()
wage = [int(row.label) for row in rows]
prediction = [int(row.prediction) for row in rows]

plt.plot(wage, label='Actual wage')
plt.plot(prediction, label='Predicted wage')
plt.title('Isotonic Regression')
plt.xlabel('Test row')
plt.ylabel('wage_eur')
plt.legend()
plt.show()


# KNN Regression

In [None]:
# KNN with scikit-learn (the Spark frames are converted to pandas first)
def knn_regression(train_a,train_b,test_a,test_b, cols):
  """Fit a 10-nearest-neighbour regressor with scikit-learn and report its metrics.

  Args:
      train_a: Spark DataFrame holding the training feature columns ``cols``.
      train_b: Spark DataFrame holding the training target in a ``label`` column.
      test_a: Spark DataFrame holding the test feature columns ``cols``.
      test_b: Spark DataFrame holding the test target in a ``label`` column.
      cols: Names of the feature columns to use.

  Returns:
      dict with "lib" set to 'knn' and the float metrics "r2", "mae", "mse"
      and "rmse" computed on the test set. A plot of actual vs. predicted
      wages is shown as a side effect.
  """
  from sklearn.neighbors import KNeighborsRegressor
  from sklearn import metrics
  import numpy as np
  from matplotlib import pyplot as plt 

  X_train = train_a.select(cols).toPandas()
  # 1-D targets: the estimator then returns 1-D predictions directly.
  y_train = train_b.select("label").toPandas()['label'].values
  X_test = test_a.select(cols).toPandas()
  y_test = test_b.select("label").toPandas()['label'].values

  knn_model = KNeighborsRegressor(n_neighbors = 10 , algorithm = 'brute')
  knn_model.fit(X_train, y_train)
  knn_pred_wage = knn_model.predict(X_test).reshape(-1)

  r2 = metrics.r2_score(y_test, knn_pred_wage)
  mse = metrics.mean_squared_error(y_test, knn_pred_wage)
  mae = metrics.mean_absolute_error(y_test, knn_pred_wage)
  rmse = np.sqrt(mse)

  #Plotting Graph
  plt.plot(y_test, label='Actual wage')
  plt.plot(knn_pred_wage, label='Predicted wage')
  plt.title('KNN Regression')
  plt.xlabel('Test row')
  plt.ylabel('wage_eur')
  plt.legend()

  plt.show()

  return {"lib":'knn',"r2":float(r2),"mae":float(mae),"mse":float(mse),"rmse":float(rmse)}

results.append(knn_regression(train_a,train_b,test_a,test_b, cols))

# SVM

In [None]:
# SVM regression with scikit-learn (the Spark frames are converted to pandas first)
def smv_regression(train_a,train_b,test_a,test_b, cols):
  """Fit an RBF-kernel support vector regressor with scikit-learn and report its metrics.

  Args:
      train_a: Spark DataFrame holding the training feature columns ``cols``.
      train_b: Spark DataFrame holding the training target in a ``label`` column.
      test_a: Spark DataFrame holding the test feature columns ``cols``.
      test_b: Spark DataFrame holding the test target in a ``label`` column.
      cols: Names of the feature columns to use.

  Returns:
      dict with "lib" set to 'svm' and the float metrics "r2", "mae", "mse"
      and "rmse" computed on the test set. A plot of actual vs. predicted
      wages is shown as a side effect.
  """
  from sklearn.svm import SVR
  from sklearn import metrics
  import numpy as np
  from matplotlib import pyplot as plt 

  X_train = train_a.select(cols).toPandas()
  # SVR.fit expects a 1-D target; a single-column frame triggers a
  # DataConversionWarning, so flatten it up front.
  y_train = train_b.select("label").toPandas()['label'].values.ravel()
  X_test = test_a.select(cols).toPandas()
  y_test = test_b.select("label").toPandas()['label'].values.ravel()

  svm_model = SVR(kernel = 'rbf')

  svm_model.fit(X_train, y_train)
  svm_pred_wage = svm_model.predict(X_test).reshape(-1)

  r2 = metrics.r2_score(y_test, svm_pred_wage)
  mse = metrics.mean_squared_error(y_test, svm_pred_wage)
  mae = metrics.mean_absolute_error(y_test, svm_pred_wage)
  rmse = np.sqrt(mse)

  #Plotting Graph
  plt.plot(y_test, label='Actual wage')
  plt.plot(svm_pred_wage, label='Predicted wage')
  plt.title('SVM Regression')
  plt.xlabel('Test row')
  plt.ylabel('wage_eur')
  plt.legend()

  plt.show()

  return {"lib":'svm',"r2":float(r2),"mae":float(mae),"mse":float(mse),"rmse":float(rmse)}

results.append(smv_regression(train_a,train_b,test_a,test_b, cols))

# OVERALL RESULTS

In [None]:

# Turn the per-model metric dicts gathered in `results` into one Spark table.
results_df = spark.createDataFrame(results)
print("\n\n Data set Year : ", "2022","Position : ", FILTER_PLAYER_POSITION)
results_df.show()

# Club Wise Best Overall Performer's Wage Comparison

In [None]:
def clubWisePerformance():
    """Plot the wage of the highest-rated player(s) of each listed club.

    Reads the cleaned dataset from FILE_NAME through the notebook-level
    ``spark`` session, keeps the clubs listed below, and for every club selects
    the player(s) whose ``overall`` equals that club's maximum (ties are all
    kept). The wages are shown as a plotly bar chart coloured by club.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    import plotly.express as px
    club_names = ['Real Madrid CF','FC Barcelona','Manchester United','Juventus','Chelsea', 
                  'Paris Saint-Germain' ,'FC Bayern München', 'Arsenal', 'Liverpool', 'Manchester City']

    data_frame = spark.read.csv(FILE_NAME,inferSchema=True, header=True).toPandas()
    data_frame = data_frame.loc[data_frame.club_name.isin(club_names),
                                ['short_name','club_name','overall','wage_eur']]
    data_frame = data_frame.sort_values(by=['overall','club_name'], ascending=False)

    # Per-row club maximum, computed in one vectorised pass instead of a Python
    # loop with chained indexing.
    club_best = data_frame.groupby('club_name')['overall'].transform('max')
    top_performers = data_frame[data_frame['overall'] == club_best]

    # Passing the frame and column names (rather than bare lists) gives the
    # axes and legend meaningful labels.
    fig = px.bar(top_performers, x='short_name', y='wage_eur', color='club_name',
                 title="Club Wise Best Overall Performer's Wage Comparison")
    fig.show()

    
clubWisePerformance()

# Selecting Top K players within Given Budget

In [None]:
def selectTopKPlayersWithinBudget(k,pos,nation,max_budget,spark):
    """Plot the k highest-rated players of a position whose wage fits the budget.

    Args:
        k: Maximum number of players to show.
        pos: Value of the ``position`` column to keep (e.g. "ATTACKER").
        nation: Value of ``nationality_name`` to keep, or "all" for no
            nationality filter.
        max_budget: Upper bound (inclusive) on each player's ``wage_eur``.
        spark: SparkSession used to read the cleaned CSV at FILE_NAME.

    Returns:
        None. A plotly bar chart of the selected players' wages is displayed.
    """
    import plotly.express as px
    useful_columns =['sofifa_id','short_name','overall','age','wage_eur','potential','club_name','nationality_name']
    
    df = spark.read.csv(FILE_NAME,inferSchema=True, header=True).toPandas()
    if nation != "all":
        df = df[df['nationality_name'] == nation]
    # Boolean mask instead of a string-concatenated DataFrame.query(): a quote
    # in `pos` can no longer break (or alter) the filter expression.
    eligible = (df['position'] == pos) & (df['wage_eur'] <= max_budget)
    topKPlayers = df.loc[eligible, useful_columns]
    topKPlayers = topKPlayers.sort_values('overall', ascending=False).head(k)
    fig = px.bar(topKPlayers, x="short_name", y="wage_eur", color='nationality_name', title="TOP "+str(k)+" "+pos+'S Under '+str(max_budget)+" Euros")
    fig.show()      




country = "all" # all , Italy , Germany , Portugal , England , India ...
max_budget = 360000

selectTopKPlayersWithinBudget(10,"ATTACKER",country,max_budget,spark)

# Comparing Two Given Players

In [None]:
def comparePlayers(id1,id2):
    """Draw a radar chart comparing two players from the cleaned dataset.

    ``wage_eur`` is min-max scaled to the range 1-100 over the whole file so it
    shares the radial axis with the rating columns.

    Args:
        id1: ``sofifa_id`` of the first player (anything ``int()`` accepts).
        id2: ``sofifa_id`` of the second player.

    Returns:
        None. The plotly figure is displayed with ``fig.show()``.

    Raises:
        IndexError: If either id has no row in the cleaned dataset.
    """
    import plotly.graph_objects as go
    from sklearn.preprocessing import MinMaxScaler

    df = spark.read.csv(FILE_NAME,inferSchema=True, header=True).toPandas()

    scaler = MinMaxScaler(feature_range=(1 , 100))
    df[["wage_eur"]] = scaler.fit_transform(df[["wage_eur"]].values)

    categories = ["overall","potential", "pace", "shooting" ,"dribbling" ,"defending" ,"physic",'passing','wage_eur']

    fig = go.Figure()

    for player_id in (id1, id2):
        player = df[df["sofifa_id"]== int(player_id)]
        if player.empty:
            # Same exception type the bare `[0]` lookup used to raise, but with
            # a message that names the missing id.
            raise IndexError(f"No player with sofifa_id {player_id} in {FILE_NAME}")
        fig.add_trace(go.Scatterpolar(r= player[categories].to_numpy()[0],theta=categories,fill='toself',
                                      name = player['short_name'].values[0]))

    fig.update_layout(polar=dict(radialaxis=dict(visible=True,range=[1, 100])),showlegend = True,title='Player Comparison')

    fig.show()


comparePlayers(192985, 158023 ) #ids: 20801 , 158023 , 192985 , 261962 , 231747

In [None]:
# Shut down the Spark session; cells that use `spark` need it recreated afterwards.
spark.stop()