In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [3]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=0142e95fba4ea04b666d2b1108e6c8d68dedf85523366293c5ff22ebabba16ba
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import RegressionMetrics

In [5]:
!mkdir data
!cp /content/drive/MyDrive/BigDataProject/players_22_new.csv /content/data
!cp /content/drive/MyDrive/BigDataProject/players_22.csv /content/data

In [31]:
spark = SparkSession.builder\
        .master("local")\
        .appName("FIFA22")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

spark

In [32]:
df = spark.read.format("csv").load("data/players_22_new.csv", header=True, inferSchema=True)
df.printSchema()

root
 |-- wage_eur: double (nullable = true)
 |-- value_eur: double (nullable = true)
 |-- international_reputation: integer (nullable = true)
 |-- overall: integer (nullable = true)
 |-- movement_reactions: integer (nullable = true)
 |-- potential: integer (nullable = true)
 |-- mentality_composure: integer (nullable = true)
 |-- mentality_vision: integer (nullable = true)
 |-- power_shot_power: integer (nullable = true)
 |-- attacking_short_passing: integer (nullable = true)
 |-- skill_long_passing: integer (nullable = true)
 |-- skill_ball_control: integer (nullable = true)
 |-- skill_curve: integer (nullable = true)
 |-- skill_moves: integer (nullable = true)
 |-- attacking_volleys: integer (nullable = true)
 |-- club_name: integer (nullable = true)
 |-- league_name: integer (nullable = true)
 |-- league_level: double (nullable = true)
 |-- club_position: integer (nullable = true)
 |-- nationality_name: integer (nullable = true)
 |-- preferred_foot: integer (nullable = true)
 |-- b

In [33]:
df.show()

+--------+---------+------------------------+-------+------------------+---------+-------------------+----------------+----------------+-----------------------+------------------+------------------+-----------+-----------+-----------------+---------+-----------+------------+-------------+----------------+--------------+---------+
|wage_eur|value_eur|international_reputation|overall|movement_reactions|potential|mentality_composure|mentality_vision|power_shot_power|attacking_short_passing|skill_long_passing|skill_ball_control|skill_curve|skill_moves|attacking_volleys|club_name|league_name|league_level|club_position|nationality_name|preferred_foot|body_type|
+--------+---------+------------------------+-------+------------------+---------+-------------------+----------------+----------------+-----------------------+------------------+------------------+-----------+-----------+-----------------+---------+-----------+------------+-------------+----------------+--------------+---------+
|320

In [34]:
df.columns

['wage_eur',
 'value_eur',
 'international_reputation',
 'overall',
 'movement_reactions',
 'potential',
 'mentality_composure',
 'mentality_vision',
 'power_shot_power',
 'attacking_short_passing',
 'skill_long_passing',
 'skill_ball_control',
 'skill_curve',
 'skill_moves',
 'attacking_volleys',
 'club_name',
 'league_name',
 'league_level',
 'club_position',
 'nationality_name',
 'preferred_foot',
 'body_type']

In [35]:
train, test = df.randomSplit([0.8, 0.2])
train ,test


(DataFrame[wage_eur: double, value_eur: double, international_reputation: int, overall: int, movement_reactions: int, potential: int, mentality_composure: int, mentality_vision: int, power_shot_power: int, attacking_short_passing: int, skill_long_passing: int, skill_ball_control: int, skill_curve: int, skill_moves: int, attacking_volleys: int, club_name: int, league_name: int, league_level: double, club_position: int, nationality_name: int, preferred_foot: int, body_type: int],
 DataFrame[wage_eur: double, value_eur: double, international_reputation: int, overall: int, movement_reactions: int, potential: int, mentality_composure: int, mentality_vision: int, power_shot_power: int, attacking_short_passing: int, skill_long_passing: int, skill_ball_control: int, skill_curve: int, skill_moves: int, attacking_volleys: int, club_name: int, league_name: int, league_level: double, club_position: int, nationality_name: int, preferred_foot: int, body_type: int])

In [36]:
categorical_columns = ['club_name','league_name','club_position','nationality_name','preferred_foot','body_type','league_level']

features_list = df.columns[1:]

numerical_features_list = [entry for entry in features_list if entry not in categorical_columns]
print(f'numerical_features_list: {numerical_features_list}')

numerical_features_list: ['value_eur', 'international_reputation', 'overall', 'movement_reactions', 'potential', 'mentality_composure', 'mentality_vision', 'power_shot_power', 'attacking_short_passing', 'skill_long_passing', 'skill_ball_control', 'skill_curve', 'skill_moves', 'attacking_volleys']


In [37]:
vector_assembler = VectorAssembler(inputCols=numerical_features_list,
                                             outputCol='numerical_feature_vector')

train = vector_assembler.transform(train)
test = vector_assembler.transform(test)

In [38]:
train.show(3)
train.select('numerical_feature_vector').take(3)

+--------+---------+------------------------+-------+------------------+---------+-------------------+----------------+----------------+-----------------------+------------------+------------------+-----------+-----------+-----------------+---------+-----------+------------+-------------+----------------+--------------+---------+------------------------+
|wage_eur|value_eur|international_reputation|overall|movement_reactions|potential|mentality_composure|mentality_vision|power_shot_power|attacking_short_passing|skill_long_passing|skill_ball_control|skill_curve|skill_moves|attacking_volleys|club_name|league_name|league_level|club_position|nationality_name|preferred_foot|body_type|numerical_feature_vector|
+--------+---------+------------------------+-------+------------------+---------+-------------------+----------------+----------------+-----------------------+------------------+------------------+-----------+-----------+-----------------+---------+-----------+------------+-----------

[Row(numerical_feature_vector=DenseVector([15000.0, 1.0, 49.0, 50.0, 49.0, 55.0, 56.0, 54.0, 52.0, 50.0, 50.0, 58.0, 3.0, 40.0])),
 Row(numerical_feature_vector=DenseVector([15000.0, 1.0, 55.0, 46.0, 55.0, 57.0, 14.0, 41.0, 23.0, 31.0, 17.0, 10.0, 1.0, 17.0])),
 Row(numerical_feature_vector=DenseVector([20000.0, 1.0, 57.0, 53.0, 57.0, 29.0, 32.0, 43.0, 40.0, 45.0, 19.0, 12.0, 1.0, 11.0]))]

In [40]:
scaler = StandardScaler(inputCol='numerical_feature_vector',
                        outputCol='scaled_feature_vector',
                        withStd=True, withMean=True)

scaler = scaler.fit(train)

train = scaler.transform(train)
test = scaler.transform(test)

In [41]:
train.show(3)
train.select('scaled_feature_vector').take(3)

+--------+---------+------------------------+-------+------------------+---------+-------------------+----------------+----------------+-----------------------+------------------+------------------+-----------+-----------+-----------------+---------+-----------+------------+-------------+----------------+--------------+---------+------------------------+---------------------+
|wage_eur|value_eur|international_reputation|overall|movement_reactions|potential|mentality_composure|mentality_vision|power_shot_power|attacking_short_passing|skill_long_passing|skill_ball_control|skill_curve|skill_moves|attacking_volleys|club_name|league_name|league_level|club_position|nationality_name|preferred_foot|body_type|numerical_feature_vector|scaled_feature_vector|
+--------+---------+------------------------+-------+------------------+---------+-------------------+----------------+----------------+-----------------------+------------------+------------------+-----------+-----------+-----------------+--

[Row(scaled_feature_vector=DenseVector([-0.3709, -0.2508, -2.4333, -1.2587, -3.6333, -0.2335, 0.1589, -0.2771, -0.4704, -0.1999, -0.5055, 0.5985, 0.8451, -0.134])),
 Row(scaled_feature_vector=DenseVector([-0.3709, -0.2508, -1.5606, -1.7013, -2.6447, -0.0685, -2.925, -1.2606, -2.4789, -1.4682, -2.4927, -2.0497, -1.7626, -1.4411])),
 Row(scaled_feature_vector=DenseVector([-0.3703, -0.2508, -1.2697, -0.9267, -2.3152, -2.3785, -1.6033, -1.1093, -1.3015, -0.5336, -2.3723, -1.9393, -1.7626, -1.7821]))]

In [42]:
categorical_columns_onehot = [string+'_encoded' for string in categorical_columns]

one_hot_encoder = OneHotEncoder(inputCols=categorical_columns,
                                outputCols=categorical_columns_onehot)

one_hot_encoder = one_hot_encoder.fit(train)

train = one_hot_encoder.transform(train)
test = one_hot_encoder.transform(test)

assembler = VectorAssembler(inputCols = categorical_columns_onehot, outputCol = 'categorical_onehot_feature')
train = assembler.transform(train)
test = assembler.transform(test)

train.show(3)

+--------+---------+------------------------+-------+------------------+---------+-------------------+----------------+----------------+-----------------------+------------------+------------------+-----------+-----------+-----------------+---------+-----------+------------+-------------+----------------+--------------+---------+------------------------+---------------------+-----------------+-------------------+---------------------+------------------------+----------------------+-----------------+--------------------+
|wage_eur|value_eur|international_reputation|overall|movement_reactions|potential|mentality_composure|mentality_vision|power_shot_power|attacking_short_passing|skill_long_passing|skill_ball_control|skill_curve|skill_moves|attacking_volleys|club_name|league_name|league_level|club_position|nationality_name|preferred_foot|body_type|numerical_feature_vector|scaled_feature_vector|club_name_encoded|league_name_encoded|club_position_encoded|nationality_name_encoded|preferred_f

In [44]:
assembler = VectorAssembler(inputCols=['categorical_onehot_feature', 'scaled_feature_vector'],
                            outputCol='final_feature_vector')

train = assembler.transform(train)
test = assembler.transform(test)

In [45]:
train.describe().show()
test.describe().show()

+-------+-----------------+------------------+------------------------+-----------------+------------------+-----------------+-------------------+------------------+------------------+-----------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+
|summary|         wage_eur|         value_eur|international_reputation|          overall|movement_reactions|        potential|mentality_composure|  mentality_vision|  power_shot_power|attacking_short_passing|skill_long_passing|skill_ball_control|       skill_curve|       skill_moves|attacking_volleys|         club_name|       league_name|      league_level|     club_position| nationality_name|    preferred_foot|         body_type|
+-------+-----------------+------------------+------------------------+-----------------+------------------+-----------------+----

In [46]:
lr = LinearRegression(featuresCol='final_feature_vector',
                      labelCol='wage_eur')

lr = lr.fit(train)
pred_train_df = lr.transform(train).withColumnRenamed('prediction',
                                                      'predicted_wage')

pred_train_df.show(5)

+--------+---------+------------------------+-------+------------------+---------+-------------------+----------------+----------------+-----------------------+------------------+------------------+-----------+-----------+-----------------+---------+-----------+------------+-------------+----------------+--------------+---------+------------------------+---------------------+-----------------+-------------------+---------------------+------------------------+----------------------+-----------------+--------------------+--------------------------+--------------------+-------------------+
|wage_eur|value_eur|international_reputation|overall|movement_reactions|potential|mentality_composure|mentality_vision|power_shot_power|attacking_short_passing|skill_long_passing|skill_ball_control|skill_curve|skill_moves|attacking_volleys|club_name|league_name|league_level|club_position|nationality_name|preferred_foot|body_type|numerical_feature_vector|scaled_feature_vector|club_name_encoded|league_nam

In [47]:
pred_test_df = lr.transform(train).withColumnRenamed('prediction', 'predicted_wage')
pred_test_df.show(5)

+--------+---------+------------------------+-------+------------------+---------+-------------------+----------------+----------------+-----------------------+------------------+------------------+-----------+-----------+-----------------+---------+-----------+------------+-------------+----------------+--------------+---------+------------------------+---------------------+-----------------+-------------------+---------------------+------------------------+----------------------+-----------------+--------------------+--------------------------+--------------------+-------------------+
|wage_eur|value_eur|international_reputation|overall|movement_reactions|potential|mentality_composure|mentality_vision|power_shot_power|attacking_short_passing|skill_long_passing|skill_ball_control|skill_curve|skill_moves|attacking_volleys|club_name|league_name|league_level|club_position|nationality_name|preferred_foot|body_type|numerical_feature_vector|scaled_feature_vector|club_name_encoded|league_nam

In [48]:
pred_test_pd_df = pred_test_df.toPandas()

pred_test_pd_df.head(2)

Unnamed: 0,wage_eur,value_eur,international_reputation,overall,movement_reactions,potential,mentality_composure,mentality_vision,power_shot_power,attacking_short_passing,...,club_name_encoded,league_name_encoded,club_position_encoded,nationality_name_encoded,preferred_foot_encoded,body_type_encoded,league_level_encoded,categorical_onehot_feature,final_feature_vector,predicted_wage
0,500.0,15000.0,1,49,50,49,55,56,54,52,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",(0.0),"(0.0, 1.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-1063.645918
1,500.0,15000.0,1,55,46,55,57,14,41,23,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",(0.0),"(0.0, 1.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-747.471626


In [49]:
predictions_actuals = pred_test_df[['predicted_wage','wage_eur']]
predictions_actuals_rdd = predictions_actuals.rdd
predictions_actuals_rdd = predictions_actuals_rdd.map(tuple)
predictions_actuals_rdd.take(2)

[(-1063.645918484941, 500.0), (-747.4716257129257, 500.0)]

In [50]:
metrics = RegressionMetrics(predictions_actuals_rdd)

print(f'''Mean Squared Error: {metrics.meanSquaredError}
          Root Mean Squared Error: {metrics.rootMeanSquaredError}
          Mean Absolute Error:{metrics.meanAbsoluteError}
          R**2: {metrics.r2}
      ''')



Mean Squared Error: 47274300.805687815
          Root Mean Squared Error: 6875.630938734846
          Mean Absolute Error:3380.6842863376787
          R**2: 0.8782621230575269
      


R-squared values range from 0 to 1. A value of 0 indicates that the response variable cannot be explained by the predictor variable at all, while a value of 1 indicates that the response variable can be perfectly explained without error by the predictor variable