In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [3]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=c6ecacdfdf4f7e4cbed5b1048d77f6cf55d1d2e2c5f09f9c19c576289b0c41c4
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import RegressionMetrics

In [5]:
!mkdir data
!cp /content/drive/MyDrive/BigDataProject/players_22_new.csv /content/data
!cp /content/drive/MyDrive/BigDataProject/players_22.csv /content/data

In [6]:
spark = SparkSession.builder\
        .master("local")\
        .appName("FIFA22")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

spark

In [7]:
df = spark.read.format("csv").load("data/players_22_new.csv", header=True, inferSchema=True)
df.printSchema()

root
 |-- wage_eur: double (nullable = true)
 |-- value_eur: double (nullable = true)
 |-- international_reputation: integer (nullable = true)
 |-- overall: integer (nullable = true)
 |-- movement_reactions: integer (nullable = true)
 |-- potential: integer (nullable = true)
 |-- mentality_composure: integer (nullable = true)
 |-- mentality_vision: integer (nullable = true)
 |-- power_shot_power: integer (nullable = true)
 |-- attacking_short_passing: integer (nullable = true)
 |-- skill_long_passing: integer (nullable = true)
 |-- skill_ball_control: integer (nullable = true)
 |-- skill_curve: integer (nullable = true)
 |-- skill_moves: integer (nullable = true)
 |-- attacking_volleys: integer (nullable = true)
 |-- club_name: integer (nullable = true)
 |-- league_name: integer (nullable = true)
 |-- league_level: double (nullable = true)
 |-- club_position: integer (nullable = true)
 |-- nationality_name: integer (nullable = true)
 |-- preferred_foot: integer (nullable = true)
 |-- b

In [8]:
df.show()

+--------+---------+------------------------+-------+------------------+---------+-------------------+----------------+----------------+-----------------------+------------------+------------------+-----------+-----------+-----------------+---------+-----------+------------+-------------+----------------+--------------+---------+
|wage_eur|value_eur|international_reputation|overall|movement_reactions|potential|mentality_composure|mentality_vision|power_shot_power|attacking_short_passing|skill_long_passing|skill_ball_control|skill_curve|skill_moves|attacking_volleys|club_name|league_name|league_level|club_position|nationality_name|preferred_foot|body_type|
+--------+---------+------------------------+-------+------------------+---------+-------------------+----------------+----------------+-----------------------+------------------+------------------+-----------+-----------+-----------------+---------+-----------+------------+-------------+----------------+--------------+---------+
|320

In [9]:
df.columns

['wage_eur',
 'value_eur',
 'international_reputation',
 'overall',
 'movement_reactions',
 'potential',
 'mentality_composure',
 'mentality_vision',
 'power_shot_power',
 'attacking_short_passing',
 'skill_long_passing',
 'skill_ball_control',
 'skill_curve',
 'skill_moves',
 'attacking_volleys',
 'club_name',
 'league_name',
 'league_level',
 'club_position',
 'nationality_name',
 'preferred_foot',
 'body_type']

In [10]:
train, test = df.randomSplit([0.8, 0.2])
train ,test


(DataFrame[wage_eur: double, value_eur: double, international_reputation: int, overall: int, movement_reactions: int, potential: int, mentality_composure: int, mentality_vision: int, power_shot_power: int, attacking_short_passing: int, skill_long_passing: int, skill_ball_control: int, skill_curve: int, skill_moves: int, attacking_volleys: int, club_name: int, league_name: int, league_level: double, club_position: int, nationality_name: int, preferred_foot: int, body_type: int],
 DataFrame[wage_eur: double, value_eur: double, international_reputation: int, overall: int, movement_reactions: int, potential: int, mentality_composure: int, mentality_vision: int, power_shot_power: int, attacking_short_passing: int, skill_long_passing: int, skill_ball_control: int, skill_curve: int, skill_moves: int, attacking_volleys: int, club_name: int, league_name: int, league_level: double, club_position: int, nationality_name: int, preferred_foot: int, body_type: int])

In [11]:
categorical_columns = ['club_name','league_name','club_position','nationality_name','preferred_foot','body_type','league_level']

features_list = df.columns[1:]

numerical_features_list = [entry for entry in features_list if entry not in categorical_columns]
print(f'numerical_features_list: {numerical_features_list}')

numerical_features_list: ['value_eur', 'international_reputation', 'overall', 'movement_reactions', 'potential', 'mentality_composure', 'mentality_vision', 'power_shot_power', 'attacking_short_passing', 'skill_long_passing', 'skill_ball_control', 'skill_curve', 'skill_moves', 'attacking_volleys']


In [12]:
vector_assembler = VectorAssembler(inputCols=numerical_features_list,
                                             outputCol='numerical_feature_vector')

train = vector_assembler.transform(train)
test = vector_assembler.transform(test)

In [13]:
train.show(3)
train.select('numerical_feature_vector').take(3)

+--------+---------+------------------------+-------+------------------+---------+-------------------+----------------+----------------+-----------------------+------------------+------------------+-----------+-----------+-----------------+---------+-----------+------------+-------------+----------------+--------------+---------+------------------------+
|wage_eur|value_eur|international_reputation|overall|movement_reactions|potential|mentality_composure|mentality_vision|power_shot_power|attacking_short_passing|skill_long_passing|skill_ball_control|skill_curve|skill_moves|attacking_volleys|club_name|league_name|league_level|club_position|nationality_name|preferred_foot|body_type|numerical_feature_vector|
+--------+---------+------------------------+-------+------------------+---------+-------------------+----------------+----------------+-----------------------+------------------+------------------+-----------+-----------+-----------------+---------+-----------+------------+-----------

[Row(numerical_feature_vector=DenseVector([15000.0, 1.0, 49.0, 50.0, 49.0, 55.0, 56.0, 54.0, 52.0, 50.0, 50.0, 58.0, 3.0, 40.0])),
 Row(numerical_feature_vector=DenseVector([15000.0, 1.0, 55.0, 46.0, 55.0, 57.0, 14.0, 41.0, 23.0, 31.0, 17.0, 10.0, 1.0, 17.0])),
 Row(numerical_feature_vector=DenseVector([20000.0, 1.0, 57.0, 53.0, 57.0, 29.0, 32.0, 43.0, 40.0, 45.0, 19.0, 12.0, 1.0, 11.0]))]

In [14]:
scaler = StandardScaler(inputCol='numerical_feature_vector',
                        outputCol='scaled_feature_vector',
                        withStd=True, withMean=True)

scaler = scaler.fit(train)

train = scaler.transform(train)
test = scaler.transform(test)

In [15]:
train.show(3)
train.select('scaled_feature_vector').take(3)

+--------+---------+------------------------+-------+------------------+---------+-------------------+----------------+----------------+-----------------------+------------------+------------------+-----------+-----------+-----------------+---------+-----------+------------+-------------+----------------+--------------+---------+------------------------+---------------------+
|wage_eur|value_eur|international_reputation|overall|movement_reactions|potential|mentality_composure|mentality_vision|power_shot_power|attacking_short_passing|skill_long_passing|skill_ball_control|skill_curve|skill_moves|attacking_volleys|club_name|league_name|league_level|club_position|nationality_name|preferred_foot|body_type|numerical_feature_vector|scaled_feature_vector|
+--------+---------+------------------------+-------+------------------+---------+-------------------+----------------+----------------+-----------------------+------------------+------------------+-----------+-----------+-----------------+--

[Row(scaled_feature_vector=DenseVector([-0.3855, -0.2553, -2.4323, -1.2658, -3.635, -0.2389, 0.1483, -0.2901, -0.4752, -0.2028, -0.5093, 0.5874, 0.8405, -0.1384])),
 Row(scaled_feature_vector=DenseVector([-0.3855, -0.2553, -1.5619, -1.7077, -2.6496, -0.0743, -2.9296, -1.2767, -2.4763, -1.466, -2.487, -2.0465, -1.7579, -1.4417])),
 Row(scaled_feature_vector=DenseVector([-0.3849, -0.2553, -1.2718, -0.9343, -2.3212, -2.3784, -1.6105, -1.125, -1.3032, -0.5352, -2.3672, -1.9367, -1.7579, -1.7817]))]

In [16]:
categorical_columns_onehot = [string+'_encoded' for string in categorical_columns]

one_hot_encoder = OneHotEncoder(inputCols=categorical_columns,
                                outputCols=categorical_columns_onehot)

one_hot_encoder = one_hot_encoder.fit(train)

train = one_hot_encoder.transform(train)
test = one_hot_encoder.transform(test)

assembler = VectorAssembler(inputCols = categorical_columns_onehot, outputCol = 'categorical_onehot_feature')
train = assembler.transform(train)
test = assembler.transform(test)

train.show(3)

+--------+---------+------------------------+-------+------------------+---------+-------------------+----------------+----------------+-----------------------+------------------+------------------+-----------+-----------+-----------------+---------+-----------+------------+-------------+----------------+--------------+---------+------------------------+---------------------+-----------------+-------------------+---------------------+------------------------+----------------------+-----------------+--------------------+--------------------------+
|wage_eur|value_eur|international_reputation|overall|movement_reactions|potential|mentality_composure|mentality_vision|power_shot_power|attacking_short_passing|skill_long_passing|skill_ball_control|skill_curve|skill_moves|attacking_volleys|club_name|league_name|league_level|club_position|nationality_name|preferred_foot|body_type|numerical_feature_vector|scaled_feature_vector|club_name_encoded|league_name_encoded|club_position_encoded|nationali

In [17]:
assembler = VectorAssembler(inputCols=['categorical_onehot_feature', 'scaled_feature_vector'],
                            outputCol='final_feature_vector')

train = assembler.transform(train)
test = assembler.transform(test)

In [18]:
train.describe().show()
test.describe().show()

+-------+------------------+-----------------+------------------------+-----------------+------------------+-----------------+-------------------+------------------+------------------+-----------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|summary|          wage_eur|        value_eur|international_reputation|          overall|movement_reactions|        potential|mentality_composure|  mentality_vision|  power_shot_power|attacking_short_passing|skill_long_passing|skill_ball_control|       skill_curve|       skill_moves| attacking_volleys|         club_name|       league_name|      league_level|     club_position|  nationality_name|    preferred_foot|         body_type|
+-------+------------------+-----------------+------------------------+-----------------+------------------+-----------------+

R-squared values range from 0 to 1. A value of 0 indicates that the response variable cannot be explained by the predictor variable at all, while a value of 1 indicates that the response variable can be perfectly explained without error by the predictor variable

In [25]:
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(featuresCol='final_feature_vector',
                      labelCol='wage_eur')
model = dt.fit(train)


In [27]:
test_dt = model.transform(test)
test_dt.show(truncate=False)


+--------+---------+------------------------+-------+------------------+---------+-------------------+----------------+----------------+-----------------------+------------------+------------------+-----------+-----------+-----------------+---------+-----------+------------+-------------+----------------+--------------+---------+-------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+-------------------+---------------------+------------------------+----------------------+-----------------+--------------------+-----------------------------------------------------------------+---------------------------------------------------------------------------------------------------

In [33]:
pred_test_pd_df = test_dt.toPandas()

pred_test_pd_df.head(2)
pred_test_pd_df.prediction.min()

1426.6808964781217

In [29]:
predictions_actuals = test_dt[['prediction','wage_eur']]
predictions_actuals_rdd = predictions_actuals.rdd
predictions_actuals_rdd = predictions_actuals_rdd.map(tuple)
predictions_actuals_rdd.take(2)

[(1426.6808964781217, 500.0), (1426.6808964781217, 500.0)]

In [30]:
metrics = RegressionMetrics(predictions_actuals_rdd)

print(f'''Mean Squared Error: {metrics.meanSquaredError}
          Root Mean Squared Error: {metrics.rootMeanSquaredError}
          Mean Absolute Error:{metrics.meanAbsoluteError}
          R**2: {metrics.r2}
      ''')



Mean Squared Error: 87134811.75874084
          Root Mean Squared Error: 9334.602924535186
          Mean Absolute Error:4068.312133041912
          R**2: 0.7946674775591065
      
