# Import libraries

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib
import pandas_profiling as pf

from io import BytesIO
from pandas.io import sql
from sqlalchemy import create_engine
from mysql import connector
from google.cloud import storage
from google.cloud import bigquery

%matplotlib inline
import matplotlib.pyplot as plt
#plt.switch_backend('agg')

# Do not truncate the column list when a wide DataFrame is displayed
pd.set_option('display.max_columns', None)

# Configure connection with Cloud SQL

In [2]:
# Connection settings for the Cloud SQL (MySQL) instance
user = 'ea-developer'
host = '35.205.32.16'
port = '3306'
db = 'ea_datalake'

# The password slot of the URL (between ':' and '@') is left empty
connection_url = 'mysql+mysqlconnector://{0}:@{1}:{2}/{3}'.format(user, host, port, db)
database_connection = create_engine(connection_url)

# Configure connection with Bigquery

In [3]:
# BigQuery client created with default arguments
bigquery_client = bigquery.Client()

Create a new dataset associated with the project

In [4]:
# Dataset identifier in '<project>.<dataset>' form
dataset_id = 'shining-bearing-255613.ea'
dataset = bigquery.Dataset(dataset_id)
dataset.location = "EU"
# exists_ok=True keeps this cell re-runnable: without it create_dataset raises
# a Conflict error as soon as the dataset already exists (e.g. on the second
# "Restart & Run All").
dataset = bigquery_client.create_dataset(dataset, exists_ok=True)

# Configure connection with Cloud Storage

In [5]:
# Instantiate a Cloud Storage client with default arguments
storage_client = storage.Client()

# Bucket name. NOTE(review): no bucket is created in the cells shown here;
# presumably this is the name passed to upload_blob / list_blobs - verify against callers
bucket_name = 'ea-datalake-dev'

In [6]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Cloud Storage bucket and print a confirmation.

    bucket_name: name of the bucket to look up.
    source_file_name: path of the local file to upload.
    destination_blob_name: name of the blob the file is stored under.
    """
    # A fresh client is created on every call, as in the original helper
    target = storage.Client().get_bucket(bucket_name).blob(destination_blob_name)
    target.upload_from_filename(source_file_name)

    confirmation = 'File {} uploaded to {}.'.format(source_file_name, destination_blob_name)
    print(confirmation)
    
def list_blobs(bucket_name):
    """Print the name of every blob stored in the given bucket, one per line."""
    # A fresh client is created on every call, as in the original helper
    bucket = storage.Client().get_bucket(bucket_name)

    for entry in bucket.list_blobs():
        print(entry.name)

# Data Quality

Extract data about FIFA 18 players, previously loaded into our Cloud SQL database

In [7]:
# Read every row and column of the ea_complete_dataset table into a DataFrame
df_players = pd.read_sql('SELECT * FROM ea_complete_dataset', con=database_connection)

View some descriptive information about our dataframe

In [8]:
# (number of rows, number of columns) of the loaded table
print(df_players.shape)

(17981, 74)


In [9]:
# Column labels of the loaded table
print(df_players.columns)

Index(['Name', 'Age', 'Photo', 'Nationality', 'Flag', 'Overall', 'Potential',
       'Club', 'Club Logo', 'Value', 'Wage', 'Special', 'Acceleration',
       'Aggression', 'Agility', 'Balance', 'Ball control', 'Composure',
       'Crossing', 'Curve', 'Dribbling', 'Finishing', 'Free kick accuracy',
       'GK diving', 'GK handling', 'GK kicking', 'GK positioning',
       'GK reflexes', 'Heading accuracy', 'Interceptions', 'Jumping',
       'Long passing', 'Long shots', 'Marking', 'Penalties', 'Positioning',
       'Reactions', 'Short passing', 'Shot power', 'Sliding tackle',
       'Sprint speed', 'Stamina', 'Standing tackle', 'Strength', 'Vision',
       'Volleys', 'CAM', 'CB', 'CDM', 'CF', 'CM', 'ID', 'LAM', 'LB', 'LCB',
       'LCM', 'LDM', 'LF', 'LM', 'LS', 'LW', 'LWB', 'Preferred Positions',
       'RAM', 'RB', 'RCB', 'RCM', 'RDM', 'RF', 'RM', 'RS', 'RW', 'RWB', 'ST'],
      dtype='object')


In [10]:
# Preview the first rows
df_players.head()

Unnamed: 0,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,Value,Wage,Special,Acceleration,Aggression,Agility,Balance,Ball control,Composure,Crossing,Curve,Dribbling,Finishing,Free kick accuracy,GK diving,GK handling,GK kicking,GK positioning,GK reflexes,Heading accuracy,Interceptions,Jumping,Long passing,Long shots,Marking,Penalties,Positioning,Reactions,Short passing,Shot power,Sliding tackle,Sprint speed,Stamina,Standing tackle,Strength,Vision,Volleys,CAM,CB,CDM,CF,CM,ID,LAM,LB,LCB,LCM,LDM,LF,LM,LS,LW,LWB,Preferred Positions,RAM,RB,RCB,RCM,RDM,RF,RM,RS,RW,RWB,ST
0,Cristiano Ronaldo,32,https://cdn.sofifa.org/48/18/players/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Real Madrid CF,https://cdn.sofifa.org/24/18/teams/243.png,€95.5M,€565K,2228,89,63,89,63,93,95,85,81,91,94,76,7,11,15,14,11,88,29,95,77,92,22,85,95,96,83,94,23,91,92,31,80,85,88,89.0,53.0,62.0,91.0,82.0,20801,89.0,61.0,53.0,82.0,62.0,91.0,89.0,92.0,91.0,66.0,ST LW,89.0,61.0,53.0,82.0,62.0,91.0,89.0,92.0,91.0,66.0,92.0
1,L. Messi,30,https://cdn.sofifa.org/48/18/players/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,93,93,FC Barcelona,https://cdn.sofifa.org/24/18/teams/241.png,€105M,€565K,2154,92,48,90,95,95,96,77,89,97,95,90,6,11,15,14,8,71,22,68,87,88,13,74,93,95,88,85,26,87,73,28,59,90,85,92.0,45.0,59.0,92.0,84.0,158023,92.0,57.0,45.0,84.0,59.0,92.0,90.0,88.0,91.0,62.0,RW,92.0,57.0,45.0,84.0,59.0,92.0,90.0,88.0,91.0,62.0,88.0
2,Neymar,25,https://cdn.sofifa.org/48/18/players/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,94,Paris Saint-Germain,https://cdn.sofifa.org/24/18/teams/73.png,€123M,€280K,2100,94,56,96,82,95,92,75,81,96,89,84,9,9,15,15,11,62,36,61,75,77,21,81,90,88,81,80,33,90,78,24,53,80,83,88.0,46.0,59.0,88.0,79.0,190871,88.0,59.0,46.0,79.0,59.0,88.0,87.0,84.0,89.0,64.0,LW,88.0,59.0,46.0,79.0,59.0,88.0,87.0,84.0,89.0,64.0,84.0
3,L. Suárez,30,https://cdn.sofifa.org/48/18/players/176580.png,Uruguay,https://cdn.sofifa.org/flags/60.png,92,92,FC Barcelona,https://cdn.sofifa.org/24/18/teams/241.png,€97M,€510K,2291,88,78,86,60,91,83,77,86,86,94,84,27,25,31,33,37,77,41,69,64,86,30,85,92,93,83,87,38,77,89,45,80,84,88,87.0,58.0,65.0,88.0,80.0,176580,87.0,64.0,58.0,80.0,65.0,88.0,85.0,88.0,87.0,68.0,ST,87.0,64.0,58.0,80.0,65.0,88.0,85.0,88.0,87.0,68.0,88.0
4,M. Neuer,31,https://cdn.sofifa.org/48/18/players/167495.png,Germany,https://cdn.sofifa.org/flags/21.png,92,92,FC Bayern Munich,https://cdn.sofifa.org/24/18/teams/21.png,€61M,€230K,1493,58,29,52,35,48,70,15,14,30,13,11,91,90,95,91,89,25,30,78,59,16,10,47,12,85,55,25,11,61,44,10,83,70,11,,,,,,167495,,,,,,,,,,,GK,,,,,,,,,,,


In [11]:
# Descriptive statistics (count, mean, std, quartiles, min, max) per numeric column
df_players.describe()

Unnamed: 0,Age,Overall,Potential,Special,CAM,CB,CDM,CF,CM,ID,LAM,LB,LCB,LCM,LDM,LF,LM,LS,LW,LWB,RAM,RB,RCB,RCM,RDM,RF,RM,RS,RW,RWB,ST
count,17981.0,17981.0,17981.0,17981.0,15952.0,15952.0,15952.0,15952.0,15952.0,17981.0,15952.0,15952.0,15952.0,15952.0,15952.0,15952.0,15952.0,15952.0,15952.0,15952.0,15952.0,15952.0,15952.0,15952.0,15952.0,15952.0,15952.0,15952.0,15952.0,15952.0,15952.0
mean,25.144541,66.247984,71.190813,1594.0951,59.251755,55.550464,56.865283,59.030028,58.506833,207658.710138,59.251755,56.979689,55.550464,58.506833,56.865283,59.030028,60.057736,58.20405,59.359265,57.698721,59.251755,56.979689,55.550464,58.506833,56.865283,59.030028,60.057736,58.20405,59.359265,57.698721,58.20405
std,4.614272,6.987965,6.102199,272.151435,9.880164,12.192579,10.310178,9.926988,8.88804,32291.667313,9.880164,9.791627,12.192579,8.88804,10.310178,9.926988,9.34918,9.181392,9.978084,9.142825,9.880164,9.791627,12.192579,8.88804,10.310178,9.926988,9.34918,9.181392,9.978084,9.142825,9.181392
min,16.0,46.0,46.0,728.0,27.0,25.0,26.0,27.0,30.0,16.0,27.0,30.0,25.0,30.0,26.0,27.0,28.0,31.0,26.0,31.0,27.0,30.0,25.0,30.0,26.0,27.0,28.0,31.0,26.0,31.0,31.0
25%,21.0,62.0,67.0,1449.0,53.0,45.0,49.0,53.0,53.0,192622.0,53.0,50.0,45.0,53.0,49.0,53.0,54.0,52.0,53.0,51.0,53.0,50.0,45.0,53.0,49.0,53.0,54.0,52.0,53.0,51.0,52.0
50%,25.0,66.0,71.0,1633.0,60.0,57.0,58.0,60.0,59.0,214057.0,60.0,58.0,57.0,59.0,58.0,60.0,61.0,59.0,60.0,58.0,60.0,58.0,57.0,59.0,58.0,60.0,61.0,59.0,60.0,58.0,59.0
75%,28.0,71.0,75.0,1786.0,66.0,65.0,65.0,66.0,65.0,231448.0,66.0,64.0,65.0,65.0,65.0,66.0,67.0,65.0,66.0,64.0,66.0,64.0,65.0,65.0,65.0,66.0,67.0,65.0,66.0,64.0,65.0
max,47.0,94.0,94.0,2291.0,92.0,87.0,85.0,92.0,87.0,241219.0,92.0,84.0,87.0,87.0,85.0,92.0,90.0,92.0,91.0,84.0,92.0,84.0,87.0,87.0,85.0,92.0,90.0,92.0,91.0,84.0,92.0


### Summary table with detailed information about each field/column

* Data type

* Count of missing values

* Count of present values

* Number of unique values

* Minimum value

* Maximum value

* Mean value

* Median value

In [12]:
# Build a summary with one row per column of df_players: data type, present /
# null / total counts, number of unique values and basic statistics.

# Empty frame indexed by the column names; every statistic is joined onto it
df_summary_dicc  = pd.DataFrame(columns=list(df_players.keys())).transpose()
df_summary_types = pd.DataFrame(df_players.dtypes).rename(columns = {0:'dtypes'})
df_summary_count = pd.DataFrame(df_players.count()).rename(columns = {0:'present_values'})
df_summary_nulls = pd.DataFrame(df_players.isnull().sum()).rename(columns = {0:'null_values'})
df_summary_unique= pd.DataFrame(df_players.nunique()).rename(columns = {0:'unique_values'})
# min()/max() run over every column, so text columns report their
# lexicographic extremes
df_summary_min   = pd.DataFrame(df_players.min()).rename(columns = {0:'min_value'})
df_summary_max   = pd.DataFrame(df_players.max()).rename(columns = {0:'max_value'})
# numeric_only=True: mean and median only make sense for numeric columns.
# Relying on the implicit default is deprecated and raises TypeError on
# pandas >= 2.0 when the frame contains text columns. Non-numeric columns
# end up as NaN after the join below.
df_summary_mean  = pd.DataFrame(df_players.mean(numeric_only=True)).rename(columns = {0:'mean_value'})
df_summary_median= pd.DataFrame(df_players.median(numeric_only=True)).rename(columns = {0:'median_value'})
df_summary = df_summary_dicc.join(df_summary_types).join(df_summary_count).join(df_summary_nulls)
# present + null should equal the number of rows for every column
df_summary['total_values'] = df_summary['present_values'] + df_summary['null_values']
df_summary = df_summary.join(df_summary_unique).join(df_summary_min).join(df_summary_max)
df_summary = df_summary.join(df_summary_mean).join(df_summary_median)
df_summary

Unnamed: 0,dtypes,present_values,null_values,total_values,unique_values,min_value,max_value,mean_value,median_value
Name,object,17981,0,17981,16975,A. Abbas,Óscar Whalley,,
Age,int64,17981,0,17981,29,16,47,25.144541,25.0
Photo,object,17981,0,17981,17929,https://cdn.sofifa.org/48/18/players/101317.png,https://cdn.sofifa.org/48/18/players/9833.png,,
Nationality,object,17981,0,17981,165,Afghanistan,Zimbabwe,,
Flag,object,17981,0,17981,165,https://cdn.sofifa.org/flags/1.png,https://cdn.sofifa.org/flags/99.png,,
Overall,int64,17981,0,17981,49,46,94,66.247984,66.0
Potential,int64,17981,0,17981,48,46,94,71.190813,71.0
Club,object,17733,248,17981,647,,,,
Club Logo,object,17981,0,17981,679,https://cdn.sofifa.org/24/18/teams/1.png,https://cdn.sofifa.org/flags/9.png,,
Value,object,17981,0,17981,207,€0,€9M,,


### Missing values

Records with at least one field without information

In [13]:
# Rows that contain a null in at least one column
df_players[df_players.isnull().any(axis=1)]

Unnamed: 0,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,Value,Wage,Special,Acceleration,Aggression,Agility,Balance,Ball control,Composure,Crossing,Curve,Dribbling,Finishing,Free kick accuracy,GK diving,GK handling,GK kicking,GK positioning,GK reflexes,Heading accuracy,Interceptions,Jumping,Long passing,Long shots,Marking,Penalties,Positioning,Reactions,Short passing,Shot power,Sliding tackle,Sprint speed,Stamina,Standing tackle,Strength,Vision,Volleys,CAM,CB,CDM,CF,CM,ID,LAM,LB,LCB,LCM,LDM,LF,LM,LS,LW,LWB,Preferred Positions,RAM,RB,RCB,RCM,RDM,RF,RM,RS,RW,RWB,ST
4,M. Neuer,31,https://cdn.sofifa.org/48/18/players/167495.png,Germany,https://cdn.sofifa.org/flags/21.png,92,92,FC Bayern Munich,https://cdn.sofifa.org/24/18/teams/21.png,€61M,€230K,1493,58,29,52,35,48,70,15,14,30,13,11,91,90,95,91,89,25,30,78,59,16,10,47,12,85,55,25,11,61,44,10,83,70,11,,,,,,167495,,,,,,,,,,,GK,,,,,,,,,,,
6,De Gea,26,https://cdn.sofifa.org/48/18/players/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,90,92,Manchester United,https://cdn.sofifa.org/24/18/teams/11.png,€64.5M,€215K,1458,57,38,60,43,42,64,17,21,18,13,19,90,85,87,86,90,21,30,67,51,12,13,40,12,88,50,31,13,58,40,21,64,68,13,,,,,,193080,,,,,,,,,,,GK,,,,,,,,,,,
12,T. Courtois,25,https://cdn.sofifa.org/48/18/players/192119.png,Belgium,https://cdn.sofifa.org/flags/7.png,89,92,Chelsea,https://cdn.sofifa.org/24/18/teams/5.png,€59M,€190K,1282,46,23,61,45,23,52,14,19,13,14,11,85,91,69,86,88,13,15,68,31,17,11,27,13,81,32,36,16,52,38,18,70,44,12,,,,,,192119,,,,,,,,,,,GK,,,,,,,,,,,
18,G. Buffon,39,https://cdn.sofifa.org/48/18/players/1179.png,Italy,https://cdn.sofifa.org/flags/27.png,89,89,Juventus,https://cdn.sofifa.org/24/18/teams/45.png,€4.5M,€110K,1335,49,38,55,49,28,70,13,20,26,15,13,89,88,74,90,84,13,28,75,35,13,10,22,12,80,37,39,11,43,39,11,69,50,17,,,,,,1179,,,,,,,,,,,GK,,,,,,,,,,,
20,J. Oblak,24,https://cdn.sofifa.org/48/18/players/200389.png,Slovenia,https://cdn.sofifa.org/flags/44.png,88,93,Atlético Madrid,https://cdn.sofifa.org/24/18/teams/240.png,€57M,€82K,1290,43,34,67,49,16,55,13,13,12,11,14,84,90,77,87,84,15,19,76,26,12,14,11,11,84,29,22,18,60,41,12,78,55,13,,,,,,200389,,,,,,,,,,,GK,,,,,,,,,,,
29,H. Lloris,30,https://cdn.sofifa.org/48/18/players/167948.png,France,https://cdn.sofifa.org/flags/18.png,88,88,Tottenham Hotspur,https://cdn.sofifa.org/24/18/teams/18.png,€38M,€165K,1318,65,31,55,54,34,61,13,11,10,10,10,88,86,68,82,90,10,27,74,50,14,12,40,10,85,50,23,18,63,41,10,43,30,11,,,,,,167948,,,,,,,,,,,GK,,,,,,,,,,,
43,S. Handanovič,32,https://cdn.sofifa.org/48/18/players/162835.png,Slovenia,https://cdn.sofifa.org/flags/44.png,87,87,Inter,https://cdn.sofifa.org/24/18/teams/44.png,€29M,€91K,1264,54,25,42,36,24,62,12,12,18,10,14,87,86,69,87,87,10,22,78,34,19,17,23,12,83,36,22,13,57,41,10,71,41,12,,,,,,162835,,,,,,,,,,,GK,,,,,,,,,,,
68,P. Čech,35,https://cdn.sofifa.org/48/18/players/48940.png,Czech Republic,https://cdn.sofifa.org/flags/12.png,86,86,Arsenal,https://cdn.sofifa.org/24/18/teams/1.png,€10.5M,€92K,1206,40,17,49,34,22,70,19,13,12,12,19,82,87,76,83,81,19,23,51,33,11,11,23,13,84,35,21,12,44,32,13,65,53,17,,,,,,48940,,,,,,,,,,,GK,,,,,,,,,,,
74,K. Navas,30,https://cdn.sofifa.org/48/18/players/193041.png,Costa Rica,https://cdn.sofifa.org/flags/72.png,85,85,Real Madrid CF,https://cdn.sofifa.org/24/18/teams/243.png,€24.5M,€165K,1301,54,32,60,61,19,49,11,11,16,15,15,87,82,72,80,85,11,20,74,37,13,12,25,16,82,30,21,14,53,39,14,75,54,11,,,,,,193041,,,,,,,,,,,GK,,,,,,,,,,,
76,D. Subašić,32,https://cdn.sofifa.org/48/18/players/192593.png,Croatia,https://cdn.sofifa.org/flags/10.png,85,85,AS Monaco,https://cdn.sofifa.org/24/18/teams/69.png,€22M,€46K,1305,51,31,42,37,19,65,11,24,11,10,66,84,79,79,85,87,13,20,75,26,14,15,23,15,82,24,24,13,54,32,14,80,52,13,,,,,,192593,,,,,,,,,,,GK,,,,,,,,,,,


Preferred positions of the players with any null field (excluding players whose Club is null)

In [14]:
# Distinct 'Preferred Positions' of the rows that have a club but still
# contain a null in some other column
has_club = df_players.Club.notnull()
has_any_null = df_players.isnull().any(axis=1)
df_players.loc[has_club & has_any_null, 'Preferred Positions'].unique()

array(['GK '], dtype=object)

Players without information about CLUB

In [15]:
# Rows whose Club is null
df_players[df_players.Club.isnull()]

Unnamed: 0,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,Value,Wage,Special,Acceleration,Aggression,Agility,Balance,Ball control,Composure,Crossing,Curve,Dribbling,Finishing,Free kick accuracy,GK diving,GK handling,GK kicking,GK positioning,GK reflexes,Heading accuracy,Interceptions,Jumping,Long passing,Long shots,Marking,Penalties,Positioning,Reactions,Short passing,Shot power,Sliding tackle,Sprint speed,Stamina,Standing tackle,Strength,Vision,Volleys,CAM,CB,CDM,CF,CM,ID,LAM,LB,LCB,LCM,LDM,LF,LM,LS,LW,LWB,Preferred Positions,RAM,RB,RCB,RCM,RDM,RF,RM,RS,RW,RWB,ST
162,Oscar,25,https://cdn.sofifa.org/48/18/players/188152.png,Brazil,https://cdn.sofifa.org/flags/54.png,83,86,,https://cdn.sofifa.org/flags/54.png,€0,€0,1961,75,31,86,80,84,83,70,77,81,75,77,12,10,15,12,12,54,34,66,78,77,37,68,81,81,83,76,47,74,76,50,36,83,63,81.0,49.0,61.0,80.0,77.0,188152,81.0,61.0,49.0,77.0,61.0,80.0,79.0,74.0,79.0,64.0,CAM,81.0,61.0,49.0,77.0,61.0,80.0,79.0,74.0,79.0,64.0,74.0
167,Adrien Silva,28,https://cdn.sofifa.org/48/18/players/184826.png,Portugal,https://cdn.sofifa.org/flags/38.png,83,83,,https://cdn.sofifa.org/flags/38.png,€0,€0,2174,64,79,73,67,85,82,78,79,82,71,75,15,6,8,10,7,61,84,56,84,80,80,85,75,77,83,83,73,65,89,78,62,85,75,80.0,75.0,81.0,78.0,82.0,184826,80.0,78.0,75.0,82.0,81.0,78.0,79.0,74.0,78.0,79.0,CDM CM,80.0,78.0,75.0,82.0,81.0,78.0,79.0,74.0,78.0,79.0,74.0
274,A. Witsel,28,https://cdn.sofifa.org/48/18/players/177413.png,Belgium,https://cdn.sofifa.org/flags/7.png,82,82,,https://cdn.sofifa.org/flags/7.png,€0,€0,2143,67,78,82,60,85,86,69,69,83,71,68,5,7,7,10,7,77,80,72,78,74,69,81,75,81,83,78,71,74,85,73,79,78,67,79.0,76.0,79.0,78.0,80.0,177413,79.0,76.0,76.0,80.0,79.0,78.0,78.0,77.0,78.0,77.0,CAM CDM CM,79.0,76.0,76.0,80.0,79.0,78.0,78.0,77.0,78.0,77.0,77.0
472,M. Berg,30,https://cdn.sofifa.org/48/18/players/176733.png,Sweden,https://cdn.sofifa.org/flags/46.png,80,80,,https://cdn.sofifa.org/flags/46.png,€0,€0,1841,74,61,71,71,77,75,56,66,72,82,47,10,7,12,8,14,77,28,74,51,74,12,70,83,75,72,79,25,75,69,33,76,66,74,73.0,47.0,51.0,76.0,66.0,176733,73.0,50.0,47.0,66.0,51.0,76.0,71.0,78.0,73.0,53.0,CF ST,73.0,50.0,47.0,66.0,51.0,76.0,71.0,78.0,73.0,53.0,78.0
488,Renato Augusto,29,https://cdn.sofifa.org/48/18/players/169195.png,Brazil,https://cdn.sofifa.org/flags/54.png,80,80,,https://cdn.sofifa.org/flags/54.png,€0,€0,2069,66,53,66,66,84,68,79,80,85,65,75,6,11,16,15,8,57,70,49,81,81,67,68,75,83,81,81,59,67,71,68,78,81,77,79.0,68.0,74.0,78.0,79.0,169195,79.0,71.0,68.0,79.0,74.0,78.0,78.0,74.0,78.0,73.0,LM CDM CAM CM,79.0,71.0,68.0,79.0,74.0,78.0,78.0,74.0,78.0,73.0,74.0
658,Gervinho,30,https://cdn.sofifa.org/48/18/players/170733.png,Ivory Coast,https://cdn.sofifa.org/flags/108.png,79,79,,https://cdn.sofifa.org/flags/108.png,€0,€0,1871,91,51,88,76,78,71,75,69,83,72,43,14,5,12,10,13,63,39,65,67,61,13,63,80,75,71,69,15,92,90,29,55,74,70,76.0,43.0,54.0,77.0,71.0,170733,76.0,55.0,43.0,71.0,54.0,77.0,79.0,74.0,78.0,59.0,RW LW,76.0,55.0,43.0,71.0,54.0,77.0,79.0,74.0,78.0,59.0,74.0
925,B. Moukandjo,28,https://cdn.sofifa.org/48/18/players/185090.png,Cameroon,https://cdn.sofifa.org/flags/103.png,77,77,,https://cdn.sofifa.org/flags/103.png,€0,€0,1904,90,51,83,70,75,77,72,71,79,80,71,14,14,8,12,10,68,32,71,60,65,25,79,77,76,71,78,19,91,70,20,65,65,72,74.0,44.0,51.0,76.0,66.0,185090,74.0,53.0,44.0,66.0,51.0,76.0,76.0,76.0,77.0,56.0,RM ST,74.0,53.0,44.0,66.0,51.0,76.0,76.0,76.0,77.0,56.0,76.0
944,B. Dočkal,28,https://cdn.sofifa.org/48/18/players/181271.png,Czech Republic,https://cdn.sofifa.org/flags/12.png,77,77,,https://cdn.sofifa.org/flags/12.png,€0,€0,1964,68,59,67,71,79,75,82,82,77,69,80,10,9,14,11,8,57,54,64,77,78,38,75,72,74,79,78,38,66,66,52,57,81,72,76.0,55.0,63.0,75.0,74.0,181271,76.0,61.0,55.0,74.0,63.0,75.0,75.0,71.0,75.0,64.0,CM CAM,76.0,61.0,55.0,74.0,63.0,75.0,75.0,71.0,75.0,64.0,71.0
1019,Gil,30,https://cdn.sofifa.org/48/18/players/193869.png,Brazil,https://cdn.sofifa.org/flags/54.png,77,77,,https://cdn.sofifa.org/flags/54.png,€0,€0,1701,53,81,41,40,62,75,47,39,51,28,34,9,11,6,11,14,75,78,60,69,41,78,42,50,74,70,69,76,67,71,77,86,68,23,58.0,76.0,74.0,57.0,65.0,193869,58.0,70.0,76.0,65.0,74.0,57.0,58.0,57.0,55.0,68.0,CB,58.0,70.0,76.0,65.0,74.0,57.0,58.0,57.0,55.0,68.0,57.0
1273,C. Riveros,34,https://cdn.sofifa.org/48/18/players/174381.png,Paraguay,https://cdn.sofifa.org/flags/58.png,76,76,,https://cdn.sofifa.org/flags/58.png,€0,€0,1999,60,72,79,70,76,80,74,67,69,69,68,9,13,6,16,15,70,65,65,77,71,63,71,76,72,81,71,62,53,75,61,62,77,64,74.0,66.0,71.0,72.0,75.0,174381,74.0,67.0,66.0,75.0,71.0,72.0,72.0,70.0,72.0,69.0,CDM CM,74.0,67.0,66.0,75.0,71.0,72.0,72.0,70.0,72.0,69.0,70.0


These players additionally have a value of €0

### Duplicates

There are some players with the same name:

* Total values:  17.981
* Unique values: 16.975

Find duplicate rows in our data, based on all column values

In [16]:
# keep=False flags every member of a group of fully identical rows (not just
# the repeats), so each duplicate shows up next to its twin once sorted by name
is_exact_duplicate = df_players.duplicated(keep=False)
df_players.loc[is_exact_duplicate].sort_values(by=['Name'])

Unnamed: 0,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,Value,Wage,Special,Acceleration,Aggression,Agility,Balance,Ball control,Composure,Crossing,Curve,Dribbling,Finishing,Free kick accuracy,GK diving,GK handling,GK kicking,GK positioning,GK reflexes,Heading accuracy,Interceptions,Jumping,Long passing,Long shots,Marking,Penalties,Positioning,Reactions,Short passing,Shot power,Sliding tackle,Sprint speed,Stamina,Standing tackle,Strength,Vision,Volleys,CAM,CB,CDM,CF,CM,ID,LAM,LB,LCB,LCM,LDM,LF,LM,LS,LW,LWB,Preferred Positions,RAM,RB,RCB,RCM,RDM,RF,RM,RS,RW,RWB,ST
775,A. Mandi,25,https://cdn.sofifa.org/48/18/players/201143.png,Algeria,https://cdn.sofifa.org/flags/97.png,78,83,Real Betis Balompié,https://cdn.sofifa.org/24/18/teams/449.png,€11.5M,€23K,1938,66,69,78,64,73,74,78,67,64,41,49,15,16,9,10,14,83,80,82,73,49,75,35,67,76,76,52,84,69,70,76,74,68,36,68.0,77.0,75.0,65.0,71.0,201143,68.0,76.0,77.0,71.0,75.0,65.0,69.0,63.0,67.0,75.0,CB,68.0,76.0,77.0,71.0,75.0,65.0,69.0,63.0,67.0,75.0,63.0
862,A. Mandi,25,https://cdn.sofifa.org/48/18/players/201143.png,Algeria,https://cdn.sofifa.org/flags/97.png,78,83,Real Betis Balompié,https://cdn.sofifa.org/24/18/teams/449.png,€11.5M,€23K,1938,66,69,78,64,73,74,78,67,64,41,49,15,16,9,10,14,83,80,82,73,49,75,35,67,76,76,52,84,69,70,76,74,68,36,68.0,77.0,75.0,65.0,71.0,201143,68.0,76.0,77.0,71.0,75.0,65.0,69.0,63.0,67.0,75.0,CB,68.0,76.0,77.0,71.0,75.0,65.0,69.0,63.0,67.0,75.0,63.0
777,A. Marchesín,29,https://cdn.sofifa.org/48/18/players/201095.png,Argentina,https://cdn.sofifa.org/flags/52.png,78,79,Club América,https://cdn.sofifa.org/24/18/teams/1879.png,€8M,€45K,1270,49,27,68,60,20,49,18,16,24,14,15,73,75,74,80,81,11,15,75,34,12,14,14,18,75,37,12,15,52,38,15,59,65,15,,,,,,201095,,,,,,,,,,,GK,,,,,,,,,,,
820,A. Marchesín,29,https://cdn.sofifa.org/48/18/players/201095.png,Argentina,https://cdn.sofifa.org/flags/52.png,78,79,Club América,https://cdn.sofifa.org/24/18/teams/1879.png,€8M,€45K,1270,49,27,68,60,20,49,18,16,24,14,15,73,75,74,80,81,11,15,75,34,12,14,14,18,75,37,12,15,52,38,15,59,65,15,,,,,,201095,,,,,,,,,,,GK,,,,,,,,,,,
872,A. Onana,21,https://cdn.sofifa.org/48/18/players/226753.png,Cameroon,https://cdn.sofifa.org/flags/103.png,78,85,Ajax,https://cdn.sofifa.org/24/18/teams/245.png,€11M,€8K,1352,64,23,68,53,38,33,15,30,26,12,17,83,77,84,70,77,20,15,80,26,17,16,27,11,76,33,21,14,62,39,18,67,60,13,,,,,,226753,,,,,,,,,,,GK,,,,,,,,,,,
700,A. Onana,21,https://cdn.sofifa.org/48/18/players/226753.png,Cameroon,https://cdn.sofifa.org/flags/103.png,78,85,Ajax,https://cdn.sofifa.org/24/18/teams/245.png,€11M,€8K,1352,64,23,68,53,38,33,15,30,26,12,17,83,77,84,70,77,20,15,80,26,17,16,27,11,76,33,21,14,62,39,18,67,60,13,,,,,,226753,,,,,,,,,,,GK,,,,,,,,,,,
736,A. Szymanowski,28,https://cdn.sofifa.org/48/18/players/210333.png,Argentina,https://cdn.sofifa.org/flags/52.png,78,78,CD Leganés,https://cdn.sofifa.org/24/18/teams/100888.png,€10.5M,€29K,1920,91,55,84,87,74,71,78,69,78,77,57,11,8,10,12,6,67,28,56,63,68,33,70,75,72,74,68,45,86,74,42,60,76,66,76.0,50.0,56.0,76.0,69.0,210333,76.0,59.0,50.0,69.0,56.0,76.0,77.0,74.0,77.0,62.0,LW CM LM,76.0,59.0,50.0,69.0,56.0,76.0,77.0,74.0,77.0,62.0,74.0
837,A. Szymanowski,28,https://cdn.sofifa.org/48/18/players/210333.png,Argentina,https://cdn.sofifa.org/flags/52.png,78,78,CD Leganés,https://cdn.sofifa.org/24/18/teams/100888.png,€10.5M,€29K,1920,91,55,84,87,74,71,78,69,78,77,57,11,8,10,12,6,67,28,56,63,68,33,70,75,72,74,68,45,86,74,42,60,76,66,76.0,50.0,56.0,76.0,69.0,210333,76.0,59.0,50.0,69.0,56.0,76.0,77.0,74.0,77.0,62.0,LW CM LM,76.0,59.0,50.0,69.0,56.0,76.0,77.0,74.0,77.0,62.0,74.0
784,André André,27,https://cdn.sofifa.org/48/18/players/199626.png,Portugal,https://cdn.sofifa.org/flags/38.png,78,78,FC Porto,https://cdn.sofifa.org/24/18/teams/236.png,€10.5M,€16K,2079,73,77,74,74,80,80,65,78,79,69,61,9,11,15,9,7,64,77,70,78,71,62,84,77,76,80,75,65,69,75,75,61,75,64,77.0,70.0,75.0,76.0,77.0,199626,77.0,72.0,70.0,77.0,75.0,76.0,75.0,73.0,75.0,73.0,CM,77.0,72.0,70.0,77.0,75.0,76.0,75.0,73.0,75.0,73.0,73.0
875,André André,27,https://cdn.sofifa.org/48/18/players/199626.png,Portugal,https://cdn.sofifa.org/flags/38.png,78,78,FC Porto,https://cdn.sofifa.org/24/18/teams/236.png,€10.5M,€16K,2079,73,77,74,74,80,80,65,78,79,69,61,9,11,15,9,7,64,77,70,78,71,62,84,77,76,80,75,65,69,75,75,61,75,64,77.0,70.0,75.0,76.0,77.0,199626,77.0,72.0,70.0,77.0,75.0,76.0,75.0,73.0,75.0,73.0,CM,77.0,72.0,70.0,77.0,75.0,76.0,75.0,73.0,75.0,73.0,73.0


There are 52 duplicate rows, with all their fields identical

Additionally, we will check for other duplicates based on player names (also checking the age, nationality and club)

In [17]:
# Remove exact duplicates first, then show the remaining rows that share the
# same name, age, nationality and club with at least one other row
identity_columns = ['Name','Age','Nationality','Club']
df_players_nodup = df_players.drop_duplicates()
shares_identity = df_players_nodup.duplicated(identity_columns, keep=False)
df_players_nodup[shares_identity].sort_values(by=identity_columns)

Unnamed: 0,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,Value,Wage,Special,Acceleration,Aggression,Agility,Balance,Ball control,Composure,Crossing,Curve,Dribbling,Finishing,Free kick accuracy,GK diving,GK handling,GK kicking,GK positioning,GK reflexes,Heading accuracy,Interceptions,Jumping,Long passing,Long shots,Marking,Penalties,Positioning,Reactions,Short passing,Shot power,Sliding tackle,Sprint speed,Stamina,Standing tackle,Strength,Vision,Volleys,CAM,CB,CDM,CF,CM,ID,LAM,LB,LCB,LCM,LDM,LF,LM,LS,LW,LWB,Preferred Positions,RAM,RB,RCB,RCM,RDM,RF,RM,RS,RW,RWB,ST
8306,A. Ajeti,20,https://cdn.sofifa.org/48/18/players/222352.png,Switzerland,https://cdn.sofifa.org/flags/47.png,67,74,FC St. Gallen,https://cdn.sofifa.org/24/18/teams/898.png,€1.1M,€9K,1590,76,29,67,62,66,74,57,46,65,66,33,13,9,6,12,16,66,16,83,41,56,19,56,69,65,45,63,17,82,64,19,80,61,65,61.0,39.0,40.0,65.0,53.0,222352,61.0,43.0,39.0,53.0,40.0,65.0,63.0,66.0,64.0,45.0,RM CF ST,61.0,43.0,39.0,53.0,40.0,65.0,63.0,66.0,64.0,45.0,66.0
17054,A. Ajeti,20,https://cdn.sofifa.org/48/18/players/239427.png,Switzerland,https://cdn.sofifa.org/flags/47.png,54,61,FC St. Gallen,https://cdn.sofifa.org/24/18/teams/898.png,€80K,€1K,1120,38,63,44,32,29,50,25,18,24,23,24,14,13,10,9,10,59,46,46,29,22,48,36,14,45,31,42,62,50,47,58,63,19,27,28.0,53.0,43.0,29.0,31.0,239427,28.0,46.0,53.0,31.0,43.0,29.0,30.0,34.0,29.0,43.0,CB,28.0,46.0,53.0,31.0,43.0,29.0,30.0,34.0,29.0,43.0,34.0
2214,A. Miranchuk,21,https://cdn.sofifa.org/48/18/players/214092.png,Russia,https://cdn.sofifa.org/flags/40.png,74,81,Lokomotiv Moscow,https://cdn.sofifa.org/24/18/teams/100765.png,€8M,€30K,1762,71,43,77,67,73,67,65,64,76,70,53,8,8,9,12,15,53,36,58,71,68,22,57,66,68,75,65,29,75,74,28,68,76,62,73.0,44.0,54.0,71.0,69.0,214092,73.0,52.0,44.0,69.0,54.0,71.0,72.0,68.0,72.0,56.0,ST CAM,73.0,52.0,44.0,69.0,54.0,71.0,72.0,68.0,72.0,56.0,68.0
7324,A. Miranchuk,21,https://cdn.sofifa.org/48/18/players/222368.png,Russia,https://cdn.sofifa.org/flags/40.png,68,77,Lokomotiv Moscow,https://cdn.sofifa.org/24/18/teams/100765.png,€1.4M,€18K,1642,66,31,69,70,68,64,62,58,72,61,43,8,6,11,12,16,44,22,49,63,60,25,71,63,72,67,66,31,74,72,30,68,66,46,67.0,41.0,49.0,67.0,63.0,222368,67.0,49.0,41.0,63.0,49.0,67.0,67.0,64.0,67.0,53.0,CAM,67.0,49.0,41.0,63.0,49.0,67.0,67.0,64.0,67.0,53.0,64.0
14380,D. Kelly-Evans,20,https://cdn.sofifa.org/48/18/players/226056.png,England,https://cdn.sofifa.org/flags/14.png,60,74,Coventry City,https://cdn.sofifa.org/24/18/teams/1800.png,€400K,€2K,1624,84,85,80,80,58,52,51,31,56,36,39,7,8,10,6,8,48,54,86,48,38,50,52,46,55,52,54,61,83,64,57,62,40,35,52.0,59.0,56.0,53.0,51.0,226056,52.0,59.0,59.0,51.0,56.0,53.0,56.0,52.0,55.0,59.0,RWB RB,52.0,59.0,59.0,51.0,56.0,53.0,56.0,52.0,55.0,59.0,52.0
16391,D. Kelly-Evans,20,https://cdn.sofifa.org/48/18/players/226055.png,England,https://cdn.sofifa.org/flags/14.png,56,71,Coventry City,https://cdn.sofifa.org/24/18/teams/1800.png,€200K,€2K,1512,83,82,77,66,46,58,54,44,59,43,40,12,8,8,10,9,28,38,81,48,32,34,55,49,36,53,57,34,84,47,37,57,55,46,52.0,44.0,45.0,52.0,48.0,226055,52.0,46.0,44.0,48.0,45.0,52.0,55.0,49.0,54.0,47.0,CAM LM,52.0,46.0,44.0,48.0,45.0,52.0,55.0,49.0,54.0,47.0,49.0


These rows are players with the same surname, but they are different players

## Clean dataset based on the information from the previous analysis

1) Clean duplicated rows (52)

2) Remove players without 'Club' information (free agents) as they have also 'Value' equal to 0 €

3) Transform variables 'Value' & 'Wage' from string to numeric type:
    * Transform: M // x 1.000.000
    * Transform: K // x 1.000 
    * Remove: €, M, K

4) Transform 'Preferred Positions' field into a single value. Keep only the first position
    
5) Remove variables with a high percentage of missing / extreme values / correlated:
    * CAM, CB, CDM, CF, CM, LAM, LB, LCB, LCM, LDM, LF, LM, LS, LW, LWB, RAM, RB, RCB, RCM, RDM, RF, RM, RS, RW, RWB, ST
** These variables are correlated with "Preferred Positions", when Preferred Positions = 'GK' // variables have a null value

6) Remove variables with useless information:
    * Photo, Flag, Club Logo

7) Remove the skill variables, to reduce the number of variables
    * To simplify the problem we will keep only 'Overall' and 'Potential' as skill's related info


In [18]:
def euro_to_int(amounts):
    """Convert euro strings such as '€95.5M', '€565K' or '€0' to whole euros.

    Parameters
    ----------
    amounts : pandas.Series of str
        Amounts written as '€<number>' with an optional 'K' (x 1,000) or
        'M' (x 1,000,000) suffix.

    Returns
    -------
    pandas.Series of int64
        The amounts in euros, on the same index as ``amounts``.

    Raises
    ------
    ValueError
        If any value is missing or does not follow the expected format.
    """
    # str.extract always treats the pattern as a regular expression, so the
    # result does not depend on the pandas-version-specific default of
    # Series.str.replace(..., regex=...).
    parts = amounts.str.extract(r'^€(?P<number>\d+(?:\.\d+)?)(?P<suffix>[KM]?)$')
    multiplier = parts['suffix'].map({'': 1, 'K': 1000, 'M': 1000000})
    euros = pd.to_numeric(parts['number']) * multiplier
    unparsed = euros.isnull()
    if unparsed.any():
        raise ValueError('Unexpected amount format: {}'.format(list(amounts[unparsed].unique()[:5])))
    # Round before casting: a plain int cast truncates, and float products can
    # land just off the exact value (1.1 * 1000000 == 1100000.0000000002)
    return euros.round().astype('int64')


print(df_players.shape)

# 1 - Duplicates
df_players_processed = df_players.drop_duplicates()

# 2 - Missing Club
# .copy() yields an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning
df_players_processed = df_players_processed[df_players_processed.Club.notnull()].copy()

# 3 - Transform variables from string to numeric
df_players_processed['Value'] = euro_to_int(df_players_processed['Value'])
df_players_processed['Wage'] = euro_to_int(df_players_processed['Wage'])

# 4 - Transform 'Preferred Positions' field: keep only the first listed position
# (the source column is left untouched; it is discarded in step 7)
df_players_processed['Preferred Position'] = df_players_processed['Preferred Positions'].str.split().str[0]

# 5 - Remove variables (missings, extreme, correlated)
position_rating_columns = ["CAM","CB","CDM","CF","CM","LAM","LB","LCB","LCM","LDM","LF","LM","LS","LW","LWB",
                           "RAM","RB","RCB","RCM","RDM","RF","RM","RS","RW","RWB","ST"]
df_players_processed = df_players_processed.drop(columns=position_rating_columns)

# 6 - Remove useless variables
df_players_processed = df_players_processed.drop(columns=['Photo','Flag','Club Logo'])

# 7 - Reduce the number of variables
df_players_processed = df_players_processed[["ID","Name","Age","Nationality","Overall","Potential",
                                             "Club","Value","Wage","Preferred Position"]]

print(df_players_processed.shape)
df_players_processed.sort_values(by=['ID']).head()

(17981, 74)
(17681, 10)


Unnamed: 0,ID,Name,Age,Nationality,Overall,Potential,Club,Value,Wage,Preferred Position
4920,16,Luis García,36,Spain,70,70,KAS Eupen,575000,7000,CAM
4963,28,Manu Herrera,35,Spain,70,70,CA Osasuna,425000,4000,GK
45,41,Iniesta,33,Spain,87,87,FC Barcelona,29500000,260000,LM
693,80,E. Belözoğlu,36,Turkey,79,79,İstanbul Başakşehir FK,4000000,26000,CDM
16121,591,C. Day,41,England,57,57,Stevenage,10000,1000,GK


## Join information about Languages associated to each country

Extract data about country-languages, previously loaded into our Cloud SQL database

In [19]:
# Read every row and column of the ea_countries_languages table
df_languages = pd.read_sql('SELECT * FROM ea_countries_languages', con=database_connection)
# NOTE(review): this rebinds df_players (the raw table until now) to the cleaned
# frame. Both names now point to the same object, so a later in-place change
# made through one name is visible through the other.
df_players = df_players_processed

In [20]:
# Keep only the country and its language_0 column, renamed so the country
# column carries the same label as in the players frame
language_column_names = {'country':'Nationality','language_0':'Primary Language'}
df_languages_primary = df_languages[['country','language_0']].rename(columns = language_column_names)

In [21]:
# Left join: every player row is kept, and 'Primary Language' stays null when
# the nationality has no match in the languages frame
df_player_lang = df_players.merge(df_languages_primary, how='left', on='Nationality')

In [22]:
# Preview the first rows of the joined frame
df_player_lang.head()

Unnamed: 0,ID,Name,Age,Nationality,Overall,Potential,Club,Value,Wage,Preferred Position,Primary Language
0,20801,Cristiano Ronaldo,32,Portugal,94,94,Real Madrid CF,95500000,565000,ST,Portuguese
1,158023,L. Messi,30,Argentina,93,93,FC Barcelona,105000000,565000,RW,Spanish
2,190871,Neymar,25,Brazil,92,94,Paris Saint-Germain,123000000,280000,LW,Portuguese
3,176580,L. Suárez,30,Uruguay,92,92,FC Barcelona,97000000,510000,ST,Spanish
4,167495,M. Neuer,31,Germany,92,92,FC Bayern Munich,61000000,230000,GK,German


Check whether every player has an associated primary language

In [23]:
# Number of missing values per column
df_player_lang.isna().sum()

ID                       0
Name                     0
Age                      0
Nationality              0
Overall                  0
Potential                0
Club                     0
Value                    0
Wage                     0
Preferred Position       0
Primary Language      3086
dtype: int64

Find the nationalities of the players for which no 'Primary Language' was identified

In [24]:
# Distinct nationalities among the rows whose 'Primary Language' is missing
df_player_lang.loc[
    df_player_lang['Primary Language'].isna(), 'Nationality'
].unique()

array(['Wales', 'England', 'Bosnia Herzegovina', 'Korea Republic',
       'DR Congo', 'Republic of Ireland', 'Northern Ireland', 'Scotland',
       'FYR Macedonia', 'Guinea Bissau', 'Congo', 'Trinidad & Tobago',
       'Korea DPR', 'Central African Rep.', 'St Kitts Nevis', 'China PR',
       'Antigua & Barbuda', 'São Tomé & Príncipe', 'St Lucia',
       'Brunei Darussalam'], dtype=object)

Rename the following nationalities so that they match the country names used in the countries file:

'Wales', 'England', 'Bosnia Herzegovina', 'Korea Republic', 'DR Congo', 'Republic of Ireland', 'Northern Ireland', 'Scotland',
'FYR Macedonia', 'Guinea Bissau', 'Congo', 'Trinidad & Tobago', 'Korea DPR', 'Central African Rep.', 'St Kitts Nevis', 'China PR', 'Antigua & Barbuda', 'São Tomé & Príncipe', 'St Lucia', 'Brunei Darussalam'

In [25]:
# Nationality label used in the players data -> country name used in the
# languages table, so that the join on 'Nationality' finds a match.
NATIONALITY_RENAMES = {
    'Wales': 'United Kingdom',
    'England': 'United Kingdom',
    'Bosnia Herzegovina': 'Bosnia and Herzegovina',
    'Korea Republic': 'South Korea',
    'Republic of Ireland': 'Ireland',
    'Northern Ireland': 'United Kingdom',
    'Scotland': 'United Kingdom',
    'FYR Macedonia': 'North Macedonia',
    'Guinea Bissau': 'Guinea-Bissau',
    'Trinidad & Tobago': 'Trinidad and Tobago',
    'Korea DPR': 'North Korea',
    'Central African Rep.': 'Central African Republic',
    'St Kitts Nevis': 'Saint Kitts and Nevis',
    'China PR': 'China',
    'Antigua & Barbuda': 'Antigua and Barbuda',
    'São Tomé & Príncipe': 'São Tomé and Príncipe',
    'St Lucia': 'Saint Lucia',
    'Brunei Darussalam': 'Brunei',
    'Congo': 'Republic of the Congo',
    'DR Congo': 'Democratic Republic of the Congo',
}

# Series.replace with a dict matches whole cell values exactly, so 'Congo'
# cannot touch 'DR Congo' and no regex anchors or escaping are needed.
# The previous chain of .str.replace(r'^...$', ...) calls relied on the
# pattern being treated as a regular expression; pandas >= 2.0 defaults to
# regex=False, where those anchored patterns are literal text and match
# nothing. Re-running this cell is harmless: no target name is also a key.
df_players['Nationality'] = df_players['Nationality'].replace(NATIONALITY_RENAMES)

In [26]:
# Redo the left join now that the nationalities have been renamed, then
# count the missing values per column again
df_player_lang = df_players.merge(df_languages_primary, how='left', on='Nationality')
df_player_lang.isna().sum()

ID                    0
Name                  0
Age                   0
Nationality           0
Overall               0
Potential             0
Club                  0
Value                 0
Wage                  0
Preferred Position    0
Primary Language      0
dtype: int64

Load the processed data into the Cloud SQL database

In [27]:
# Write the joined frame to the 'ea_players_language' table, replacing the
# table if it already exists; the dataframe index is not stored
df_player_lang.to_sql(
    name='ea_players_language',
    con=database_connection,
    if_exists='replace',
    index=False,
)

## Generate Final Data Quality Report

In [28]:
# Build the pandas_profiling report for the final joined dataframe; as the
# last expression of the cell, the report object is what the cell displays
pf.ProfileReport(df_player_lang)

0,1
Number of variables,11
Number of observations,17681
Total Missing (%),0.0%
Total size in memory,1.6 MiB
Average record size in memory,96.0 B

0,1
Numeric,6
Categorical,5
Boolean,0
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,29
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,25.11
Minimum,16
Maximum,47
Zeros (%),0.0%

0,1
Minimum,16
5-th percentile,18
Q1,21
Median,25
Q3,28
95-th percentile,33
Maximum,47
Range,31
Interquartile range,7

0,1
Standard deviation,4.6146
Coef of variation,0.18378
Kurtosis,-0.46676
Mean,25.11
MAD,3.8055
Skewness,0.39532
Sum,443973
Variance,21.295
Memory size,276.3 KiB

Value,Count,Frequency (%),Unnamed: 3
25,1488,8.4%,
23,1366,7.7%,
24,1317,7.4%,
22,1308,7.4%,
21,1276,7.2%,
20,1238,7.0%,
26,1184,6.7%,
27,1126,6.4%,
29,1096,6.2%,
19,1066,6.0%,

Value,Count,Frequency (%),Unnamed: 3
16,13,0.1%,
17,258,1.5%,
18,671,3.8%,
19,1066,6.0%,
20,1238,7.0%,

Value,Count,Frequency (%),Unnamed: 3
40,7,0.0%,
41,3,0.0%,
43,2,0.0%,
44,2,0.0%,
47,1,0.0%,

0,1
Distinct count,647
Unique (%),3.7%
Missing (%),0.0%
Missing (n),0

0,1
Olympique Lyonnais,33
Chelsea,33
OGC Nice,33
Other values (644),17582

Value,Count,Frequency (%),Unnamed: 3
Olympique Lyonnais,33,0.2%,
Chelsea,33,0.2%,
OGC Nice,33,0.2%,
VfL Wolfsburg,33,0.2%,
AS Monaco,33,0.2%,
Everton,33,0.2%,
ES Troyes AC,33,0.2%,
Brighton & Hove Albion,33,0.2%,
Bournemouth,33,0.2%,
Arsenal,33,0.2%,

0,1
Distinct count,17681
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,207640
Minimum,16
Maximum,241219
Zeros (%),0.0%

0,1
Minimum,16
5-th percentile,156300
Q1,192610
Median,214130
Q3,231510
95-th percentile,239810
Maximum,241219
Range,241203
Interquartile range,38902

0,1
Standard deviation,32436
Coef of variation,0.15621
Kurtosis,8.9999
Mean,207640
MAD,23388
Skewness,-2.3035
Sum,3671341698
Variance,1052100000
Memory size,916.3 KiB

Value,Count,Frequency (%),Unnamed: 3
231423,1,0.0%,
238651,1,0.0%,
238225,1,0.0%,
211602,1,0.0%,
237430,1,0.0%,
230037,1,0.0%,
223896,1,0.0%,
202404,1,0.0%,
215708,1,0.0%,
234151,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
16,1,0.0%,
28,1,0.0%,
41,1,0.0%,
80,1,0.0%,
591,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
241207,1,0.0%,
241213,1,0.0%,
241216,1,0.0%,
241218,1,0.0%,
241219,1,0.0%,

0,1
Distinct count,16747
Unique (%),94.7%
Missing (%),0.0%
Missing (n),0

0,1
J. Williams,7
J. Valencia,7
J. Rodríguez,7
Other values (16744),17660

Value,Count,Frequency (%),Unnamed: 3
J. Williams,7,0.0%,
J. Valencia,7,0.0%,
J. Rodríguez,7,0.0%,
Paulinho,6,0.0%,
Wanderson,6,0.0%,
J. Jones,6,0.0%,
Felipe,6,0.0%,
J. Martínez,5,0.0%,
L. Rodríguez,5,0.0%,
L. Martínez,5,0.0%,

0,1
Distinct count,161
Unique (%),0.9%
Missing (%),0.0%
Missing (n),0

0,1
United Kingdom,2137
Germany,1135
Spain,1009
Other values (158),13400

Value,Count,Frequency (%),Unnamed: 3
United Kingdom,2137,12.1%,
Germany,1135,6.4%,
Spain,1009,5.7%,
France,973,5.5%,
Argentina,961,5.4%,
Brazil,806,4.6%,
Italy,797,4.5%,
Colombia,591,3.3%,
Japan,469,2.7%,
Netherlands,429,2.4%,

0,1
Distinct count,49
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,66.198
Minimum,46
Maximum,94
Zeros (%),0.0%

0,1
Minimum,46
5-th percentile,54
Q1,62
Median,66
Q3,71
95-th percentile,77
Maximum,94
Range,48
Interquartile range,9

0,1
Standard deviation,6.9826
Coef of variation,0.10548
Kurtosis,0.0064208
Mean,66.198
MAD,5.5283
Skewness,0.012039
Sum,1170450
Variance,48.756
Memory size,276.3 KiB

Value,Count,Frequency (%),Unnamed: 3
66,1086,6.1%,
67,1056,6.0%,
64,1047,5.9%,
68,1009,5.7%,
65,1002,5.7%,
63,903,5.1%,
69,892,5.0%,
70,885,5.0%,
71,824,4.7%,
62,812,4.6%,

Value,Count,Frequency (%),Unnamed: 3
46,8,0.0%,
47,16,0.1%,
48,23,0.1%,
49,49,0.3%,
50,115,0.7%,

Value,Count,Frequency (%),Unnamed: 3
90,5,0.0%,
91,1,0.0%,
92,3,0.0%,
93,1,0.0%,
94,1,0.0%,

0,1
Distinct count,48
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,71.182
Minimum,46
Maximum,94
Zeros (%),0.0%

0,1
Minimum,46
5-th percentile,62
Q1,67
Median,71
Q3,75
95-th percentile,82
Maximum,94
Range,48
Interquartile range,8

0,1
Standard deviation,6.0884
Coef of variation,0.085533
Kurtosis,0.17123
Mean,71.182
MAD,4.8165
Skewness,0.2139
Sum,1258564
Variance,37.069
Memory size,276.3 KiB

Value,Count,Frequency (%),Unnamed: 3
70,1210,6.8%,
72,1196,6.8%,
71,1157,6.5%,
69,1133,6.4%,
68,1083,6.1%,
73,1062,6.0%,
67,1054,6.0%,
74,1010,5.7%,
66,897,5.1%,
75,874,4.9%,

Value,Count,Frequency (%),Unnamed: 3
46,2,0.0%,
48,1,0.0%,
49,2,0.0%,
50,1,0.0%,
51,2,0.0%,

Value,Count,Frequency (%),Unnamed: 3
90,18,0.1%,
91,8,0.0%,
92,10,0.1%,
93,3,0.0%,
94,4,0.0%,

0,1
Distinct count,15
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
CB,2652
ST,2255
GK,1982
Other values (12),10792

Value,Count,Frequency (%),Unnamed: 3
CB,2652,15.0%,
ST,2255,12.8%,
GK,1982,11.2%,
CM,1949,11.0%,
CDM,1362,7.7%,
RM,1324,7.5%,
LM,1305,7.4%,
LB,1291,7.3%,
RB,1185,6.7%,
CAM,1108,6.3%,

0,1
Distinct count,58
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0

0,1
Spanish,3657
English,3281
German,1636
Other values (55),9107

Value,Count,Frequency (%),Unnamed: 3
Spanish,3657,20.7%,
English,3281,18.6%,
German,1636,9.3%,
French,1434,8.1%,
Portuguese,1222,6.9%,
Italian,798,4.5%,
Dutch,713,4.0%,
Arabic,561,3.2%,
Japanese,469,2.7%,
Irish,417,2.4%,

0,1
Distinct count,207
Unique (%),1.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2390700
Minimum,0
Maximum,123000000
Zeros (%),0.0%

0,1
Minimum,0
5-th percentile,100000
Q1,325000
Median,700000
Q3,2100000
95-th percentile,10000000
Maximum,123000000
Range,123000000
Interquartile range,1775000

0,1
Standard deviation,5365700
Coef of variation,2.2444
Kurtosis,82.173
Mean,2390700
MAD,2690000
Skewness,7.1281
Sum,42269189950
Variance,28791000000000
Memory size,276.3 KiB

Value,Count,Frequency (%),Unnamed: 3
1100000,387,2.2%,
425000,362,2.0%,
350000,358,2.0%,
1200000,343,1.9%,
375000,338,1.9%,
525000,328,1.9%,
325000,315,1.8%,
400000,305,1.7%,
1000000,303,1.7%,
475000,298,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0,8,0.0%,
10000,13,0.1%,
20000,16,0.1%,
30000,36,0.2%,
40000,67,0.4%,

Value,Count,Frequency (%),Unnamed: 3
92000000,1,0.0%,
95500000,1,0.0%,
97000000,1,0.0%,
105000000,1,0.0%,
123000000,1,0.0%,

0,1
Distinct count,141
Unique (%),0.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,11647
Minimum,1000
Maximum,565000
Zeros (%),0.0%

0,1
Minimum,1000
5-th percentile,1000
Q1,2000
Median,4000
Q3,12000
95-th percentile,46000
Maximum,565000
Range,564000
Interquartile range,10000

0,1
Standard deviation,23179
Coef of variation,1.99
Kurtosis,89.569
Mean,11647
MAD,12033
Skewness,7.0537
Sum,205936000
Variance,537250000
Memory size,276.3 KiB

Value,Count,Frequency (%),Unnamed: 3
1000,4199,23.7%,
2000,2279,12.9%,
3000,1535,8.7%,
4000,1196,6.8%,
5000,924,5.2%,
6000,776,4.4%,
7000,612,3.5%,
8000,543,3.1%,
9000,412,2.3%,
10000,380,2.1%,

Value,Count,Frequency (%),Unnamed: 3
1000,4199,23.7%,
2000,2279,12.9%,
3000,1535,8.7%,
4000,1196,6.8%,
5000,924,5.2%,

Value,Count,Frequency (%),Unnamed: 3
340000,2,0.0%,
355000,1,0.0%,
370000,1,0.0%,
510000,1,0.0%,
565000,2,0.0%,

Unnamed: 0,ID,Name,Age,Nationality,Overall,Potential,Club,Value,Wage,Preferred Position,Primary Language
0,20801,Cristiano Ronaldo,32,Portugal,94,94,Real Madrid CF,95500000,565000,ST,Portuguese
1,158023,L. Messi,30,Argentina,93,93,FC Barcelona,105000000,565000,RW,Spanish
2,190871,Neymar,25,Brazil,92,94,Paris Saint-Germain,123000000,280000,LW,Portuguese
3,176580,L. Suárez,30,Uruguay,92,92,FC Barcelona,97000000,510000,ST,Spanish
4,167495,M. Neuer,31,Germany,92,92,FC Bayern Munich,61000000,230000,GK,German
