## Data Loading

In [1]:
import pandas as pd
data=pd.read_csv(r'../data/Football_players_data.csv')
data.head(3)

Unnamed: 0,name,Age,Overall rating,Potential,Team & Contract,ID,foot,Best overall,Best position,Value,...,GK Positioning,GK Reflexes,Total stats,Base stats,International reputation,Pace / Diving,Shooting / Handling,Passing / Kicking,Dribbling / Reflexes,Defending / Pace
0,15 Ronaldinho CAM ST,34,78,78,Querétaro\n2014 ~ 2016,28130,Right,77,CAM,€4.1M,...,6,14,1875,377,3,49,72,83,84,28
1,O. Hutchinson RM CAM,19,65,81,Ipswich Town\n2022 ~ 2025,260145,Left,67,CAM,€1.8M,...,5,8,1651,351,1,72,54,61,70,44
2,Brahim CAM,23,82,86,Real Madrid\n2019 ~ 2027,231410,Right,84,CAM,€43.5M,...,10,6,1897,409,1,85,74,78,85,31


## Data preprocessing

#### Remove duplicated row

In [2]:
print("No of duplicated rows: ",data.duplicated().sum())
data=data.drop_duplicates()
print("No of duplicated rows after droping dupliactes: ",data.duplicated().sum())

No of duplicated rows:  371
No of duplicated rows after droping dupliactes:  0


#### Removing Positions and number from name column

In [3]:
data['name']=data['name'].replace(data['Best position'].unique(),'',regex=True) #Replace Positions in name column with empty string
data['name']=data['name'].str.replace(r'\d+','',regex=True) #replace position numbers in name column with empty string
data['name']=data['name'].str.strip() #remove leading and trailing spaces
data.head(3)

Unnamed: 0,name,Age,Overall rating,Potential,Team & Contract,ID,foot,Best overall,Best position,Value,...,GK Positioning,GK Reflexes,Total stats,Base stats,International reputation,Pace / Diving,Shooting / Handling,Passing / Kicking,Dribbling / Reflexes,Defending / Pace
0,Ronaldinho,34,78,78,Querétaro\n2014 ~ 2016,28130,Right,77,CAM,€4.1M,...,6,14,1875,377,3,49,72,83,84,28
1,O. Hutchinson,19,65,81,Ipswich Town\n2022 ~ 2025,260145,Left,67,CAM,€1.8M,...,5,8,1651,351,1,72,54,61,70,44
2,Brahim,23,82,86,Real Madrid\n2019 ~ 2027,231410,Right,84,CAM,€43.5M,...,10,6,1897,409,1,85,74,78,85,31


#### Adding and Subtracting the additional ratings in Overall rating and Potential column

In [4]:
def evaluate_expression(value):
    if "+" in value:
        parts=value.split("+")
        return int(parts[0])+int(parts[1])
    elif '-' in value:
        parts=value.split("-")
        return int(parts[0])-int(parts[1])
    else:
        return int(value)
    
data['Overall rating']=data['Overall rating'].apply(evaluate_expression)
data['Potential']=data['Potential'].apply(evaluate_expression)
data.sample(3)

Unnamed: 0,name,Age,Overall rating,Potential,Team & Contract,ID,foot,Best overall,Best position,Value,...,GK Positioning,GK Reflexes,Total stats,Base stats,International reputation,Pace / Diving,Shooting / Handling,Passing / Kicking,Dribbling / Reflexes,Defending / Pace
2968,A. Motaraghebjafarpour,20,59,72,Kalmar\n2023 ~ 2025,275004,Right,60,RWB,€525K,...,14,13,1544,341,1,75,40,50,61,51
2941,F. Watson,20,69,80,Lanús\n2023 ~ 2027,272911,Right,71,CAM,€3.1M,...,13,5,1734,377,1,69,62,68,67,55
407,P. Kalulu,23,78,85,Milan\n2020 ~ 2027,255654,Right,80,CB,€26M,...,9,11,1960,421,1,80,50,68,69,79


#### Seperating Team and Contract column

In [5]:
data[['Team','Contract']]=data['Team & Contract'].str.split('\n',n=1,expand=True)
data=data.drop(columns=['Team & Contract','ID'])
data.sample(3)

Unnamed: 0,name,Age,Overall rating,Potential,foot,Best overall,Best position,Value,Wage,Release clause,...,Total stats,Base stats,International reputation,Pace / Diving,Shooting / Handling,Passing / Kicking,Dribbling / Reflexes,Defending / Pace,Team,Contract
1251,D. Ozoh,18,61,81,Right,63,CB,€850K,€2K,€2.5M,...,1691,365,1,67,53,55,63,58,Crystal Palace,2022 ~ 2024
590,C. Volpato,19,69,85,Left,71,CAM,€3.6M,€5K,€8.4M,...,1718,370,1,67,64,67,71,39,Sassuolo,2023 ~ 2025
2297,S. Soumano,22,68,75,Right,70,ST,€2.5M,€9K,€0,...,1556,337,1,69,69,52,69,25,Quevilly Rouen,"Jun 30, 2024 On loan"


In [6]:
def parse_euro(value):
    if 'M' in value:
        return float(value.replace("€",'').replace('M',""))*1_000_000
    elif 'K' in value:
        return float(value.replace("€",'').replace('K',""))*1_000
    else:
        return float(value.replace("€",''))

#### parsing M and K to numbers

In [7]:
data[['Value', 'Wage', 'Release clause']]=data[['Value', 'Wage', 'Release clause']].map(parse_euro)
data.head(3)

Unnamed: 0,name,Age,Overall rating,Potential,foot,Best overall,Best position,Value,Wage,Release clause,...,Total stats,Base stats,International reputation,Pace / Diving,Shooting / Handling,Passing / Kicking,Dribbling / Reflexes,Defending / Pace,Team,Contract
0,Ronaldinho,34,78,78,Right,77,CAM,4100000.0,45000.0,0.0,...,1875,377,3,49,72,83,84,28,Querétaro,2014 ~ 2016
1,O. Hutchinson,19,65,81,Left,67,CAM,1800000.0,3000.0,4500000.0,...,1651,351,1,72,54,61,70,44,Ipswich Town,2022 ~ 2025
2,Brahim,23,82,86,Right,84,CAM,43500000.0,145000.0,92400000.0,...,1897,409,1,85,74,78,85,31,Real Madrid,2019 ~ 2027


#### int conversion and arithmetic operation for stats columns

In [8]:
def evaluate_expression(value):
    try:
        if "+" in value:
            parts=value.split("+")
            return int(parts[0])+int(parts[1])
        elif '-' in value:
            parts=value.split("-")
            return int(parts[0])-int(parts[1])
        else:
            return int(value)
    except ValueError:
        return None

In [9]:
columns_to_do=['Total attacking',
       'Crossing', 'Finishing', 'Heading accuracy', 'Short passing', 'Volleys',
       'Total skill', 'Dribbling', 'Curve', 'FK Accuracy', 'Long passing',
       'Ball control', 'Total movement', 'Acceleration', 'Sprint speed',
       'Agility', 'Reactions', 'Balance', 'Total power', 'Shot power',
       'Jumping', 'Stamina', 'Strength', 'Long shots', 'Total mentality',
       'Aggression', 'Interceptions', 'Att. Position', 'Vision', 'Penalties',
       'Composure', 'Total defending', 'Defensive awareness',
       'Standing tackle', 'Sliding tackle', 'Total goalkeeping', 'GK Diving',
       'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes',
       'Total stats', 'Base stats', 'International reputation',
       'Pace / Diving', 'Shooting / Handling', 'Passing / Kicking',
       'Dribbling / Reflexes', 'Defending / Pace']

data[columns_to_do]=data[columns_to_do].astype(str).map(evaluate_expression)
data.sample(3)

Unnamed: 0,name,Age,Overall rating,Potential,foot,Best overall,Best position,Value,Wage,Release clause,...,Total stats,Base stats,International reputation,Pace / Diving,Shooting / Handling,Passing / Kicking,Dribbling / Reflexes,Defending / Pace,Team,Contract
283,E. Millot,20,75,83,Left,77,CAM,12500000.0,17000.0,23800000.0,...,1921,405,1,73,67,71,79,61,VfB Stuttgart,2021 ~ 2028
2843,Grafite RF,31,80,84,Right,80,ST,0.0,0.0,0.0,...,1937,411,3,77,81,64,75,33,VfL Wolfsburg,2007 ~ 2012
1943,A. Ounas,26,74,74,Left,74,RM,4600000.0,27000.0,8700000.0,...,1909,405,2,83,66,70,79,44,LOSC Lille,2022 ~ 2024


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2749 entries, 0 to 3111
Data columns (total 61 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      2749 non-null   object 
 1   Age                       2749 non-null   int64  
 2   Overall rating            2749 non-null   int64  
 3   Potential                 2749 non-null   int64  
 4   foot                      2749 non-null   object 
 5   Best overall              2749 non-null   int64  
 6   Best position             2749 non-null   object 
 7   Value                     2749 non-null   float64
 8   Wage                      2749 non-null   float64
 9   Release clause            2749 non-null   float64
 10  Total attacking           2749 non-null   int64  
 11  Crossing                  2749 non-null   int64  
 12  Finishing                 2748 non-null   float64
 13  Heading accuracy          2748 non-null   float64
 14  Short passing

#### Verify null values

In [11]:
data_NA=data.isna().sum().reset_index().head(50)
data_NA 

Unnamed: 0,index,0
0,name,0
1,Age,0
2,Overall rating,0
3,Potential,0
4,foot,0
5,Best overall,0
6,Best position,0
7,Value,0
8,Wage,0
9,Release clause,0


In [12]:
#### Remove null values
print("data before removing null value",data.shape)
data=data.dropna(axis=0)
print("data after removing null value",data.shape)

data before removing null value (2749, 61)
data after removing null value (2744, 61)


#### Expanding the positions column

In [13]:
data['Best position'].value_counts().reset_index()

Unnamed: 0,Best position,count
0,CAM,540
1,CB,466
2,ST,428
3,CM,212
4,CDM,187
5,GK,183
6,RM,174
7,LM,123
8,RB,103
9,LB,98


In [14]:
position_expansions = {
    "CAM": "Central Attacking Midfielder",
    "CB": "Center Back",
    "ST": "Striker",
    "CM": "Central Midfielder",
    "CDM": "Central Defensive Midfielder",
    "GK": "Goalkeeper",
    "RM": "Right Midfielder",
    "LM": "Left Midfielder",
    "RB": "Right Back",
    "LB": "Left Back",
    "LWB": "Left Wing Back",
    "RWB": "Right Wing Back",
    "RW": "Right Winger",
    "LW": "Left Winger",
    "CF": "Center Forward",
}

In [15]:
data['Best position expansion']=data['Best position'].replace(position_expansions)
data.head(3)

Unnamed: 0,name,Age,Overall rating,Potential,foot,Best overall,Best position,Value,Wage,Release clause,...,Base stats,International reputation,Pace / Diving,Shooting / Handling,Passing / Kicking,Dribbling / Reflexes,Defending / Pace,Team,Contract,Best position expansion
0,Ronaldinho,34,78,78,Right,77,CAM,4100000.0,45000.0,0.0,...,377,3,49,72,83,84,28,Querétaro,2014 ~ 2016,Central Attacking Midfielder
1,O. Hutchinson,19,65,81,Left,67,CAM,1800000.0,3000.0,4500000.0,...,351,1,72,54,61,70,44,Ipswich Town,2022 ~ 2025,Central Attacking Midfielder
2,Brahim,23,82,86,Right,84,CAM,43500000.0,145000.0,92400000.0,...,409,1,85,74,78,85,31,Real Madrid,2019 ~ 2027,Central Attacking Midfielder


#### Droping Columns

In [16]:
data.columns

Index(['name', 'Age', 'Overall rating', 'Potential', 'foot', 'Best overall',
       'Best position', 'Value', 'Wage', 'Release clause', 'Total attacking',
       'Crossing', 'Finishing', 'Heading accuracy', 'Short passing', 'Volleys',
       'Total skill', 'Dribbling', 'Curve', 'FK Accuracy', 'Long passing',
       'Ball control', 'Total movement', 'Acceleration', 'Sprint speed',
       'Agility', 'Reactions', 'Balance', 'Total power', 'Shot power',
       'Jumping', 'Stamina', 'Strength', 'Long shots', 'Total mentality',
       'Aggression', 'Interceptions', 'Att. Position', 'Vision', 'Penalties',
       'Composure', 'Total defending', 'Defensive awareness',
       'Standing tackle', 'Sliding tackle', 'Total goalkeeping', 'GK Diving',
       'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes',
       'Total stats', 'Base stats', 'International reputation',
       'Pace / Diving', 'Shooting / Handling', 'Passing / Kicking',
       'Dribbling / Reflexes', 'Defending / Pace', 'T

In [17]:
data=data[['name', 'Age','Team', 'Overall rating', 'Potential', 'foot', 'Best overall',
       'Best position','Best position expansion', 'Value', 'Wage', 'Release clause' ]]
data.head(3)

Unnamed: 0,name,Age,Team,Overall rating,Potential,foot,Best overall,Best position,Best position expansion,Value,Wage,Release clause
0,Ronaldinho,34,Querétaro,78,78,Right,77,CAM,Central Attacking Midfielder,4100000.0,45000.0,0.0
1,O. Hutchinson,19,Ipswich Town,65,81,Left,67,CAM,Central Attacking Midfielder,1800000.0,3000.0,4500000.0
2,Brahim,23,Real Madrid,82,86,Right,84,CAM,Central Attacking Midfielder,43500000.0,145000.0,92400000.0


## Export the data

In [19]:
data.to_excel('../processed_data/Football_players_data.xlsx',index=False)