In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
dirty_cafe = pd.read_csv('dirty_cafe_sales.csv')
dirty_cafe.head()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11


## Data Wrangling

In [3]:
print("This dataset has {} samples and {} features.".format(dirty_cafe.shape[0], dirty_cafe.shape[1]))

This dataset has 10000 samples and 8 features.


In [4]:
# Check for missing values
missing_values = dirty_cafe.isnull().sum()
missing_values

Transaction ID         0
Item                 333
Quantity             138
Price Per Unit       179
Total Spent          173
Payment Method      2579
Location            3265
Transaction Date     159
dtype: int64

In [5]:
# Check for duplicates
duplicates = dirty_cafe.duplicated().sum()
duplicates

np.int64(0)

In [7]:
# Check for features missing more than 50% of their values we will drop them as that will not be useful
missing_values[missing_values > dirty_cafe.shape[0]/2] # No features missing more than 50% of their values

# Check for features with only one unique value
unique_values = dirty_cafe.nunique()
unique_values[unique_values == 1]
print("There are {} features with only one unique value.".format(unique_values[unique_values == 1].shape[0]))

There are 0 features with only one unique value.


In [8]:
# Check missing values in the dataset
missing_values = dirty_cafe.isnull().sum()
missing_values

Transaction ID         0
Item                 333
Quantity             138
Price Per Unit       179
Total Spent          173
Payment Method      2579
Location            3265
Transaction Date     159
dtype: int64

In [11]:
# Item column 
# Check for values in the item column
dirty_cafe['Item'].value_counts()


Item
Juice       1171
Coffee      1165
Salad       1148
Cake        1139
Sandwich    1131
Smoothie    1096
Cookie      1092
Tea         1089
UNKNOWN      344
ERROR        292
Name: count, dtype: int64

In [12]:
# Replace the UNKNOWN and ERROR values with a new item - burger and then the rest with the most frequent item
dirty_cafe['Item'] = dirty_cafe['Item'].replace(['UNKNOWN', 'ERROR'], 'burger')
dirty_cafe['Item'] = dirty_cafe['Item'].replace(np.nan, 'burger')

# Check for missing values in the item column
dirty_cafe['Item'].isnull().sum()

np.int64(0)

In [14]:
# Quantity column
# Convert Quantity to numeric type
dirty_cafe['Quantity'] = pd.to_numeric(dirty_cafe['Quantity'], errors='coerce')
# Check for negative values
dirty_cafe[dirty_cafe['Quantity'] < 0]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [15]:
# Quantity column value counts
dirty_cafe['Quantity'].value_counts()

Quantity
5.0    2013
2.0    1974
4.0    1863
3.0    1849
1.0    1822
Name: count, dtype: int64

In [17]:
# Missing values in the Quantity column will be replaced with the median
dirty_cafe['Quantity'] = dirty_cafe['Quantity'].replace(np.nan, dirty_cafe['Quantity'].median())

# Check for missing values in the Quantity column
dirty_cafe['Quantity'].isnull().sum()

np.int64(0)

In [18]:
# Price Per Unit column
# Convert Price Per Unit to numeric type
dirty_cafe['Price Per Unit'] = pd.to_numeric(dirty_cafe['Price Per Unit'], errors='coerce')
# Check for negative values
dirty_cafe[dirty_cafe['Price Per Unit'] < 0]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [19]:
# Just like the Quantity column, missing values in the Price Per Unit column will be replaced with the median
dirty_cafe['Price Per Unit'] = dirty_cafe['Price Per Unit'].replace(np.nan, dirty_cafe['Price Per Unit'].median())

# Check for missing values in the Price Per Unit column
dirty_cafe['Price Per Unit'].isnull().sum()

np.int64(0)

In [20]:
# Price Per Unit column value counts
dirty_cafe['Price Per Unit'].value_counts()

Price Per Unit
3.0    2962
4.0    2331
2.0    1227
5.0    1204
1.0    1143
1.5    1133
Name: count, dtype: int64

In [21]:
# Total Spent column
# Convert Total Spent to numeric type
dirty_cafe['Total Spent'] = pd.to_numeric(dirty_cafe['Total Spent'], errors='coerce')
# Check for negative values
dirty_cafe[dirty_cafe['Total Spent'] < 0]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [22]:
# Total Spent column value counts
dirty_cafe['Total Spent'].value_counts()

Total Spent
6.0     979
12.0    939
3.0     930
4.0     923
20.0    746
15.0    734
8.0     677
10.0    524
2.0     497
9.0     479
5.0     468
16.0    444
25.0    259
7.5     237
1.0     232
4.5     225
1.5     205
Name: count, dtype: int64

In [23]:
# Check if there are any values with explicit UNKNOWN or ERROR
dirty_cafe['Total Spent'].value_counts().sort_index()

Total Spent
1.0     232
1.5     205
2.0     497
3.0     930
4.0     923
4.5     225
5.0     468
6.0     979
7.5     237
8.0     677
9.0     479
10.0    524
12.0    939
15.0    734
16.0    444
20.0    746
25.0    259
Name: count, dtype: int64

In [25]:
# Missing values in the Total Spent column will be replaced with the median
dirty_cafe['Total Spent'] = dirty_cafe['Total Spent'].replace(np.nan, dirty_cafe['Total Spent'].median())

# Check for missing values in the Total Spent column
dirty_cafe['Total Spent'].isnull().sum()

np.int64(0)

In [26]:
# Payment Method column
# Check for values in the Payment Method column
dirty_cafe['Payment Method'].value_counts()

Payment Method
Digital Wallet    2291
Credit Card       2273
Cash              2258
ERROR              306
UNKNOWN            293
Name: count, dtype: int64

In [27]:
# Replace the UNKNOWN and ERROR values with a new payment method - crypto and then the rest with the most frequent payment method
dirty_cafe['Payment Method'] = dirty_cafe['Payment Method'].replace(['UNKNOWN', 'ERROR'], 'Crypto')

# Let the rest of the missing values be filled with the most frequent payment method
dirty_cafe['Payment Method'] = dirty_cafe['Payment Method'].replace(np.nan, dirty_cafe['Payment Method'].mode()[0])

# Check for missing values in the Payment Method column
dirty_cafe['Payment Method'].isnull().sum()


np.int64(0)

In [28]:
# Location column
# Check for values in the Location column
dirty_cafe['Location'].value_counts()

Location
Takeaway    3022
In-store    3017
ERROR        358
UNKNOWN      338
Name: count, dtype: int64

In [29]:
# Replace the UNKNOWN and ERROR values with either of the two most frequent locations randomly and then the rest with the most frequent location
dirty_cafe['Location'] = dirty_cafe['Location'].replace(['UNKNOWN', 'ERROR'], dirty_cafe['Location'].value_counts().index[0]) # this picks the most frequent location
dirty_cafe['Location'] = dirty_cafe['Location'].replace(np.nan, dirty_cafe['Location'].mode()[0])

# Check for missing values in the Location column
dirty_cafe['Location'].isnull().sum()

np.int64(0)

In [30]:
# Transaction Date column
# Convert Date to datetime type
dirty_cafe['Transaction Date'] = pd.to_datetime(dirty_cafe['Transaction Date'], errors='coerce')

# Check for missing values in the Transaction Date column
dirty_cafe['Transaction Date'].isnull().sum()

np.int64(460)

In [31]:
# There are still 460 errors in the Transaction Date column. We will replace them with the most frequent date
dirty_cafe['Transaction Date'] = dirty_cafe['Transaction Date'].replace(np.nan, dirty_cafe['Transaction Date'].mode()[0])

# Check for missing values in the Transaction Date column
dirty_cafe['Transaction Date'].isnull().sum()

np.int64(0)

In [36]:
# Check for most frequent date and see the full output rows
pd.set_option('display.max_rows', None) 
dirty_cafe['Transaction Date'].value_counts()

Transaction Date
2023-02-06    500
2023-06-16     40
2023-09-21     39
2023-03-13     39
2023-07-24     39
2023-07-21     39
2023-01-05     38
2023-06-18     37
2023-01-25     37
2023-10-22     37
2023-11-07     36
2023-08-07     36
2023-06-30     36
2023-01-12     36
2023-04-30     36
2023-11-23     36
2023-04-06     35
2023-09-06     35
2023-06-28     35
2023-10-08     35
2023-03-26     35
2023-11-06     34
2023-08-26     34
2023-05-18     34
2023-12-09     34
2023-04-20     34
2023-03-16     34
2023-08-17     33
2023-06-06     33
2023-06-23     33
2023-04-21     33
2023-01-30     33
2023-10-19     33
2023-12-24     33
2023-05-05     33
2023-10-26     33
2023-06-01     33
2023-04-02     33
2023-09-10     33
2023-08-11     33
2023-01-14     32
2023-07-26     32
2023-05-21     32
2023-12-22     32
2023-08-14     32
2023-01-29     32
2023-03-30     32
2023-06-24     32
2023-02-15     32
2023-10-21     32
2023-10-20     32
2023-01-06     31
2023-11-26     31
2023-12-03     31
2023-04-23 

In [37]:
dirty_cafe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Transaction ID    10000 non-null  object        
 1   Item              10000 non-null  object        
 2   Quantity          10000 non-null  float64       
 3   Price Per Unit    10000 non-null  float64       
 4   Total Spent       10000 non-null  float64       
 5   Payment Method    10000 non-null  object        
 6   Location          10000 non-null  object        
 7   Transaction Date  10000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 625.1+ KB


In [38]:
# Save the cleaned dataset
dirty_cafe.to_csv('cleaned_cafe_sales.csv', index=False)

## Task 2: Fifa 21 raw Data Wrangling

In [86]:
pd.set_option('display.max_columns', None)
fifa21 = pd.read_csv('fifa21 raw data v2.csv')
fifa21.head()

Unnamed: 0,ID,Name,LongName,photoUrl,playerUrl,Nationality,Age,↓OVA,POT,Club,Contract,Positions,Height,Weight,Preferred Foot,BOV,Best Position,Joined,Loan Date End,Value,Wage,Release Clause,Attacking,Crossing,Finishing,Heading Accuracy,Short Passing,Volleys,Skill,Dribbling,Curve,FK Accuracy,Long Passing,Ball Control,Movement,Acceleration,Sprint Speed,Agility,Reactions,Balance,Power,Shot Power,Jumping,Stamina,Strength,Long Shots,Mentality,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Defending,Marking,Standing Tackle,Sliding Tackle,Goalkeeping,GK Diving,GK Handling,GK Kicking,GK Positioning,GK Reflexes,Total Stats,Base Stats,W/F,SM,A/W,D/W,IR,PAC,SHO,PAS,DRI,DEF,PHY,Hits
0,158023,L. Messi,Lionel Messi,https://cdn.sofifa.com/players/158/023/21_60.png,http://sofifa.com/player/158023/lionel-messi/2...,Argentina,33,93,93,\n\n\n\nFC Barcelona,2004 ~ 2021,"RW, ST, CF",170cm,72kg,Left,93,RW,"Jul 1, 2004",,€103.5M,€560K,€138.4M,429,85,95,70,91,88,470,96,93,94,91,96,451,91,80,91,94,95,389,86,68,72,69,94,347,44,40,93,95,75,96,91,32,35,24,54,6,11,15,14,8,2231,466,4 ★,4★,Medium,Low,5 ★,85,92,91,95,38,65,771
1,20801,Cristiano Ronaldo,C. Ronaldo dos Santos Aveiro,https://cdn.sofifa.com/players/020/801/21_60.png,http://sofifa.com/player/20801/c-ronaldo-dos-s...,Portugal,35,92,92,\n\n\n\nJuventus,2018 ~ 2022,"ST, LW",187cm,83kg,Right,92,ST,"Jul 10, 2018",,€63M,€220K,€75.9M,437,84,95,90,82,86,414,88,81,76,77,92,431,87,91,87,95,71,444,94,95,84,78,93,353,63,29,95,82,84,95,84,28,32,24,58,7,11,15,14,11,2221,464,4 ★,5★,High,Low,5 ★,89,93,81,89,35,77,562
2,200389,J. Oblak,Jan Oblak,https://cdn.sofifa.com/players/200/389/21_60.png,http://sofifa.com/player/200389/jan-oblak/210006/,Slovenia,27,91,93,\n\n\n\nAtlético Madrid,2014 ~ 2023,GK,188cm,87kg,Right,91,GK,"Jul 16, 2014",,€120M,€125K,€159.4M,95,13,11,15,43,13,109,12,13,14,40,30,307,43,60,67,88,49,268,59,78,41,78,12,140,34,19,11,65,11,68,57,27,12,18,437,87,92,78,90,90,1413,489,3 ★,1★,Medium,Medium,3 ★,87,92,78,90,52,90,150
3,192985,K. De Bruyne,Kevin De Bruyne,https://cdn.sofifa.com/players/192/985/21_60.png,http://sofifa.com/player/192985/kevin-de-bruyn...,Belgium,29,91,91,\n\n\n\nManchester City,2015 ~ 2023,"CAM, CM",181cm,70kg,Right,91,CAM,"Aug 30, 2015",,€129M,€370K,€161M,407,94,82,55,94,82,441,88,85,83,93,92,398,77,76,78,91,76,408,91,63,89,74,91,408,76,66,88,94,84,91,186,68,65,53,56,15,13,5,10,13,2304,485,5 ★,4★,High,High,4 ★,76,86,93,88,64,78,207
4,190871,Neymar Jr,Neymar da Silva Santos Jr.,https://cdn.sofifa.com/players/190/871/21_60.png,http://sofifa.com/player/190871/neymar-da-silv...,Brazil,28,91,91,\n\n\n\nParis Saint-Germain,2017 ~ 2022,"LW, CAM",175cm,68kg,Right,91,LW,"Aug 3, 2017",,€132M,€270K,€166.5M,408,85,87,62,87,87,448,95,88,89,81,95,453,94,89,96,91,83,357,80,62,81,50,84,356,51,36,87,90,92,93,94,35,30,29,59,9,9,15,15,11,2175,451,5 ★,5★,High,Medium,5 ★,91,85,86,94,36,59,595


In [87]:
print("This dataset has {} samples and {} features.".format(fifa21.shape[0], fifa21.shape[1]))

This dataset has 18979 samples and 77 features.


In [88]:
# Check for missing values
missing_values = fifa21.isnull().sum()
missing_values

ID                      0
Name                    0
LongName                0
photoUrl                0
playerUrl               0
Nationality             0
Age                     0
↓OVA                    0
POT                     0
Club                    0
Contract                0
Positions               0
Height                  0
Weight                  0
Preferred Foot          0
BOV                     0
Best Position           0
Joined                  0
Loan Date End       17966
Value                   0
Wage                    0
Release Clause          0
Attacking               0
Crossing                0
Finishing               0
Heading Accuracy        0
Short Passing           0
Volleys                 0
Skill                   0
Dribbling               0
Curve                   0
FK Accuracy             0
Long Passing            0
Ball Control            0
Movement                0
Acceleration            0
Sprint Speed            0
Agility                 0
Reactions   

In [89]:
# Check for features with more than 50% missing values and drop as they are not useful
fifa21 = fifa21.drop(columns=missing_values[missing_values > fifa21.shape[0]/2].index) 

# Check for features with only one unique value
unique_values = fifa21.nunique()
unique_values[unique_values == 1]
print("There are {} features with only one unique value.".format(unique_values[unique_values == 1].shape[0]))

There are 0 features with only one unique value.


In [90]:
# Hits column
# Convert Hits to numeric type but first check the data type
fifa21['Hits'].dtype

# It has a dtype[O] which means it is a string. We will convert it to numeric type
# Convert Hits to numeric type
fifa21['Hits'] = pd.to_numeric(fifa21['Hits'], errors='coerce')

In [91]:
# Now we can check for negative values
fifa21[fifa21['Hits'] < 0]
print("There are {} negative values in the Hits column.".format(fifa21[fifa21['Hits'] < 0].shape[0]))

There are 0 negative values in the Hits column.


In [92]:
# Now we will impute the missing values in the Hits column with the median
fifa21['Hits'] = fifa21['Hits'].replace(np.nan, fifa21['Hits'].median())

In [93]:
# Club column
# Check current unique Club values
print("Before cleaning:")
print(fifa21['Club'].unique()[:5])

# Clean the Club column by removing \n and extra whitespace
fifa21['Club'] = fifa21['Club'].str.replace('\n', '').str.strip()

# Verify cleanup
print("\nAfter cleaning:")
print(fifa21['Club'].unique()[:5]) 

# Check if there are any missing values after cleaning
print("\nMissing values:", fifa21['Club'].isnull().sum())

Before cleaning:
['\n\n\n\nFC Barcelona' '\n\n\n\nJuventus' '\n\n\n\nAtlético Madrid'
 '\n\n\n\nManchester City' '\n\n\n\nParis Saint-Germain']

After cleaning:
['FC Barcelona' 'Juventus' 'Atlético Madrid' 'Manchester City'
 'Paris Saint-Germain']

Missing values: 0


In [94]:
# First inspect the Contract column format
print("Before cleaning:")
print(fifa21['Contract'].value_counts().head())

# Clean and standardize contract format
# Remove '~' and split into start/end years
fifa21['Contract Start'] = fifa21['Contract'].str.split('~').str[0].str.strip()
fifa21['Contract End'] = fifa21['Contract'].str.split('~').str[1].str.strip()

# Convert to consistent format
fifa21['Contract'] = fifa21['Contract Start'] + ' - ' + fifa21['Contract End']

# Drop temporary columns
fifa21 = fifa21.drop(['Contract Start', 'Contract End'], axis=1)

# Verify cleanup
print("\nAfter cleaning:")
print(fifa21['Contract'].value_counts().head())

Before cleaning:
Contract
2019 ~ 2021    1706
2020 ~ 2022    1445
2020 ~ 2021    1440
2019 ~ 2022    1236
2018 ~ 2021    1163
Name: count, dtype: int64

After cleaning:
Contract
2019 - 2021    1706
2020 - 2022    1445
2020 - 2021    1440
2019 - 2022    1236
2018 - 2021    1163
Name: count, dtype: int64


In [95]:
# Value and Wage

# Function to convert value strings to numeric
def convert_currency(value):
    # Remove € symbol and spaces
    value = value.replace('€', '').strip()
    
    # Convert K (thousands) to millions
    if 'K' in value:
        value = float(value.replace('K', '')) / 1000
    # Convert M (millions) directly
    elif 'M' in value:
        value = float(value.replace('M', ''))
    
    return value

# Clean Value column
fifa21['Value'] = fifa21['Value'].apply(convert_currency)

# Clean Wage column
fifa21['Wage'] = fifa21['Wage'].apply(convert_currency)

# Verify results
print("Value sample (in millions):", fifa21['Value'].head())
print("Wage sample (in millions):", fifa21['Wage'].head())

Value sample (in millions): 0    103.5
1     63.0
2    120.0
3    129.0
4    132.0
Name: Value, dtype: object
Wage sample (in millions): 0     0.56
1     0.22
2    0.125
3     0.37
4     0.27
Name: Wage, dtype: object


In [96]:
def clean_monetary_value(value):
    try:
        # Remove € and spaces
        value = value.replace('€', '').strip()
        
        # Convert to millions
        if 'K' in value:
            return float(value.replace('K', '')) / 1000
        elif 'M' in value:
            return float(value.replace('M', ''))
        else:
            return float(value)
    except:
        return None

# Clean monetary columns
monetary_columns = ['Release Clause']
for col in monetary_columns:
    fifa21[col] = fifa21[col].apply(clean_monetary_value)

# Verify results
for col in monetary_columns:
    print(f"\n{col} sample (in millions):")
    print(fifa21[col].head())
    print(f"Missing values in {col}:", fifa21[col].isnull().sum())


Release Clause sample (in millions):
0    138.4
1     75.9
2    159.4
3    161.0
4    166.5
Name: Release Clause, dtype: float64
Missing values in Release Clause: 0


In [97]:
# First check raw values
print("Original heights:", fifa21['Height'].head())
print("Original weights:", fifa21['Weight'].head())

def standardize_height(height_str):
    try:
        # Remove 'cm' and convert to float
        return float(height_str.replace('cm', ''))
    except:
        return None

def standardize_weight(weight_str):
    try:
        if 'kg' in weight_str:
            return float(weight_str.replace('kg', ''))
        elif 'lbs' in weight_str:
            # Convert lbs to kg
            lbs = float(weight_str.replace('lbs', ''))
            return round(lbs * 0.45359237, 1)
    except:
        return None

# Apply standardization
fifa21['Height'] = fifa21['Height'].apply(standardize_height)
fifa21['Weight'] = fifa21['Weight'].apply(standardize_weight)

# Verify results
print("\nStandardized heights (cm):", fifa21['Height'].head())
print("\nStandardized weights (kg):", fifa21['Weight'].head())

Original heights: 0    170cm
1    187cm
2    188cm
3    181cm
4    175cm
Name: Height, dtype: object
Original weights: 0    72kg
1    83kg
2    87kg
3    70kg
4    68kg
Name: Weight, dtype: object

Standardized heights (cm): 0    170.0
1    187.0
2    188.0
3    181.0
4    175.0
Name: Height, dtype: float64

Standardized weights (kg): 0    72.0
1    83.0
2    87.0
3    70.0
4    68.0
Name: Weight, dtype: float64


In [98]:
fifa21.head()

Unnamed: 0,ID,Name,LongName,photoUrl,playerUrl,Nationality,Age,↓OVA,POT,Club,Contract,Positions,Height,Weight,Preferred Foot,BOV,Best Position,Joined,Value,Wage,Release Clause,Attacking,Crossing,Finishing,Heading Accuracy,Short Passing,Volleys,Skill,Dribbling,Curve,FK Accuracy,Long Passing,Ball Control,Movement,Acceleration,Sprint Speed,Agility,Reactions,Balance,Power,Shot Power,Jumping,Stamina,Strength,Long Shots,Mentality,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Defending,Marking,Standing Tackle,Sliding Tackle,Goalkeeping,GK Diving,GK Handling,GK Kicking,GK Positioning,GK Reflexes,Total Stats,Base Stats,W/F,SM,A/W,D/W,IR,PAC,SHO,PAS,DRI,DEF,PHY,Hits
0,158023,L. Messi,Lionel Messi,https://cdn.sofifa.com/players/158/023/21_60.png,http://sofifa.com/player/158023/lionel-messi/2...,Argentina,33,93,93,FC Barcelona,2004 - 2021,"RW, ST, CF",170.0,72.0,Left,93,RW,"Jul 1, 2004",103.5,0.56,138.4,429,85,95,70,91,88,470,96,93,94,91,96,451,91,80,91,94,95,389,86,68,72,69,94,347,44,40,93,95,75,96,91,32,35,24,54,6,11,15,14,8,2231,466,4 ★,4★,Medium,Low,5 ★,85,92,91,95,38,65,771.0
1,20801,Cristiano Ronaldo,C. Ronaldo dos Santos Aveiro,https://cdn.sofifa.com/players/020/801/21_60.png,http://sofifa.com/player/20801/c-ronaldo-dos-s...,Portugal,35,92,92,Juventus,2018 - 2022,"ST, LW",187.0,83.0,Right,92,ST,"Jul 10, 2018",63.0,0.22,75.9,437,84,95,90,82,86,414,88,81,76,77,92,431,87,91,87,95,71,444,94,95,84,78,93,353,63,29,95,82,84,95,84,28,32,24,58,7,11,15,14,11,2221,464,4 ★,5★,High,Low,5 ★,89,93,81,89,35,77,562.0
2,200389,J. Oblak,Jan Oblak,https://cdn.sofifa.com/players/200/389/21_60.png,http://sofifa.com/player/200389/jan-oblak/210006/,Slovenia,27,91,93,Atlético Madrid,2014 - 2023,GK,188.0,87.0,Right,91,GK,"Jul 16, 2014",120.0,0.125,159.4,95,13,11,15,43,13,109,12,13,14,40,30,307,43,60,67,88,49,268,59,78,41,78,12,140,34,19,11,65,11,68,57,27,12,18,437,87,92,78,90,90,1413,489,3 ★,1★,Medium,Medium,3 ★,87,92,78,90,52,90,150.0
3,192985,K. De Bruyne,Kevin De Bruyne,https://cdn.sofifa.com/players/192/985/21_60.png,http://sofifa.com/player/192985/kevin-de-bruyn...,Belgium,29,91,91,Manchester City,2015 - 2023,"CAM, CM",181.0,70.0,Right,91,CAM,"Aug 30, 2015",129.0,0.37,161.0,407,94,82,55,94,82,441,88,85,83,93,92,398,77,76,78,91,76,408,91,63,89,74,91,408,76,66,88,94,84,91,186,68,65,53,56,15,13,5,10,13,2304,485,5 ★,4★,High,High,4 ★,76,86,93,88,64,78,207.0
4,190871,Neymar Jr,Neymar da Silva Santos Jr.,https://cdn.sofifa.com/players/190/871/21_60.png,http://sofifa.com/player/190871/neymar-da-silv...,Brazil,28,91,91,Paris Saint-Germain,2017 - 2022,"LW, CAM",175.0,68.0,Right,91,LW,"Aug 3, 2017",132.0,0.27,166.5,408,85,87,62,87,87,448,95,88,89,81,95,453,94,89,96,91,83,357,80,62,81,50,84,356,51,36,87,90,92,93,94,35,30,29,59,9,9,15,15,11,2175,451,5 ★,5★,High,Medium,5 ★,91,85,86,94,36,59,595.0


In [99]:
# Check original values
print("Original values:")
print("W/F sample:", fifa21['W/F'].head())
print("SM sample:", fifa21['SM'].head())
print("IR sample:", fifa21['IR'].head())

# Function to clean star ratings
def clean_stars(value):
    try:
        # Remove star symbol and convert to numeric
        return float(str(value).replace('★', '').strip())
    except:
        return None

# Clean star rating columns
star_columns = ['W/F', 'SM', 'IR']
for col in star_columns:
    fifa21[col] = fifa21[col].apply(clean_stars)

# Verify results
print("\nCleaned values:")
for col in star_columns:
    print(f"\n{col} ratings:", fifa21[col].head())
    print(f"Value counts for {col}:\n", fifa21[col].value_counts().sort_index())

Original values:
W/F sample: 0    4 ★
1    4 ★
2    3 ★
3    5 ★
4    5 ★
Name: W/F, dtype: object
SM sample: 0    4★
1    5★
2    1★
3    4★
4    5★
Name: SM, dtype: object
IR sample: 0    5 ★
1    5 ★
2    3 ★
3    4 ★
4    5 ★
Name: IR, dtype: object

Cleaned values:

W/F ratings: 0    4.0
1    4.0
2    3.0
3    5.0
4    5.0
Name: W/F, dtype: float64
Value counts for W/F:
 W/F
1.0      138
2.0     4141
3.0    11695
4.0     2722
5.0      283
Name: count, dtype: int64

SM ratings: 0    4.0
1    5.0
2    1.0
3    4.0
4    5.0
Name: SM, dtype: float64
Value counts for SM:
 SM
1.0    2075
2.0    9142
3.0    6577
4.0    1130
5.0      55
Name: count, dtype: int64

IR ratings: 0    5.0
1    5.0
2    3.0
3    4.0
4    5.0
Name: IR, dtype: float64
Value counts for IR:
 IR
1.0    17629
2.0     1018
3.0      281
4.0       45
5.0        6
Name: count, dtype: int64


In [100]:
# Names and long names

# Display sample of original values
print("Original values:")
print("Name sample:", fifa21['Name'].head())
print("LongName sample:", fifa21['LongName'].head())

# Clean name columns
name_columns = ['Name', 'LongName']
for col in name_columns:
    # Remove periods and strip whitespace
    fifa21[col] = fifa21[col].str.replace('.', '').str.strip()

# Verify results
print("\nCleaned values:")
for col in name_columns:
    print(f"\n{col} sample:", fifa21[col].head())
    print(f"Sample unique values in {col}:\n", fifa21[col].nunique())

Original values:
Name sample: 0             L. Messi
1    Cristiano Ronaldo
2             J. Oblak
3         K. De Bruyne
4            Neymar Jr
Name: Name, dtype: object
LongName sample: 0                    Lionel Messi
1    C. Ronaldo dos Santos Aveiro
2                       Jan Oblak
3                 Kevin De Bruyne
4      Neymar da Silva Santos Jr.
Name: LongName, dtype: object

Cleaned values:

Name sample: 0              L Messi
1    Cristiano Ronaldo
2              J Oblak
3          K De Bruyne
4            Neymar Jr
Name: Name, dtype: object
Sample unique values in Name:
 17920

LongName sample: 0                   Lionel Messi
1    C Ronaldo dos Santos Aveiro
2                      Jan Oblak
3                Kevin De Bruyne
4      Neymar da Silva Santos Jr
Name: LongName, dtype: object
Sample unique values in LongName:
 18852


In [103]:
fifa21.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18979 entries, 0 to 18978
Data columns (total 76 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                18979 non-null  int64  
 1   Name              18979 non-null  object 
 2   LongName          18979 non-null  object 
 3   photoUrl          18979 non-null  object 
 4   playerUrl         18979 non-null  object 
 5   Nationality       18979 non-null  object 
 6   Age               18979 non-null  int64  
 7   ↓OVA              18979 non-null  int64  
 8   POT               18979 non-null  int64  
 9   Club              18979 non-null  object 
 10  Contract          18979 non-null  object 
 11  Positions         18979 non-null  object 
 12  Height            18939 non-null  float64
 13  Weight            18979 non-null  float64
 14  Preferred Foot    18979 non-null  object 
 15  BOV               18979 non-null  int64  
 16  Best Position     18979 non-null  object

In [102]:
# The contract column accumulated some nans after cleaning. We will replace them with the most frequent contract
fifa21['Contract'] = fifa21['Contract'].replace(np.nan, fifa21['Contract'].mode()[0])

# Check for missing values
missing_values = fifa21.isnull().sum()

In [105]:
# Create mapping of old to new column names
column_renames = {
    'Height': 'Height (cm)',
    'Weight': 'Weight (kg)',
    'Value': 'Value (M€)',
    'Wage': 'Wage (M€)',
    'Release Clause': 'Release Clause (M€)'
}

# Rename columns
fifa21 = fifa21.rename(columns=column_renames)

# Verify new column names
print("Updated columns:")
print(fifa21[list(column_renames.values())].head())

Updated columns:
   Height (cm)  Weight (kg) Value (M€) Wage (M€)  Release Clause (M€)
0        170.0         72.0      103.5      0.56                138.4
1        187.0         83.0       63.0      0.22                 75.9
2        188.0         87.0      120.0     0.125                159.4
3        181.0         70.0      129.0      0.37                161.0
4        175.0         68.0      132.0      0.27                166.5


In [106]:
# Create mapping for star rating columns
star_column_renames = {
    'W/F': 'Weak Foot (★)',
    'SM': 'Skill Moves (★)',
    'IR': 'International Reputation (★)'
}

# Rename columns
fifa21 = fifa21.rename(columns=star_column_renames)

# Verify new column names
print("Updated star rating columns:")
print(fifa21[list(star_column_renames.values())].head())

Updated star rating columns:
   Weak Foot (★)  Skill Moves (★)  International Reputation (★)
0            4.0              4.0                           5.0
1            4.0              5.0                           5.0
2            3.0              1.0                           3.0
3            5.0              4.0                           4.0
4            5.0              5.0                           5.0


In [109]:
# Display sample of original values
print("Original Joined dates:")
print(fifa21['Joined'].head())

# Convert to datetime with consistent format
fifa21['Joined'] = pd.to_datetime(fifa21['Joined'])

# Format as YYYY-MM-DD
fifa21['Joined'] = fifa21['Joined'].dt.strftime('%Y-%m-%d')

# Rename column to indicate format
fifa21 = fifa21.rename(columns={'Joined': 'Joined (YYYY-MM-DD)'})

# Verify results
print("\nStandardized dates:")
print(fifa21['Joined (YYYY-MM-DD)'].head())
print("\nValue counts:")
print(fifa21['Joined (YYYY-MM-DD)'].value_counts().head())

Original Joined dates:
0     Jul 1, 2004
1    Jul 10, 2018
2    Jul 16, 2014
3    Aug 30, 2015
4     Aug 3, 2017
Name: Joined, dtype: object

Standardized dates:
0    2004-07-01
1    2018-07-10
2    2014-07-16
3    2015-08-30
4    2017-08-03
Name: Joined (YYYY-MM-DD), dtype: object

Value counts:
Joined (YYYY-MM-DD)
2019-07-01    1344
2018-07-01     865
2019-01-01     682
2017-07-01     520
2020-07-01     435
Name: count, dtype: int64


In [118]:
fifa21.head()

Unnamed: 0,ID,Name,LongName,photoUrl,playerUrl,Nationality,Age,↓OVA,POT,Club,Contract,Positions,Height (cm),Weight (kg),Preferred Foot,BOV,Best Position,Joined (YYYY-MM-DD),Value (M€),Wage (M€),Release Clause (M€),Attacking,Crossing,Finishing,Heading Accuracy,Short Passing,Volleys,Skill,Dribbling,Curve,FK Accuracy,Long Passing,Ball Control,Movement,Acceleration,Sprint Speed,Agility,Reactions,Balance,Power,Shot Power,Jumping,Stamina,Strength,Long Shots,Mentality,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Defending,Marking,Standing Tackle,Sliding Tackle,Goalkeeping,GK Diving,GK Handling,GK Kicking,GK Positioning,GK Reflexes,Total Stats,Base Stats,Weak Foot (★),Skill Moves (★),Attacking Work Rate,Defensive Work Rate,International Reputation (★),PAC,SHO,PAS,DRI,DEF,PHY,Hits
0,158023,L Messi,Lionel Messi,https://cdn.sofifa.com/players/158/023/21_60.png,http://sofifa.com/player/158023/lionel-messi/2...,Argentina,33,93,93,FC Barcelona,2004 - 2021,"RW, ST, CF",170.0,72.0,Left,93,RW,2004-07-01,103.5,0.56,138.4,429,85,95,70,91,88,470,96,93,94,91,96,451,91,80,91,94,95,389,86,68,72,69,94,347,44,40,93,95,75,96,91,32,35,24,54,6,11,15,14,8,2231,466,4.0,4.0,Medium,Low,5.0,85,92,91,95,38,65,771.0
1,20801,Cristiano Ronaldo,C Ronaldo dos Santos Aveiro,https://cdn.sofifa.com/players/020/801/21_60.png,http://sofifa.com/player/20801/c-ronaldo-dos-s...,Portugal,35,92,92,Juventus,2018 - 2022,"ST, LW",187.0,83.0,Right,92,ST,2018-07-10,63.0,0.22,75.9,437,84,95,90,82,86,414,88,81,76,77,92,431,87,91,87,95,71,444,94,95,84,78,93,353,63,29,95,82,84,95,84,28,32,24,58,7,11,15,14,11,2221,464,4.0,5.0,High,Low,5.0,89,93,81,89,35,77,562.0
2,200389,J Oblak,Jan Oblak,https://cdn.sofifa.com/players/200/389/21_60.png,http://sofifa.com/player/200389/jan-oblak/210006/,Slovenia,27,91,93,Atlético Madrid,2014 - 2023,GK,188.0,87.0,Right,91,GK,2014-07-16,120.0,0.125,159.4,95,13,11,15,43,13,109,12,13,14,40,30,307,43,60,67,88,49,268,59,78,41,78,12,140,34,19,11,65,11,68,57,27,12,18,437,87,92,78,90,90,1413,489,3.0,1.0,Medium,Medium,3.0,87,92,78,90,52,90,150.0
3,192985,K De Bruyne,Kevin De Bruyne,https://cdn.sofifa.com/players/192/985/21_60.png,http://sofifa.com/player/192985/kevin-de-bruyn...,Belgium,29,91,91,Manchester City,2015 - 2023,"CAM, CM",181.0,70.0,Right,91,CAM,2015-08-30,129.0,0.37,161.0,407,94,82,55,94,82,441,88,85,83,93,92,398,77,76,78,91,76,408,91,63,89,74,91,408,76,66,88,94,84,91,186,68,65,53,56,15,13,5,10,13,2304,485,5.0,4.0,High,High,4.0,76,86,93,88,64,78,207.0
4,190871,Neymar Jr,Neymar da Silva Santos Jr,https://cdn.sofifa.com/players/190/871/21_60.png,http://sofifa.com/player/190871/neymar-da-silv...,Brazil,28,91,91,Paris Saint-Germain,2017 - 2022,"LW, CAM",175.0,68.0,Right,91,LW,2017-08-03,132.0,0.27,166.5,408,85,87,62,87,87,448,95,88,89,81,95,453,94,89,96,91,83,357,80,62,81,50,84,356,51,36,87,90,92,93,94,35,30,29,59,9,9,15,15,11,2175,451,5.0,5.0,High,Medium,5.0,91,85,86,94,36,59,595.0


In [114]:
missing_values = fifa21.isnull().sum()
missing_values

ID                              0
Name                            0
LongName                        0
photoUrl                        0
playerUrl                       0
Nationality                     0
Age                             0
↓OVA                            0
POT                             0
Club                            0
Contract                        0
Positions                       0
Height (cm)                     0
Weight (kg)                     0
Preferred Foot                  0
BOV                             0
Best Position                   0
Joined (YYYY-MM-DD)             0
Value (M€)                      0
Wage (M€)                       0
Release Clause (M€)             0
Attacking                       0
Crossing                        0
Finishing                       0
Heading Accuracy                0
Short Passing                   0
Volleys                         0
Skill                           0
Dribbling                       0
Curve         

In [113]:
# Height has accumulated some missing values after cleaning. We will replace them with the median
fifa21['Height (cm)'] = fifa21['Height (cm)'].replace(np.nan, fifa21['Height (cm)'].median())

# Check for missing values
missing_values = fifa21.isnull().sum()

In [117]:
# Create mapping for attack/defense work rate columns
workrate_renames = {
    'A/W': 'Attacking Work Rate',
    'D/W': 'Defensive Work Rate'
}

# Rename columns
fifa21 = fifa21.rename(columns=workrate_renames)

# Verify new column names
print("Updated work rate columns:")
print(fifa21[list(workrate_renames.values())].head())
print("\nValue counts:")
for col in workrate_renames.values():
    print(f"\n{col}:\n", fifa21[col].value_counts())

Updated work rate columns:
  Attacking Work Rate Defensive Work Rate
0              Medium                 Low
1                High                 Low
2              Medium              Medium
3                High                High
4                High              Medium

Value counts:

Attacking Work Rate:
 Attacking Work Rate
Medium    12701
High       5288
Low         990
Name: count, dtype: int64

Defensive Work Rate:
 Defensive Work Rate
Medium    13956
High       3297
Low        1726
Name: count, dtype: int64


In [119]:
# Save the cleaned dataset
fifa21.to_csv('cleaned_fifa21.csv', index=False)