In [14]:
import pandas as pd 
import re

# Cleaning Data Using a Pipeline

### Step 1- Create a function that loads the raw data

In [31]:
def load_data(path='/home/hp/fifa21_raw_data.csv'):
    """Read the raw FIFA 21 CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the raw CSV file. Defaults to the path this notebook
        was originally written against, so ``load_data()`` keeps working;
        pass another location to run the pipeline on a different machine.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pandas.read_csv``.
    """
    # low_memory=False makes pandas read the file in one pass instead of in
    # chunks, which avoids mixed-dtype columns caused by per-chunk inference.
    return pd.read_csv(path, low_memory=False)

### Step 2- Create a function that cleans the data

In [None]:
# Contract span such as "2004 ~ 2021" inside the flattened team/contract text.
_CONTRACT_RE = re.compile(r'\d+\s~\s\d+')
# Imperial height such as 5'7" (feet, then optional inches).
_HEIGHT_RE = re.compile(r"^\s*(\d+)'\s*(\d+)?")


def _parse_scaled_number(text):
    """Parse '€103.5M', '€560K', '1.6K' or '372' into a float (M = 1e6, K = 1e3)."""
    if not isinstance(text, str):
        # Missing values (NaN) and already-numeric cells pass through as floats.
        return float(text)
    cleaned = text.replace("€", "").strip()
    if not cleaned:
        return float("nan")
    if "M" in cleaned:
        return float(cleaned.replace("M", "")) * 1_000_000
    if "K" in cleaned:
        return float(cleaned.replace("K", "")) * 1_000
    return float(cleaned)


def _parse_height_inches(text):
    """Convert a feet/inches string such as 5'7" into total inches as a float."""
    if not isinstance(text, str):
        return float(text)
    match = _HEIGHT_RE.match(text)
    if match:
        feet = int(match.group(1))
        inches = int(match.group(2)) if match.group(2) else 0
        return float(feet * 12 + inches)
    # No feet marker: treat the text as a plain number.
    return float(text.replace("'", "").replace('"', ""))


def _split_team_contract(raw):
    """Split one raw 'Team & Contract' cell into a (team, contract) pair."""
    flat = raw.replace("\n", " ").strip()
    match = _CONTRACT_RE.search(flat)
    contract = match.group() if match else ''
    lines = [part.strip() for part in raw.split("\n") if part.strip()]
    if len(lines) > 1:
        # Team and contract sit on separate lines: the first non-empty line
        # is the team, even when its name contains digits.
        team = lines[0]
    elif match:
        # Single line: the team is everything before the contract span.
        team = flat[:match.start()].strip()
    else:
        # No contract span found: keep the leading non-digit text.
        lead = re.match(r'[^\d]+', flat)
        team = lead.group().strip() if lead else ''
    return team, contract


def clean_data(data):
    """Return a cleaned copy of the raw FIFA 21 frame.

    The input frame is left untouched, so the function can be re-run on
    the same raw data.

    Transformations
    ---------------
    * ``Height``: feet/inches strings (``5'7"``) -> total inches as float.
    * ``Weight``: ``"159lbs"`` -> int.
    * ``Joined``: parsed to datetime; unparseable values become ``NaT``.
    * ``Value``, ``Release Clause``: ``€`` amounts with optional ``K``/``M``
      suffix -> float euros.
    * ``Wage``: same parsing, stored as int64 euros. The ``K`` multiplier is
      applied only when the suffix is present.
    * ``Hits``: counts with optional ``K`` suffix -> float (``1.6K`` -> 1600.0).
    * ``Team & Contract``: newlines replaced by spaces and stripped.
    * ``Conract``: the ``"YYYY ~ YYYY"`` span, or ``''`` when absent. The
      misspelled column name is kept so the output schema does not change.
    * ``Team``: the team name without surrounding whitespace.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing the columns listed above as strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned columns plus ``Conract`` and ``Team``.
    """
    data = data.copy()

    # Convert string measurements into numerical values.
    data['Height'] = data['Height'].apply(_parse_height_inches).astype(float)
    data['Weight'] = data['Weight'].apply(lambda x: x.replace("lbs", "")).astype(int)

    # Convert the joining date to datetime.
    data['Joined'] = pd.to_datetime(data['Joined'], errors='coerce')

    # Convert money and count columns using one shared parser.
    data['Value'] = data['Value'].apply(_parse_scaled_number).astype(float)
    # round() guards against float artefacts before the integer cast.
    data['Wage'] = data['Wage'].apply(_parse_scaled_number).astype(float).round().astype('int64')
    data['Hits'] = data['Hits'].apply(_parse_scaled_number).astype(float)
    data['Release Clause'] = data['Release Clause'].apply(_parse_scaled_number).astype(float)

    # Split team and contract from the raw text before flattening it, so the
    # line structure is still available to the splitter.
    team_contract_pairs = [_split_team_contract(cell) for cell in data['Team & Contract']]
    data['Team & Contract'] = data['Team & Contract'].apply(lambda x: x.replace("\n", " ").strip())
    data['Conract'] = [contract for _, contract in team_contract_pairs]
    data['Team'] = [team for team, _ in team_contract_pairs]
    return data

### Step 3- Save the cleaned data

In [41]:
# save cleaned data
def save_cleaned_dt(data, path='fifa_cleaned_data.csv'):
    """Write the cleaned frame to a CSV file.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to persist.
    path : str or path-like, optional
        Destination file. Defaults to ``'fifa_cleaned_data.csv'`` in the
        current working directory, matching the original behaviour.
    """
    # index=False keeps the positional row index out of the output file.
    data.to_csv(path, index=False)

### Step 4- Run the Pipeline

In [42]:
# Run the pipeline end to end: load the raw CSV, clean it, then write the result.

# raw_data is the frame returned by load_data(); cleaned_data is what clean_data() returns for it.
raw_data = load_data()
cleaned_data = clean_data(raw_data)
# Persist the cleaned frame as a CSV file via save_cleaned_dt().
save_cleaned_dt(cleaned_data)
