# Settings 

In [160]:
import pandas as pd
import numpy as np

# Import datasets

We store the DataFrames in a dict keyed by `'train'` and `'test'`.

In [103]:
def load_data_as_dic(data_dir='./data'):
    """Load the training and test CSV files into a dict of DataFrames.

    Both files are semicolon-separated, use their first column as the index,
    and have ``zip_code`` read as ``str`` so it is not parsed as a number.

    Parameters
    ----------
    data_dir : str or path-like, default './data'
        Directory containing ``Training_Dataset.csv`` and ``Test_Dataset.csv``.

    Returns
    -------
    dict
        ``{'train': DataFrame, 'test': DataFrame}``.
    """
    # Shared parser options so both files are guaranteed to be read identically.
    read_options = dict(sep=';', index_col=0, dtype={'zip_code': str})
    file_names = {
        'train': 'Training_Dataset.csv',
        'test': 'Test_Dataset.csv',
    }
    return {
        key: pd.read_csv(f'{data_dir}/{file_name}', **read_options)
        for key, file_name in file_names.items()
    }

In [104]:
# Load both datasets once; data_dict maps 'train' / 'test' to their DataFrames.
data_dict = load_data_as_dic()

# Working with the dataframe

In [186]:
def cleanse_data(df):
    """Return a cleansed deep copy of ``df``; the input frame is not modified.

    Steps, in order:

    1. Cast ``legal_form``, ``type_pl`` and ``sector`` to ``category``.
    2. Prefix 4-character zip codes with ``'0'`` (leading-zero handling).
       Missing zip codes are left untouched.
    3. Drop the row labelled ``C_043`` if it is present.
    4. Fill missing P&L items with ``0.0``.
    5. Fill missing ``gross_performance`` with ``sales`` and missing
       ``gross_profit`` with ``sales`` times the median
       ``gross_profit / sales`` ratio of this frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named above. ``zip_code`` is expected to
        hold strings (or NaN).

    Returns
    -------
    pandas.DataFrame
        The cleansed copy.
    """
    categorical_cols = ['legal_form', 'type_pl', 'sector']
    pl_cols = ['earn_from_op', 'fin_result', 'oth_interest_inc', 'oth_interest_exp',
               'result_from_ord_bus', 'total_result', 'annual_profit']

    # Work on a deep copy so the caller's frame stays as loaded.
    df = df.copy(deep=True)

    # Make categorical data types
    for col in categorical_cols:
        df[col] = df[col].astype('category')

    # Handle leading zeros in zip code.
    # .str.len() yields NaN for missing zip codes instead of raising like
    # len(NaN) would, so rows without a zip code are simply not selected.
    is_four_chars = df['zip_code'].str.len().eq(4).fillna(False)
    df.loc[is_four_chars, 'zip_code'] = '0' + df.loc[is_four_chars, 'zip_code']

    # Drop C_043 in the training dataset (all variables NA)
    df = df.drop(index='C_043', errors='ignore')

    # Replace NaN in PL items with 0
    for col in pl_cols:
        df[col] = df[col].fillna(0.00)

    # Missing gross performance falls back to sales.
    df['gross_performance'] = df['gross_performance'].fillna(df['sales'])

    # Missing gross profit is estimated as sales times the median gross margin
    # observed in this frame (median() skips NaN ratios).
    median_margin = (df['gross_profit'] / df['sales']).median()
    df['gross_profit'] = df['gross_profit'].fillna(median_margin * df['sales'])

    # Return cleansed dataframe
    return df

In [189]:
# Cleanse the training set; cleanse_data works on a copy, so data_dict keeps the raw frame.
df_train = cleanse_data(data_dict.get('train'))

In [190]:
# Inspect dtypes and non-null counts of the cleansed training set.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 668 entries, C_120 to C_647
Data columns (total 39 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   default                  668 non-null    int64   
 1   legal_form               668 non-null    category
 2   zip_code                 668 non-null    object  
 3   sector                   668 non-null    category
 4   year_inc                 668 non-null    int64   
 5   count_emp                312 non-null    float64 
 6   type_pl                  668 non-null    category
 7   sales                    668 non-null    float64 
 8   gross_performance        668 non-null    float64 
 9   gross_profit             668 non-null    float64 
 10  earn_from_op             668 non-null    float64 
 11  fin_result               668 non-null    float64 
 12  oth_interest_inc         668 non-null    float64 
 13  oth_interest_exp         668 non-null    float64 
 14  result_fr