In [279]:
import pandas as pd

## Prepare dataframe

In [280]:
# Load the raw records and make the 'ID' column the row index in one expression.
auto_df = pd.read_csv('../../datasets/auto.csv').set_index('ID')
auto_df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,X582HE161RUS,Ford Focus,2.0,2000.0,
4,E34877152RUS,Ford Focus,2.0,6100.0,
...,...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0,
927,M0309X197RUS,Ford Focus,1.0,22300.0,
928,O673E8197RUS,Ford Focus,2.0,600.0,
929,8610T8154RUS,Ford Focus,1.0,2000.0,


In [281]:
# Non-null entries per column; columns with a lower count than the others contain gaps.
auto_df.count(axis='index')

CarNumber       931
Make_n_model    931
Refund          914
Fines           869
History          82
dtype: int64

## Drop the duplicates

In [282]:
# Rows count as duplicates when these three columns all match;
# the remaining columns are not compared.
columns = ['CarNumber', 'Make_n_model', 'Fines']

# keep='last' retains the final occurrence within each group of duplicates.
auto_df.drop_duplicates(subset=columns, keep='last', inplace=True)
auto_df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,X582HE161RUS,Ford Focus,2.0,2000.0,
5,92918M178RUS,Ford Focus,1.0,5700.0,
...,...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0,
927,M0309X197RUS,Ford Focus,1.0,22300.0,
928,O673E8197RUS,Ford Focus,2.0,600.0,
929,8610T8154RUS,Ford Focus,1.0,2000.0,


In [283]:
# Re-check the non-null counts per column after de-duplication.
auto_df.count(axis='index')

CarNumber       725
Make_n_model    725
Refund          713
Fines           665
History          65
dtype: int64

## Fix missing values

In [284]:
# Number of missing values per column; `missing` is reused by the next cell
# to decide which columns to drop.
null_mask = auto_df.isna()
missing = null_mask.sum()
missing

CarNumber         0
Make_n_model      0
Refund           12
Fines            60
History         660
dtype: int64

In [285]:
# Columns with more than 500 missing values are dropped from the frame.
too_sparse = missing > 500
# `missing` comes from an earlier cell, so skip any label that is no longer a
# column of auto_df (presumably so this cell can be re-run without a KeyError).
still_present = missing.index.isin(auto_df.columns)
filt = missing[too_sparse & still_present].index
auto_df.drop(columns=filt, inplace=True)
auto_df


Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0
1,E432XX77RUS,Toyota Camry,1.0,6500.0
2,7184TT36RUS,Ford Focus,1.0,2100.0
3,X582HE161RUS,Ford Focus,2.0,2000.0
5,92918M178RUS,Ford Focus,1.0,5700.0
...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0
927,M0309X197RUS,Ford Focus,1.0,22300.0
928,O673E8197RUS,Ford Focus,2.0,600.0
929,8610T8154RUS,Ford Focus,1.0,2000.0


In [286]:
# Missing values per column after dropping the sparse column(s).
auto_df.isna().sum(axis='index')

CarNumber        0
Make_n_model     0
Refund          12
Fines           60
dtype: int64

In [287]:
# Forward-fill: each missing 'Refund' takes the value from the closest preceding row.
refund_filled = auto_df['Refund'].ffill()
auto_df['Refund'] = refund_filled
auto_df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0
1,E432XX77RUS,Toyota Camry,1.0,6500.0
2,7184TT36RUS,Ford Focus,1.0,2100.0
3,X582HE161RUS,Ford Focus,2.0,2000.0
5,92918M178RUS,Ford Focus,1.0,5700.0
...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0
927,M0309X197RUS,Ford Focus,1.0,22300.0
928,O673E8197RUS,Ford Focus,2.0,600.0
929,8610T8154RUS,Ford Focus,1.0,2000.0


In [288]:
# Missing values per column after forward-filling 'Refund'.
auto_df.isna().sum(axis='index')

CarNumber        0
Make_n_model     0
Refund           0
Fines           60
dtype: int64

In [289]:
# Impute missing fines with the mean of the observed fines (NaN values are skipped).
mean = auto_df['Fines'].mean(skipna=True)
# Fill only the 'Fines' column. A frame-wide auto_df.fillna(mean) would also
# write the fines mean into any other column that still contains NaN.
auto_df['Fines'] = auto_df['Fines'].fillna(mean)
auto_df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.000000
1,E432XX77RUS,Toyota Camry,1.0,6500.000000
2,7184TT36RUS,Ford Focus,1.0,2100.000000
3,X582HE161RUS,Ford Focus,2.0,2000.000000
5,92918M178RUS,Ford Focus,1.0,5700.000000
...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.000000
927,M0309X197RUS,Ford Focus,1.0,22300.000000
928,O673E8197RUS,Ford Focus,2.0,600.000000
929,8610T8154RUS,Ford Focus,1.0,2000.000000


In [290]:
# Final check of missing values per column after imputing 'Fines'.
auto_df.isna().sum(axis='index')

CarNumber       0
Make_n_model    0
Refund          0
Fines           0
dtype: int64

## Split and parse the make and model

In [291]:
# Split on the first space only (n=1): the first word becomes 'Make' and the
# remainder becomes 'Model'. Without n=1, a value with more than two words
# would expand into three or more columns and this two-column assignment
# would raise.
auto_df[['Make', 'Model']] = auto_df['Make_n_model'].str.split(' ', n=1, expand=True)
auto_df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.000000,Ford,Focus
1,E432XX77RUS,Toyota Camry,1.0,6500.000000,Toyota,Camry
2,7184TT36RUS,Ford Focus,1.0,2100.000000,Ford,Focus
3,X582HE161RUS,Ford Focus,2.0,2000.000000,Ford,Focus
5,92918M178RUS,Ford Focus,1.0,5700.000000,Ford,Focus
...,...,...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.000000,Ford,Focus
927,M0309X197RUS,Ford Focus,1.0,22300.000000,Ford,Focus
928,O673E8197RUS,Ford Focus,2.0,600.000000,Ford,Focus
929,8610T8154RUS,Ford Focus,1.0,2000.000000,Ford,Focus


In [292]:
# Remove the combined column; its contents were split into 'Make' and 'Model'.
auto_df.drop('Make_n_model', axis='columns', inplace=True)
auto_df

Unnamed: 0_level_0,CarNumber,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,2.0,3200.000000,Ford,Focus
1,E432XX77RUS,1.0,6500.000000,Toyota,Camry
2,7184TT36RUS,1.0,2100.000000,Ford,Focus
3,X582HE161RUS,2.0,2000.000000,Ford,Focus
5,92918M178RUS,1.0,5700.000000,Ford,Focus
...,...,...,...,...,...
926,Y163O8161RUS,2.0,1600.000000,Ford,Focus
927,M0309X197RUS,1.0,22300.000000,Ford,Focus
928,O673E8197RUS,2.0,600.000000,Ford,Focus
929,8610T8154RUS,1.0,2000.000000,Ford,Focus


In [293]:
# Export as a JSON array with one object per row, indented by 4 spaces.
# Note: orient='records' does not include the 'ID' index in the output.
auto_df.to_json(path_or_buf='auto.json', orient='records', indent=4)