# Ex02 Preprocessing

In [1]:
import pandas as pd

## load csv

In [28]:
data = pd.read_csv('data/auto.csv', index_col = 'ID')
data.head()

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,X582HE161RUS,Ford Focus,2.0,2000.0,
4,E34877152RUS,Ford Focus,2.0,6100.0,


## counts

In [29]:
data.count()

CarNumber       931
Make_n_model    931
Refund          914
Fines           869
History          82
dtype: int64

## drop duplicates

In [30]:
data = data.drop_duplicates(['CarNumber', 'Make_n_model', 'Fines'], keep = 'last')

In [31]:
data.count()

CarNumber       725
Make_n_model    725
Refund          713
Fines           665
History          65
dtype: int64

## preprocess nans

In [32]:
data.isna().sum()

CarNumber         0
Make_n_model      0
Refund           12
Fines            60
History         660
dtype: int64

In [33]:
data = data.dropna(thresh = 500, axis = 'columns')

In [34]:
data.isna().sum()

CarNumber        0
Make_n_model     0
Refund          12
Fines           60
dtype: int64

In [35]:
data.Refund.fillna(method='ffill', inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data.Refund.fillna(method='ffill', inplace = True)
  data.Refund.fillna(method='ffill', inplace = True)


In [36]:
data.isna().sum()

CarNumber        0
Make_n_model     0
Refund           0
Fines           60
dtype: int64

In [82]:
data['Fines'] = data.Fines.fillna(data[data.Fines.notna()].Fines.mean())

In [83]:
data.isna().sum()

CarNumber       0
Make_n_model    0
Refund          0
Fines           0
dtype: int64

## separate and analyze make and model

In [92]:
data['Make'] = data.Make_n_model.apply(lambda x: x.split()[0])

In [93]:
data['Model'] = data.Make_n_model.apply(lambda x: ''.join(x.split()[1:]))

In [95]:
data.drop(columns = ['Make_n_model'], inplace = True)

In [97]:
data.head()

Unnamed: 0_level_0,CarNumber,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,2.0,3200.0,Ford,Focus
1,E432XX77RUS,1.0,6500.0,Toyota,Camry
2,7184TT36RUS,1.0,2100.0,Ford,Focus
3,X582HE161RUS,2.0,2000.0,Ford,Focus
5,92918M178RUS,1.0,5700.0,Ford,Focus


## save to json

In [99]:
data.to_json('data/auto.json', orient='records')