In [19]:
import numpy as np
import pandas as pd
import requests

pd.options.display.float_format = '{:.2f}'.format


df = pd.read_json('../data/auto.json')
df = df[['CarNumber', 'Make', 'Model', 'Refund', 'Fines']]
df

Unnamed: 0,CarNumber,Make,Model,Refund,Fines
0,Y163O8161RUS,Ford,Focus,2,3200.00
1,E432XX77RUS,Toyota,Camry,1,6500.00
2,7184TT36RUS,Ford,Focus,1,2100.00
3,X582HE161RUS,Ford,Focus,2,2000.00
4,92918M178RUS,Ford,Focus,1,5700.00
...,...,...,...,...,...
720,Y163O8161RUS,Ford,Focus,2,1600.00
721,M0309X197RUS,Ford,Focus,1,22300.00
722,O673E8197RUS,Ford,Focus,2,600.00
723,8610T8154RUS,Ford,Focus,1,2000.00


In [2]:
missing_val = df['Model'].isnull().count()
missing_val

np.int64(725)

In [3]:
random_state = 21
car_info = df[['CarNumber','Make','Model']].drop_duplicates()
sampled_car_info = car_info.sample(n=200,replace=True,random_state=random_state).reset_index(drop=True)
sampled_car_info

refund_sample = df['Refund'].sample(n=200,replace=True,random_state=random_state).reset_index(drop=True)
fines_sample = df['Fines'].sample(n=200,replace=True,random_state=random_state).reset_index(drop=True)

new_sample = pd.concat([sampled_car_info,refund_sample,fines_sample],axis=1)
new_sample.columns = ['CarNumber','Make','Model','Refund','Fines']
new_sample


Unnamed: 0,CarNumber,Make,Model,Refund,Fines
0,T6329O50RUS,Ford,Focus,1,1500.00
1,H917TC36RUS,Ford,Focus,2,4000.00
2,9763HY161RUS,Ford,Focus,1,4500.00
3,H003MH197RUS,Ford,Focus,2,2000.00
4,O64097197RUS,Ford,Focus,2,1300.00
...,...,...,...,...,...
195,X782CO96RUS,Ford,Focus,1,2000.00
196,E79988152RUS,Ford,Focus,2,400.00
197,X582HE161RUS,Ford,Focus,1,12800.00
198,9594HY161RUS,Ford,Focus,2,800.00


In [4]:
concat_rows = pd.concat([df, new_sample],ignore_index=True)
concat_rows


Unnamed: 0,CarNumber,Make,Model,Refund,Fines
0,Y163O8161RUS,Ford,Focus,2,3200.00
1,E432XX77RUS,Toyota,Camry,1,6500.00
2,7184TT36RUS,Ford,Focus,1,2100.00
3,X582HE161RUS,Ford,Focus,2,2000.00
4,92918M178RUS,Ford,Focus,1,5700.00
...,...,...,...,...,...
920,X782CO96RUS,Ford,Focus,1,2000.00
921,E79988152RUS,Ford,Focus,2,400.00
922,X582HE161RUS,Ford,Focus,1,12800.00
923,9594HY161RUS,Ford,Focus,2,800.00


In [5]:

np.random.seed(21)

year_series = pd.Series(np.random.randint(1980,2020,size = 925),name='Year')
year_series

0      1989
1      1995
2      1984
3      2015
4      2014
       ... 
920    1981
921    1992
922    2007
923    2005
924    1997
Name: Year, Length: 925, dtype: int64

In [6]:
fines = pd.concat([concat_rows,year_series],axis=1)
fines.head(20)

Unnamed: 0,CarNumber,Make,Model,Refund,Fines,Year
0,Y163O8161RUS,Ford,Focus,2,3200.0,1989
1,E432XX77RUS,Toyota,Camry,1,6500.0,1995
2,7184TT36RUS,Ford,Focus,1,2100.0,1984
3,X582HE161RUS,Ford,Focus,2,2000.0,2015
4,92918M178RUS,Ford,Focus,1,5700.0,2014
5,H234YH197RUS,Ford,Focus,2,6000.0,1990
6,E40577152RUS,Ford,Focus,1,8594.59,1988
7,707987163RUS,Ford,Focus,2,2200.0,2016
8,K330T8197RUS,Skoda,Octavia,2,8200.0,2018
9,X786CO96RUS,Ford,Focus,1,8594.59,2000


In [7]:
surname_df = pd.read_json('../data/surname.json')

surname_df = surname_df.iloc[1:].reset_index(drop=True)
surname_df.columns = ['Surname', 'Count', 'Rank']

surname_df['Count'] = surname_df['Count'].astype(int)
surname_df['Rank'] = surname_df['Rank'].astype(int)

unique_car_numbers = sampled_car_info['CarNumber'].unique()
surname_sample = surname_df['Surname'].sample(n=len(unique_car_numbers),replace=True).reset_index(drop=True)


In [8]:
owners = pd.DataFrame({
    'CarNumber': unique_car_numbers,
    'SURNAME': surname_sample
})
owners

Unnamed: 0,CarNumber,SURNAME
0,T6329O50RUS,BAKER
1,H917TC36RUS,CRUZ
2,9763HY161RUS,MARTIN
3,H003MH197RUS,REED
4,O64097197RUS,COOPER
...,...,...
160,E53277152RUS,HUGHES
161,T912CT197RUS,MILLER
162,X582HE161RUS,BAILEY
163,9594HY161RUS,GOMEZ


In [9]:
new_rows = pd.DataFrame([
    ["X2K321EE32R","BMW","M5",1,2400.00,2010],
    ["C5K3212d32R","Toyota","Corolla",2,4200.00,2014],
    ["32FGE325HCV","Honda","Civic",4,1200.00,2012],
    ["5DK3212d32R","Chevrolet","Malibu",3,6000.00,2018],
    ["GR33212d32R","Nissan","Altima",2,5600.00,2001],
],columns=['CarNumber','Make','Model','Refund','Fines','Year'])

new_rows

Unnamed: 0,CarNumber,Make,Model,Refund,Fines,Year
0,X2K321EE32R,BMW,M5,1,2400.0,2010
1,C5K3212d32R,Toyota,Corolla,2,4200.0,2014
2,32FGE325HCV,Honda,Civic,4,1200.0,2012
3,5DK3212d32R,Chevrolet,Malibu,3,6000.0,2018
4,GR33212d32R,Nissan,Altima,2,5600.0,2001


In [10]:
fines = pd.concat([fines,new_rows],ignore_index=True)
fines

Unnamed: 0,CarNumber,Make,Model,Refund,Fines,Year
0,Y163O8161RUS,Ford,Focus,2,3200.00,1989
1,E432XX77RUS,Toyota,Camry,1,6500.00,1995
2,7184TT36RUS,Ford,Focus,1,2100.00,1984
3,X582HE161RUS,Ford,Focus,2,2000.00,2015
4,92918M178RUS,Ford,Focus,1,5700.00,2014
...,...,...,...,...,...,...
925,X2K321EE32R,BMW,M5,1,2400.00,2010
926,C5K3212d32R,Toyota,Corolla,2,4200.00,2014
927,32FGE325HCV,Honda,Civic,4,1200.00,2012
928,5DK3212d32R,Chevrolet,Malibu,3,6000.00,2018


In [11]:
owners = owners.drop(owners.tail(20).index)
owners

Unnamed: 0,CarNumber,SURNAME
0,T6329O50RUS,BAKER
1,H917TC36RUS,CRUZ
2,9763HY161RUS,MARTIN
3,H003MH197RUS,REED
4,O64097197RUS,COOPER
...,...,...
140,M298CH161RUS,MORGAN
141,8611T8154RUS,SANDERS
142,K899T8197RUS,HARRIS
143,T6419O50RUS,HILL


In [12]:
new_own = pd.DataFrame([
    ['H917TC32RUS','ADAMS'],
    ['GW17TC22RUS','SMITH'],
    ['EW17TC21RUS','PETER'],
],columns=['CarNumber','SURNAME'])
new_own

Unnamed: 0,CarNumber,SURNAME
0,H917TC32RUS,ADAMS
1,GW17TC22RUS,SMITH
2,EW17TC21RUS,PETER


In [13]:
owners = pd.concat([owners,new_own],ignore_index=True)
owners

Unnamed: 0,CarNumber,SURNAME
0,T6329O50RUS,BAKER
1,H917TC36RUS,CRUZ
2,9763HY161RUS,MARTIN
3,H003MH197RUS,REED
4,O64097197RUS,COOPER
...,...,...
143,T6419O50RUS,HILL
144,9182CE154RUS,DIAZ
145,H917TC32RUS,ADAMS
146,GW17TC22RUS,SMITH


In [14]:
merged_df = pd.merge(fines,owners,on='CarNumber',how='inner')
merged_df

Unnamed: 0,CarNumber,Make,Model,Refund,Fines,Year,SURNAME
0,Y163O8161RUS,Ford,Focus,2,3200.00,1989,MENDOZA
1,E432XX77RUS,Toyota,Camry,1,6500.00,1995,GRAY
2,H234YH197RUS,Ford,Focus,2,6000.00,1990,BROOKS
3,K330T8197RUS,Skoda,Octavia,2,8200.00,2018,GUTIERREZ
4,C477M7161RUS,Ford,Focus,1,2500.00,2000,RICHARDSON
...,...,...,...,...,...,...,...
374,X765HY197RUS,Ford,Focus,2,1600.00,2017,CRUZ
375,X782CO96RUS,Ford,Focus,2,72500.00,2001,BROOKS
376,M372CH197RUS,Ford,Focus,2,12000.00,1997,MOORE
377,X782CO96RUS,Ford,Focus,1,2000.00,1981,BROOKS


In [15]:
merged_left = pd.merge(fines,owners,on='CarNumber',how='left')
merged_left

Unnamed: 0,CarNumber,Make,Model,Refund,Fines,Year,SURNAME
0,Y163O8161RUS,Ford,Focus,2,3200.00,1989,MENDOZA
1,E432XX77RUS,Toyota,Camry,1,6500.00,1995,GRAY
2,7184TT36RUS,Ford,Focus,1,2100.00,1984,
3,X582HE161RUS,Ford,Focus,2,2000.00,2015,
4,92918M178RUS,Ford,Focus,1,5700.00,2014,
...,...,...,...,...,...,...,...
925,X2K321EE32R,BMW,M5,1,2400.00,2010,
926,C5K3212d32R,Toyota,Corolla,2,4200.00,2014,
927,32FGE325HCV,Honda,Civic,4,1200.00,2012,
928,5DK3212d32R,Chevrolet,Malibu,3,6000.00,2018,


In [16]:
merged_right = pd.merge(fines,owners,on='CarNumber',how='right')
merged_right

Unnamed: 0,CarNumber,Make,Model,Refund,Fines,Year,SURNAME
0,T6329O50RUS,Ford,Focus,1.00,500.00,2017.00,BAKER
1,T6329O50RUS,Ford,Focus,1.00,1500.00,1985.00,BAKER
2,T6329O50RUS,Ford,Focus,1.00,43600.00,1996.00,BAKER
3,T6329O50RUS,Ford,Focus,1.00,100.00,2002.00,BAKER
4,H917TC36RUS,Ford,Focus,2.00,4000.00,1982.00,CRUZ
...,...,...,...,...,...,...,...
377,9182CE154RUS,Ford,Focus,2.00,13200.00,1996.00,DIAZ
378,9182CE154RUS,Ford,Focus,1.00,6000.00,1988.00,DIAZ
379,H917TC32RUS,,,,,,ADAMS
380,GW17TC22RUS,,,,,,SMITH


In [17]:
pivot_table = fines.pivot_table(
    values='Fines',
    index = ['Make','Model'],
    columns='Year',
    aggfunc='sum'
)
pivot_table

Unnamed: 0_level_0,Year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
BMW,M5,,,,,,,,,,,...,2400.0,,,,,,,,,
Chevrolet,Malibu,,,,,,,,,,,...,,,,,,,,,6000.0,
Ford,Focus,56489.17,398589.17,140383.76,62300.0,112494.59,189583.76,104994.59,132800.0,95489.17,125700.0,...,120183.76,86689.17,120200.0,149294.59,157494.59,210789.17,83694.59,268200.0,283594.59,117100.0
Ford,Mondeo,,,,,,,,,,8600.0,...,,,34400.0,,,,48100.0,,,
Honda,Civic,,,,,,,,,,,...,,,1200.0,,,,,,,
Nissan,Altima,,,,,,,,,,,...,,,,,,,,,,
Skoda,Octavia,1900.0,,6900.0,11594.59,,10294.59,600.0,5200.0,500.0,91400.0,...,3100.0,500.0,500.0,19594.59,3300.0,46394.59,300.0,4000.0,156200.0,9500.0
Toyota,Camry,28500.0,8594.59,,7200.0,,,,,,22400.0,...,,3300.0,10594.59,,,,,1000.0,13000.0,18100.0
Toyota,Corolla,,,2000.0,800.0,,,,8000.0,,4000.0,...,24000.0,8594.59,,,4200.0,,,9600.0,,
Volkswagen,Golf,30900.0,,,8594.59,300.0,24000.0,,9300.0,,5800.0,...,,300.0,,,,2300.0,,,,


In [18]:
fines.to_csv('../data/fines.csv',index=False)
owners.to_csv('../data/owners.csv',index=False)