In [4]:
import pandas as pd
import numpy as np
import requests

## read the JSON file that you saved in ex02

In [1]:
dataset_link = "https://drive.google.com/file/d/1djlN-ujJm1IL6j-ePaygIMxNHKf8P7D9/view?usp=sharing"
file_id = dataset_link.split("/")[-2]
!gdown {file_id}
!ls

Downloading...
From: https://drive.google.com/uc?id=1djlN-ujJm1IL6j-ePaygIMxNHKf8P7D9
To: /Users/dhawkgir/ds/day05/ex04/auto.json
100%|██████████████████████████████████████| 64.1k/64.1k [00:00<00:00, 1.01MB/s]
auto.json        enrichment.ipynb


In [2]:
file_name = 'auto.json'

In [5]:
df = pd.read_json(file_name, orient='records')

In [55]:
pd.options.display.float_format = '{:.2f}'.format

In [69]:
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2,1600.00,Ford,Focus
721,M0309X197RUS,1,22300.00,Ford,Focus
722,O673E8197RUS,2,600.00,Ford,Focus
723,8610T8154RUS,1,2000.00,Ford,Focus


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 725 entries, 0 to 724
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  725 non-null    object 
 1   Refund     725 non-null    int64  
 2   Fines      725 non-null    float64
 3   Make       725 non-null    object 
 4   Model      716 non-null    object 
dtypes: float64(1), int64(1), object(3)
memory usage: 28.4+ KB


## enrich the dataframe using a sample from that dataframe

In [70]:
samples = 200
random_state = 21

In [117]:
np.random.seed(random_state)

In [118]:
enrich_df = df.iloc[np.random.randint(0, len(df), samples)]
enrich_df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
207,Y351O8197RUS,1,1500.00,Ford,Focus
48,H917TC36RUS,2,4000.00,Ford,Focus
368,C589EY154RUS,1,4500.00,Ford,Focus
120,K846YE77RUS,2,2000.00,Volkswagen,Passat
419,X4108H125RUS,2,1300.00,Ford,Focus
...,...,...,...,...,...
587,M942OT152RUS,1,2000.00,Ford,Focus
595,Y187O8161RUS,2,400.00,Ford,Focus
365,7064C8197RUS,1,12800.00,Volkswagen,Passat
474,8437XX154RUS,2,800.00,Ford,Focus


In [119]:
pd.set_option('mode.chained_assignment', None)

In [120]:
enrich_df.loc[:, ['Refund']] = np.random.randint(df.Refund.min(), df.Refund.max() + 1, samples)
enrich_df.loc[:, ['Fines']] = df.Fines.max() * np.random.random(samples)
enrich_df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
207,Y351O8197RUS,2,168558.07,Ford,Focus
48,H917TC36RUS,2,135334.21,Ford,Focus
368,C589EY154RUS,1,102219.25,Ford,Focus
120,K846YE77RUS,2,64799.03,Volkswagen,Passat
419,X4108H125RUS,2,118728.93,Ford,Focus
...,...,...,...,...,...
587,M942OT152RUS,1,174828.71,Ford,Focus
595,Y187O8161RUS,1,178191.32,Ford,Focus
365,7064C8197RUS,2,63548.89,Volkswagen,Passat
474,8437XX154RUS,2,147950.60,Ford,Focus


In [121]:
concat_rows = pd.concat([df, enrich_df], ignore_index=True)
concat_rows

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
920,M942OT152RUS,1,174828.71,Ford,Focus
921,Y187O8161RUS,1,178191.32,Ford,Focus
922,7064C8197RUS,2,63548.89,Volkswagen,Passat
923,8437XX154RUS,2,147950.60,Ford,Focus


## enrich the dataframe concat_rows by a new column with the data generated

In [122]:
np.random.seed(random_state)

In [127]:
Year = pd.Series(np.random.randint(1980, 2019+1, len(concat_rows)), name='Year')
Year

0      1994
1      2007
2      1980
3      2006
4      1983
       ... 
920    2005
921    2013
922    2018
923    2003
924    2009
Name: Year, Length: 925, dtype: int64

In [131]:
fines = concat_rows.merge(Year, left_index=True, right_index=True)
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1994
1,E432XX77RUS,1,6500.00,Toyota,Camry,2007
2,7184TT36RUS,1,2100.00,Ford,Focus,1980
3,X582HE161RUS,2,2000.00,Ford,Focus,2006
4,92918M178RUS,1,5700.00,Ford,Focus,1983
...,...,...,...,...,...,...
920,M942OT152RUS,1,174828.71,Ford,Focus,2005
921,Y187O8161RUS,1,178191.32,Ford,Focus,2013
922,7064C8197RUS,2,63548.89,Volkswagen,Passat,2018
923,8437XX154RUS,2,147950.60,Ford,Focus,2003


## enrich the dataframe with the data from another dataframe

In [132]:
surname_link = 'https://drive.google.com/file/d/1uulthlFMcF1S29sxDw_9AWKXf7OhaamX/view?usp=sharing'
file_id = surname_link.split("/")[-2]
!gdown {file_id}
!ls

Downloading...
From: https://drive.google.com/uc?id=1uulthlFMcF1S29sxDw_9AWKXf7OhaamX
To: /Users/dhawkgir/ds/day05/ex04/surname.json
100%|██████████████████████████████████████| 2.62k/2.62k [00:00<00:00, 7.79MB/s]
auto.json        enrichment.ipynb surname.json


In [139]:
!head surname.json

[["NAME","COUNT","RANK"],
["ADAMS","427865","42"],
["ALLEN","482607","33"],
["ALVAREZ","233983","92"],
["ANDERSON","784404","15"],
["BAILEY","277845","72"],
["BAKER","419586","44"],
["BENNETT","247599","86"],
["BROOKS","251663","82"],
["BROWN","1437026","4"],


In [153]:
surname_file = 'surname.json'
df_sn = pd.read_json(surname_file, orient='records')
df_sn

Unnamed: 0,0,1,2
0,NAME,COUNT,RANK
1,ADAMS,427865,42
2,ALLEN,482607,33
3,ALVAREZ,233983,92
4,ANDERSON,784404,15
...,...,...,...
96,WILLIAMS,1625252,3
97,WILSON,801882,14
98,WOOD,250715,84
99,WRIGHT,458980,35


Unnamed: 0,0,1,2
0,NAME,COUNT,RANK
1,ADAMS,427865,42
2,ALLEN,482607,33
3,ALVAREZ,233983,92
4,ANDERSON,784404,15
...,...,...,...
96,WILLIAMS,1625252,3
97,WILSON,801882,14
98,WOOD,250715,84
99,WRIGHT,458980,35


## create a pivot table from the fines dataframe

## save both the fines and owners dataframes to CSV files without an index