## Exercise 04 : Enrichment and transformations

Import libraries

In [141]:
import pandas as pd
import numpy as np
import requests

pd.options.display.float_format = "{:.2f}".format  # float precision setting
np.random.seed(21)  # random seed = 21 for all jupyter blocks

* read the JSON file that you saved in ex02

  * one of the columns has the float type, so let us define the format of it in
  pandas using pd.options.display.float_format: floats should be displayed with
  two decimals
  * there are values missing from the Model, do not do anything with them

In [142]:
json_file_path = "../data/auto.json"
df = pd.read_json(path_or_buf=json_file_path)
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2,1600.00,Ford,Focus
721,M0309X197RUS,1,22300.00,Ford,Focus
722,O673E8197RUS,2,600.00,Ford,Focus
723,8610T8154RUS,1,2000.00,Ford,Focus


* enrich the dataframe using a sample from that dataframe
  * create a sample with 200 new observations with random_state = 21 
    * the sample should not have new combinations of the car number,
    make and model, so the whole dataset will be consistent in these terms
    * there are no restrictions on the refund and fines, you can take
    any value
    from these columns at random and use it towards any car number
  * concatenate the sample with the initial dataframe to a new dataframe concat_rows

In [143]:
sample_df = df[["CarNumber", "Make", "Model"]].sample(n=200, random_state=21)
sample_df["Refund"] = np.random.randint(low=1, high=6, size=200)
sample_df["Fines"] = np.random.uniform(low=1, high=500_001, size=200)
display(sample_df)

Unnamed: 0,CarNumber,Make,Model,Refund,Fines
445,M0299X197RUS,Ford,Focus,2,118765.20
22,83298C154RUS,Ford,Focus,1,119023.89
93,H957HY161RUS,Ford,Focus,5,303358.47
173,T941CC96RUS,Ford,Focus,1,168814.70
697,H966HY161RUS,Ford,Focus,1,87274.01
...,...,...,...,...,...
14,8182XX154RUS,Ford,Focus,4,332476.61
623,X796TH96RUS,Ford,Focus,4,82111.34
498,T011MY163RUS,Ford,Focus,4,115183.48
536,T341CC96RUS,Volkswagen,Passat,5,101974.89


In [144]:
concat_rows = pd.concat(
    objs=[df, sample_df], ignore_index=True
)  # ignore_index=True — creates a new numeric index from 0 to concat_rows.shape[0] - 1.
display(concat_rows)

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
920,8182XX154RUS,4,332476.61,Ford,Focus
921,X796TH96RUS,4,82111.34,Ford,Focus
922,T011MY163RUS,4,115183.48,Ford,Focus
923,T341CC96RUS,5,101974.89,Volkswagen,Passat


* enrich the dataframe concat_rows by a new column with the data generated 
  * create a series with the name Year using random integers from 1980 to 2019
  * use np.random.seed(21) before generating the years
  * concatenate the series with the dataframe and name it fines

In [145]:
concat_df_rows_count = concat_rows.shape[0]
years = pd.Series(np.random.randint(low=1980, high=2019 + 1, size=concat_df_rows_count))
fines = pd.concat(objs=[concat_rows, years.rename("Year")], axis=1)
display(fines)

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,2009
1,E432XX77RUS,1,6500.00,Toyota,Camry,2010
2,7184TT36RUS,1,2100.00,Ford,Focus,1990
3,X582HE161RUS,2,2000.00,Ford,Focus,2004
4,92918M178RUS,1,5700.00,Ford,Focus,1990
...,...,...,...,...,...,...
920,8182XX154RUS,4,332476.61,Ford,Focus,2012
921,X796TH96RUS,4,82111.34,Ford,Focus,2006
922,T011MY163RUS,4,115183.48,Ford,Focus,1985
923,T341CC96RUS,5,101974.89,Volkswagen,Passat,1986


* enrich the dataframe with the data from another dataframe
  * create a new dataframe with the car numbers and their owners
      * get the most popular surnames (**you can find the file [surname.json](../../datasets/surname.json) in the attachments**) in the US
      * create a new series with the surnames (they should not have
      special characters like commas, brackets, etc.) from the data you gathered, the count
      should be equal to the number of unique car numbers using the sample
      (use random_state = 21)
      * create the dataframe owners with 2 columns: CarNumber and
SURNAME 

In [146]:
data_file = "../../datasets/surname.json"
unique_car_numbers = fines["CarNumber"].unique()
n_unique = len(unique_car_numbers)


file_series = pd.read_json(path_or_buf=data_file, typ="series")
most_popular_surnames = file_series.iloc[1:].apply(
    lambda x: x[0]
)  # .iloc[1:] skips first row, which is header
random_surnames = pd.Series(np.random.choice(most_popular_surnames, size=n_unique))


owners_df = pd.DataFrame({"CarNumber": unique_car_numbers, "SURNAME": random_surnames})
display(owners_df)

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,HERNANDEZ
1,E432XX77RUS,HILL
2,7184TT36RUS,RIVERA
3,X582HE161RUS,RAMIREZ
4,92918M178RUS,YOUNG
...,...,...
526,O136HO197RUS,MORGAN
527,O22097197RUS,BENNETT
528,M0309X197RUS,COOPER
529,O673E8197RUS,RUIZ


  * append 5 more observations to the fines dataframe (come up with your own
ideas of CarNumber, etc.)

In [147]:
new_observations_fines = pd.DataFrame(
    {
        "CarNumber": ["NEW001", "NEW002", "NEW003", "NEW004", "NEW005"],
        "Refund": np.random.randint(low=1, high=5, size=5),
        "Fines": np.random.uniform(low=50, high=5000, size=5),
        "Make": ["Ford", "Toyota", "Honda", "Chevrolet", "Nissan"],
        "Model": ["Fiesta", "Corolla", "Civic", "Impala", "Altima"],
        "Year": np.random.randint(low=2000, high=2026, size=5),
    }
)

fines = pd.concat([fines, new_observations_fines], ignore_index=True)
display(fines)

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,2009
1,E432XX77RUS,1,6500.00,Toyota,Camry,2010
2,7184TT36RUS,1,2100.00,Ford,Focus,1990
3,X582HE161RUS,2,2000.00,Ford,Focus,2004
4,92918M178RUS,1,5700.00,Ford,Focus,1990
...,...,...,...,...,...,...
925,NEW001,1,4373.09,Ford,Fiesta,2012
926,NEW002,2,3034.48,Toyota,Corolla,2008
927,NEW003,3,4288.84,Honda,Civic,2012
928,NEW004,1,4738.31,Chevrolet,Impala,2004


  * delete the dataframe last 20 observations from the owners and add 3 new
observations (they are not the same as those you add to the fines dataframe)

In [148]:
owners_df.drop(owners_df.index[-20:], inplace=True)
print("Owners DataFrame without last 20 records:")
display(owners_df)

new_observations_owners = pd.DataFrame(
    {
        "CarNumber": ["666RUS", "777RUS", "888RUS"],
        "SURNAME": ["MASK", "JOBS", "TORVALDS"],
    }
)
print("New 3 observations for owners DataFrame:")
display(new_observations_owners)

print("Owners DataFrame with new records:")
owners_df = pd.concat([owners_df, new_observations_owners], ignore_index=True)
display(owners_df)

Owners DataFrame without last 20 records:


Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,HERNANDEZ
1,E432XX77RUS,HILL
2,7184TT36RUS,RIVERA
3,X582HE161RUS,RAMIREZ
4,92918M178RUS,YOUNG
...,...,...
506,T914CT197RUS,ORTIZ
507,E41977152RUS,ALLEN
508,9464EX178RUS,WILLIAMS
509,O50197197RUS,MORRIS


New 3 observations for owners DataFrame:


Unnamed: 0,CarNumber,SURNAME
0,666RUS,MASK
1,777RUS,JOBS
2,888RUS,TORVALDS


Owners DataFrame with new records:


Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,HERNANDEZ
1,E432XX77RUS,HILL
2,7184TT36RUS,RIVERA
3,X582HE161RUS,RAMIREZ
4,92918M178RUS,YOUNG
...,...,...
509,O50197197RUS,MORRIS
510,7608EE777RUS,EVANS
511,666RUS,MASK
512,777RUS,JOBS


  * join both dataframes:
    * the new dataframe should have only the car numbers that exist in
both dataframes

In [149]:
result_1 = pd.merge(fines, owners_df, on="CarNumber", how="inner")
display(result_1)

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,2009,HERNANDEZ
1,E432XX77RUS,1,6500.00,Toyota,Camry,2010,HILL
2,7184TT36RUS,1,2100.00,Ford,Focus,1990,RIVERA
3,X582HE161RUS,2,2000.00,Ford,Focus,2004,RAMIREZ
4,92918M178RUS,1,5700.00,Ford,Focus,1990,YOUNG
...,...,...,...,...,...,...,...
894,8182XX154RUS,4,332476.61,Ford,Focus,2012,TORRES
895,X796TH96RUS,4,82111.34,Ford,Focus,2006,MARTIN
896,T011MY163RUS,4,115183.48,Ford,Focus,1985,EDWARDS
897,T341CC96RUS,5,101974.89,Volkswagen,Passat,1986,KIM


* the new dataframe should have all the car numbers that exist in
both
dataframes 

In [150]:
result_2 = pd.merge(fines, owners_df, on="CarNumber", how="outer")
display(result_2)

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,666RUS,,,,,,MASK
1,704687163RUS,2.00,1400.00,Ford,Focus,2005.00,MYERS
2,704787163RUS,2.00,2800.00,Ford,Focus,1995.00,HARRIS
3,704987163RUS,2.00,8594.59,Ford,Focus,1988.00,MORRIS
4,705287163RUS,2.00,2000.00,Ford,Focus,1990.00,PRICE
...,...,...,...,...,...,...,...
928,Y973O8197RUS,2.00,8594.59,Ford,Focus,1987.00,HALL
929,Y973O8197RUS,1.00,34800.00,Ford,Focus,1981.00,HALL
930,Y973O8197RUS,1.00,69600.00,Ford,Focus,2002.00,HALL
931,Y973O8197RUS,2.00,35390.72,Ford,Focus,2001.00,HALL


* the new dataframe should have only the car numbers from the
fines dataframe 

In [151]:
result_3 = pd.merge(fines, owners_df, on="CarNumber", how="left")
display(result_3)

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,2009,HERNANDEZ
1,E432XX77RUS,1,6500.00,Toyota,Camry,2010,HILL
2,7184TT36RUS,1,2100.00,Ford,Focus,1990,RIVERA
3,X582HE161RUS,2,2000.00,Ford,Focus,2004,RAMIREZ
4,92918M178RUS,1,5700.00,Ford,Focus,1990,YOUNG
...,...,...,...,...,...,...,...
925,NEW001,1,4373.09,Ford,Fiesta,2012,
926,NEW002,2,3034.48,Toyota,Corolla,2008,
927,NEW003,3,4288.84,Honda,Civic,2012,
928,NEW004,1,4738.31,Chevrolet,Impala,2004,


* the new dataframe should have only the car numbers from the
owners
dataframe

In [152]:
result_4 = pd.merge(fines, owners_df, on="CarNumber", how="right")
display(result_4)

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,2009.00,HERNANDEZ
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,1992.00,HERNANDEZ
2,E432XX77RUS,1.00,6500.00,Toyota,Camry,2010.00,HILL
3,E432XX77RUS,2.00,13000.00,Toyota,Camry,1985.00,HILL
4,7184TT36RUS,1.00,2100.00,Ford,Focus,1990.00,RIVERA
...,...,...,...,...,...,...,...
897,7608EE777RUS,1.00,4000.00,Skoda,Octavia,1982.00,EVANS
898,7608EE777RUS,4.00,202622.41,Skoda,Octavia,2014.00,EVANS
899,666RUS,,,,,,MASK
900,777RUS,,,,,,JOBS


* create a pivot table from the fines dataframe, it should look like this (the values are
the sums of the fines), but with all the years (the values may be different for you):

In [153]:
pivot = pd.pivot_table(
    data=fines, index=["Make", "Model"], columns="Year", values="Fines", aggfunc="sum"
)
display(pivot)

Unnamed: 0_level_0,Year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2025
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Chevrolet,Impala,,,,,,,,,,,...,,,,,,,,,,
Ford,Fiesta,,,,,,,,,,,...,,4373.09,,,,,,,,
Ford,Focus,1608367.64,753564.99,363286.83,1382278.12,908341.92,2728401.44,1897421.63,1084276.12,1229024.69,418223.0,...,1074990.18,2105857.88,1533380.28,1192185.95,1410936.16,573854.61,1877233.56,900672.86,24600.0,
Ford,Mondeo,,,,,2200.0,,,,,,...,34400.0,,,,,,,,6700.0,
Honda,Civic,,,,,,,,,,,...,,4288.84,,,,,,,,
Nissan,Altima,,,,,,,,,,,...,,,,,,,,,,2635.47
Skoda,Octavia,500.0,49394.59,456529.62,8594.59,500.0,1200.0,5500.0,,5700.0,,...,500.0,,14594.59,202622.41,,,1000.0,27494.59,8500.0,
Toyota,Camry,185379.12,7200.0,8594.59,1000.0,,13000.0,800.0,,382444.07,500.0,...,10600.0,12000.0,,,,7500.0,,,1000.0,
Toyota,Corolla,,,15800.0,,164462.01,,,,,,...,8594.59,,6800.0,,7600.0,,35200.0,352610.68,,
Volkswagen,Golf,,,,500.0,,,200.0,300.0,,,...,,497180.26,9300.0,,179239.16,31900.0,168000.0,,4600.0,


* save both the fines and owners dataframes to CSV files without an index

In [154]:
fines_output_file_path = "../data/fines.csv"
owners_output_file_path = "../data/owners.csv"

fines.to_csv(path_or_buf=fines_output_file_path, sep=",", index=False)
owners_df.to_csv(path_or_buf=owners_output_file_path, sep=",", index=False)