***
# Merge data
***

In [1]:
# Create folders and getting path
import os
cwd = os.getcwd()

# Loop to create folders
folder_names = ['Dataframe', 'Output', 'Data']

folders = {}
for folder_name in folder_names:
    folders[folder_name] = os.path.join(cwd, folder_name)

    if not os.path.exists(os.path.join(cwd, folder_name)):
        os.makedirs(os.path.join(cwd, folder_name))
        print(f'Le dossier « {folder_name} » a été créé')

    else:
        print(f'Le dossier « {folder_name} » est existant')


# Création des variables de path

# Déterminer si os est win ou linux pour définir les path
if os.name == 'nt':
    slash = '\\'
elif os.name == 'posix':
    slash = '/'

path_prog =     cwd + slash
path_data =     folders['Data'] + slash
path_df =       folders['Dataframe'] + slash
path_output =   folders['Output'] + slash

# Mettre \\ pour éviter les erreurs
path_dict = [path_prog, path_data, path_df, path_output]
for path in path_dict:
    path = path.replace('\\','\\\\')


# Détermination de l'année
print()
print(path_prog)
print(path_data)
print(path_df)
print(path_output)

Le dossier « Dataframe » est existant
Le dossier « Output » est existant
Le dossier « Data » est existant

c:\Users\Charles_tour\Documents\GitHub\car_sales_forcast\
c:\Users\Charles_tour\Documents\GitHub\car_sales_forcast\Data\
c:\Users\Charles_tour\Documents\GitHub\car_sales_forcast\Dataframe\
c:\Users\Charles_tour\Documents\GitHub\car_sales_forcast\Output\


In [2]:
import concurrent.futures
import numpy as np
import pandas as pd
import re
import time

pd.set_option('display.max_columns', None)

In [7]:
# Import data
scrap_data = pd.read_hdf(path_df + 'scrap_data.h5')

# Scrap_data key column
key_col = ['brand', 'model', 'year']

scrap_data['merge_key'] = scrap_data[key_col].apply(lambda x: x.str.upper()).apply(lambda x: '_'.join(x), axis=1)
scrap_data.head(3)

Unnamed: 0,link_model_an,year,model,brand,prix_min,prix_max,cons_min,cons_max,merge_key
0,https://www.guideautoweb.com/constructeurs/acu...,2023,integra,acura,34350.0,42550.0,6.3,8.9,ACURA_INTEGRA_2023
1,https://www.guideautoweb.com/constructeurs/acu...,2023,mdx,acura,59300.0,84400.0,9.4,13.8,ACURA_MDX_2023
2,https://www.guideautoweb.com/constructeurs/acu...,2022,mdx,acura,57900.0,83000.0,9.4,13.8,ACURA_MDX_2022


In [3]:
# Import sales data
years = range(2017,2022)

df_dict = {}

for year in years:
    df_dict[year] = pd.read_hdf(path_df + f'vente_data_{year}.h5')

In [5]:
# Colonne key pour df vente
key_col = ['MARQ_VEH', 'MODEL_VEH', 'ANNEE_MOD']

for year in years:
    df_dict[year]['ANNEE_MOD'] = df_dict[year]['ANNEE_MOD'].astype(int)
    df_dict[year]['merge_key'] = df_dict[year][key_col].astype(str).apply(lambda x: '.*'.join(x), axis=1)

df_dict[2017].head(2)

Unnamed: 0,AN,NOSEQ_VEH,CLAS,TYP_VEH_CATEG_USA,MARQ_VEH,MODEL_VEH,ANNEE_MOD,MASSE_NETTE,NB_CYL,CYL_VEH,NB_ESIEU_MAX,COUL_ORIG,TYP_CARBU,TYP_DOSS_PERS,PHYS_SEX,PHYS_AGE,REG_ADM,MRC,CG_FIXE,merge_key
0,2017,2017_0000000016,PAU,AU,FORD,ESCAP,2017,1662.0,4.0,2001.0,,ROU,E,P,F,76.0,Centre-du-Québec (17),Drummond (49 ),49030.0,FORD.*ESCAP.*2017
1,2017,2017_0000000022,PAU,AU,CHEVR,VOLT,2017,1578.0,4.0,1509.0,,NOI,W,P,M,33.0,Saguenay–Lac-Saint-Jean (02),Saguenay (941),94068.0,CHEVR.*VOLT.*2017


In [8]:
# fonction qui associe une valeur clef du df scrap pour chaque observations des ventes SAAQ
def find_best_match(df_to_match):
    matche = [x for x in scrap_data['merge_key'].values if re.search(df_to_match, x)]
    return matche[0] if matche else None

# Fonction qui calcul le temps pour rouler un df
def time_duration(start_time):
    end_time = time.time()
    duration = (end_time - start_time)/60
    print(f'{round(duration), 0} min')


with concurrent.futures.ThreadPoolExecutor() as executor:
    for year in years:
        start_time = time.time()
        df_dict[year]['match'] = list(executor.map(find_best_match, df_dict[year]['merge_key']))
        print(f'Time for df {year} : {time_duration(start_time)}')
# 15m

14.269578619798024 min
27.929734031359356 min
41.06696108182271 min
52.21571315129598 min
64.27015091180802 min


In [9]:
df_dict[2017].head(2)

Unnamed: 0,AN,NOSEQ_VEH,CLAS,TYP_VEH_CATEG_USA,MARQ_VEH,MODEL_VEH,ANNEE_MOD,MASSE_NETTE,NB_CYL,CYL_VEH,NB_ESIEU_MAX,COUL_ORIG,TYP_CARBU,TYP_DOSS_PERS,PHYS_SEX,PHYS_AGE,REG_ADM,MRC,CG_FIXE,merge_key,match
0,2017,2017_0000000016,PAU,AU,FORD,ESCAP,2017,1662.0,4.0,2001.0,,ROU,E,P,F,76.0,Centre-du-Québec (17),Drummond (49 ),49030.0,FORD.*ESCAP.*2017,FORD_ESCAPE_2017
1,2017,2017_0000000022,PAU,AU,CHEVR,VOLT,2017,1578.0,4.0,1509.0,,NOI,W,P,M,33.0,Saguenay–Lac-Saint-Jean (02),Saguenay (941),94068.0,CHEVR.*VOLT.*2017,CHEVROLET_VOLT_2017


In [10]:
# Nombre de match
print('STATISTIQUE DES MATCHES')

for year in years:
    nb_observation = len(df_dict[year])
    nb_match = len(df_dict[year].loc[df_dict[year]['match']!=None])
    non_match = nb_observation - nb_match
    
    print('-' * 40)
    print(f'Année : {year}')
    print(f'Nombre de vente : {nb_observation}')
    print(f'Nombre de match : {nb_match}')
    print(f'Nombre d\'observation non matchée : {non_match}')

STATISTIQUE DES MATCHES
----------------------------------------
Année : 2017
Nombre de vente : 496045
Nombre de match : 496045
Nombre d'observation non matchée : 0
----------------------------------------
Année : 2018
Nombre de vente : 477643
Nombre de match : 477643
Nombre d'observation non matchée : 0
----------------------------------------
Année : 2019
Nombre de vente : 458341
Nombre de match : 458341
Nombre d'observation non matchée : 0
----------------------------------------
Année : 2020
Nombre de vente : 389059
Nombre de match : 389059
Nombre d'observation non matchée : 0
----------------------------------------
Année : 2021
Nombre de vente : 421287
Nombre de match : 421287
Nombre d'observation non matchée : 0


In [11]:
# Merge

merge_data = {}
for year in years:
    merge_data[year] = df_dict[year].merge(scrap_data, how='left', left_on='match', right_on='merge_key', indicator=True)
merge_data[2017].head(2)

Unnamed: 0,AN,NOSEQ_VEH,CLAS,TYP_VEH_CATEG_USA,MARQ_VEH,MODEL_VEH,ANNEE_MOD,MASSE_NETTE,NB_CYL,CYL_VEH,NB_ESIEU_MAX,COUL_ORIG,TYP_CARBU,TYP_DOSS_PERS,PHYS_SEX,PHYS_AGE,REG_ADM,MRC,CG_FIXE,merge_key_x,match,link_model_an,year,model,brand,prix_min,prix_max,cons_min,cons_max,merge_key_y,_merge
0,2017,2017_0000000016,PAU,AU,FORD,ESCAP,2017,1662.0,4.0,2001.0,,ROU,E,P,F,76.0,Centre-du-Québec (17),Drummond (49 ),49030.0,FORD.*ESCAP.*2017,FORD_ESCAPE_2017,https://www.guideautoweb.com/constructeurs/for...,2017,escape,ford,25099.0,35999.0,7.8,11.5,FORD_ESCAPE_2017,both
1,2017,2017_0000000022,PAU,AU,CHEVR,VOLT,2017,1578.0,4.0,1509.0,,NOI,W,P,M,33.0,Saguenay–Lac-Saint-Jean (02),Saguenay (941),94068.0,CHEVR.*VOLT.*2017,CHEVROLET_VOLT_2017,https://www.guideautoweb.com/constructeurs/che...,2017,volt,chevrolet,38790.0,42890.0,5.5,5.6,CHEVROLET_VOLT_2017,both
2,2017,2017_0000000041,PAU,AU,NISSA,VERSA,2017,1113.0,4.0,1607.0,,GRI,E,P,F,87.0,Laurentides (15),Les Laurentides (78 ),78010.0,NISSA.*VERSA.*2017,NISSAN_VERSA-NOTE_2017,https://www.guideautoweb.com/constructeurs/nis...,2017,versa-note,nissan,14498.0,19748.0,6.2,8.6,NISSAN_VERSA-NOTE_2017,both
3,2017,2017_0000000044,PAU,AU,TOYOT,YARIS,2018,1050.0,4.0,1492.0,,ROU,E,P,M,57.0,Lanaudière (14),Les Moulins (64 ),64008.0,TOYOT.*YARIS.*2018,TOYOTA_YARIS_2018,https://www.guideautoweb.com/constructeurs/toy...,2018,yaris,toyota,15490.0,20530.0,5.8,7.9,TOYOTA_YARIS_2018,both
4,2017,2017_0000000071,PAU,AU,NISSA,PATHF,2017,2018.0,6.0,3509.0,,BLA,E,P,M,24.0,Mauricie (04),Maskinongé (51 ),51015.0,NISSA.*PATHF.*2017,NISSAN_PATHFINDER_2017,https://www.guideautoweb.com/constructeurs/nis...,2017,pathfinder,nissan,32598.0,48598.0,8.5,12.4,NISSAN_PATHFINDER_2017,both


In [24]:
# Clean, concat and save
df_unique = pd.DataFrame()

for year in years : 
    merge_data[year] = merge_data[year].drop(columns=['merge_key_x', 'match', 'merge_key_y', '_merge'])
    df_unique = pd.concat([df_unique, merge_data[year]], axis=0)

df_unique.to_hdf(path_df + f'df_unique.h5', key='s')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->Index(['NOSEQ_VEH', 'CLAS', 'TYP_VEH_CATEG_USA', 'MARQ_VEH', 'MODEL_VEH',
       'COUL_ORIG', 'TYP_CARBU', 'TYP_DOSS_PERS', 'PHYS_SEX', 'REG_ADM', 'MRC',
       'link_model_an', 'year', 'model', 'brand'],
      dtype='object')]

  df_unique.to_hdf(path_df + f'df_unique.h5', key='s')
