***
# Car sales
***

### Déterminer les ventes annuelles de 2017 à 2021

<u>Méthodologie</u> : <br>  
- ventes = stock_t - stock_t-1

Note : possiblité de sous-estimer les ventes puisque certain véhicules peuvent sortir ou entrer le parc (accident, remisage, véhicules saisoniers) 
***

In [1]:
# Create folders and getting path
import os
cwd = os.getcwd()

# Loop to create folders
folder_names = ['Dataframe', 'Output', 'Data']

folders = {}
for folder_name in folder_names:
    folders[folder_name] = os.path.join(cwd, folder_name)

    if not os.path.exists(os.path.join(cwd, folder_name)):
        os.makedirs(os.path.join(cwd, folder_name))
        print(f'Le dossier « {folder_name} » a été créé')

    else:
        print(f'Le dossier « {folder_name} » est existant')


# Création des variables de path

# Déterminer si os est win ou linux pour définir les path
if os.name == 'nt':
    slash = '\\'
elif os.name == 'posix':
    slash = '/'

path_prog =     cwd + slash
path_data =     folders['Data'] + slash
path_df =       folders['Dataframe'] + slash
path_output =   folders['Output'] + slash

# Mettre \\ pour éviter les erreurs
path_dict = [path_prog, path_data, path_df, path_output]
for path in path_dict:
    path = path.replace('\\','\\\\')


# Détermination de l'année
print()
print(path_prog)
print(path_data)
print(path_df)
print(path_output)

Le dossier « Dataframe » est existant
Le dossier « Output » est existant
Le dossier « Data » est existant

c:\Users\segch001\Documents\GitHub\car_sales_predictions\
c:\Users\segch001\Documents\GitHub\car_sales_predictions\Data\
c:\Users\segch001\Documents\GitHub\car_sales_predictions\Dataframe\
c:\Users\segch001\Documents\GitHub\car_sales_predictions\Output\


***

In [24]:
import pandas as pd
pd.set_option('display.max_columns', 1000)

saaq2016 = pd.read_csv(path_data + 'vehicules-circulation-2016.csv')
saaq2017 = pd.read_csv(path_data + 'vehicule-en-circulation-2017.csv')
saaq2018 = pd.read_csv(path_data + 'vehicule-en-circulation-2018.csv')

In [29]:
# Keep light duty vehicles
saaq_df = {
    # 2016 : saaq2016, 
    2017 : saaq2017,
    2018 : saaq2018
}

for year, df in saaq_df.items():
    print('-'*25)
    print(f'SAAQ_{year}')
    print('-'*25)
    print(f'Shape initial : {df.shape}')
    print(f'Nombre d\'observation total SAAQ_{year} : {len(df)}')
    saaq_df[year] = df.loc[(df['CLAS'] == 'PAU') | (df['CLAS'] == 'CAU') | (df['CLAS'] == 'TTA')]
    print(f'Nombre d\'observation light duty SAAQ_{year}: {len(saaq_df[year])}')
    print(df.columns.tolist())

-------------------------
SAAQ_2017
-------------------------
Shape initial : (6552488, 19)
Nombre d'observation total SAAQ_2017 : 6552488
Nombre d'observation light duty SAAQ_2017: 5231017
['AN', 'NOSEQ_VEH', 'CLAS', 'TYP_VEH_CATEG_USA', 'MARQ_VEH', 'MODEL_VEH', 'ANNEE_MOD', 'MASSE_NETTE', 'NB_CYL', 'CYL_VEH', 'NB_ESIEU_MAX', 'COUL_ORIG', 'TYP_CARBU', 'TYP_DOSS_PERS', 'PHYS_SEX', 'PHYS_AGE', 'REG_ADM', 'MRC', 'CG_FIXE']
-------------------------
SAAQ_2018
-------------------------
Shape initial : (6608276, 19)
Nombre d'observation total SAAQ_2018 : 6608276
Nombre d'observation light duty SAAQ_2018: 5259496
['AN', 'NOSEQ_VEH', 'CLAS', 'TYP_VEH_CATEG_USA', 'MARQ_VEH', 'MODEL_VEH', 'ANNEE_MOD', 'MASSE_NETTE', 'NB_CYL', 'CYL_VEH', 'NB_ESIEU_MAX', 'COUL_ORIG', 'TYP_CARBU', 'TYP_DOSS_PERS', 'PHYS_SEX', 'PHYS_AGE', 'REG_ADM', 'MRC', 'CG_FIXE']


In [30]:
saaq_df[2017]['CLAS'].unique()

array(['PAU', 'CAU', 'TTA'], dtype=object)

In [31]:
# Data type
saaq_df[2017].dtypes

AN                     int64
NOSEQ_VEH             object
CLAS                  object
TYP_VEH_CATEG_USA     object
MARQ_VEH              object
MODEL_VEH             object
ANNEE_MOD            float64
MASSE_NETTE          float64
NB_CYL               float64
CYL_VEH              float64
NB_ESIEU_MAX         float64
COUL_ORIG             object
TYP_CARBU             object
TYP_DOSS_PERS         object
PHYS_SEX              object
PHYS_AGE             float64
REG_ADM               object
MRC                   object
CG_FIXE              float64
dtype: object

In [32]:
# Missing values
saaq_df[2017].isna().sum()

AN                         0
NOSEQ_VEH                  0
CLAS                       0
TYP_VEH_CATEG_USA          0
MARQ_VEH                 432
MODEL_VEH               5530
ANNEE_MOD                  0
MASSE_NETTE             1039
NB_CYL                 15855
CYL_VEH                25350
NB_ESIEU_MAX         5231017
COUL_ORIG             434993
TYP_CARBU                  2
TYP_DOSS_PERS              0
PHYS_SEX              530324
PHYS_AGE              530324
REG_ADM                21594
MRC                    21594
CG_FIXE                21594
dtype: int64

In [33]:
# Delete missing values
for year, df in saaq_df.items():
    saaq_df[year] = df.dropna(subset='MODEL_VEH')

# Pour l'instant on delete les modèle, mais éventuellement, il pourrait être intéressant de déterminer quelle marque le véhicule est avec ses caractéristiques

In [34]:
saaq_df[2017].isna().sum()

AN                         0
NOSEQ_VEH                  0
CLAS                       0
TYP_VEH_CATEG_USA          0
MARQ_VEH                   0
MODEL_VEH                  0
ANNEE_MOD                  0
MASSE_NETTE             1016
NB_CYL                 15757
CYL_VEH                20123
NB_ESIEU_MAX         5225487
COUL_ORIG             430415
TYP_CARBU                  2
TYP_DOSS_PERS              0
PHYS_SEX              529834
PHYS_AGE              529834
REG_ADM                21577
MRC                    21577
CG_FIXE                21577
dtype: int64

*** 
# Estimation des ventes

### <u>À titre de comparatif </u>

<u>Les ventes en 2017</u> <br>
selon : https://www.automedia.ca/ventes-au-quebec-annee-2017/ <br>
Voiture = 187 529 <br>
Camion léger = 274 558 <br>
Total = 462 087 <br>
<br>

<u>Plus récemment</u> <br>
selon : https://www.automedia.ca/stats-ventes-2022-fev-mars-2023/
GRAND TOTAL DES VENTES DE VÉHICULES NEUFS AU QUÉBEC : 2022 = 369982, 2023 = 406980

selon : https://www.protegez-vous.ca/nouvelles/automobile/palmares-des-ventes-de-vehicules-neufs-en-2022#:~:text=Ainsi%2C%20contrairement%20%C3%A0%202021%2C%20les,recul%20de%2010%2C9%20%25. <br>
« Ainsi, contrairement à 2021, les ventes totales de véhicules neufs ont baissé en 2022. Elles sont passées de 400 844 à 357 030 unités »
***

In [35]:
saaq_2017 = saaq_df[2017].loc[saaq_df[2017]['ANNEE_MOD'] >= 2017]
saaq_2018 = saaq_df[2018].loc[saaq_df[2018]['ANNEE_MOD'] >= 2017]
# Note : si on prend juste les années modèle >= 2017, on surestime les ventes



In [37]:
print(len(saaq_2017))
print(len(saaq_2018))

print(len(saaq2018) - len(saaq2017))

496045
914952
55788


In [38]:
# Add column key for merging
columns_to_join = ['CLAS', 'MARQ_VEH', 'MARQ_VEH', 'MARQ_VEH', 'MASSE_NETTE', 'NB_CYL', 'CYL_VEH', 'COUL_ORIG', 'TYP_CARBU']

def join_columns(row):
    return '_'.join(str(row[col]) for col in columns_to_join)

for year, df in saaq_df.items():
    saaq_df[year] = df.loc[df['ANNEE_MOD'] >= 2017]
    print(len(saaq_df[year]))
    saaq_df[year]['key'] = df.apply(join_columns, axis=1)

496045


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  saaq_df[year]['key'] = df.apply(join_columns, axis=1)


914952


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  saaq_df[year]['key'] = df.apply(join_columns, axis=1)


In [40]:
# Merge 2017 à 2018, les véhicules qui ne se retrouvent pas dans les 2 df seront les ventes
chunk_size = 100
merged_results = pd.DataFrame()

for chunk_2017, chunk_2018 in zip([saaq_df[2017][i:i+chunk_size] for i in range(0, len(saaq_df[2017]), chunk_size)],
                                   [saaq_df[2018][i:i+chunk_size] for i in range(0, len(saaq_df[2018]), chunk_size)]):
    # Merge the chunks on the 'key' column
    chunk_merged = pd.merge(chunk_2017, chunk_2018, on='key', how='outer', suffixes=('_2017', '_2018'), indicator=True)
    
    # Append the merged chunk to the results DataFrame
    merged_results = pd.concat([merged_results, chunk_merged], ignore_index=False)

merged_results


Unnamed: 0,AN_2017,NOSEQ_VEH_2017,CLAS_2017,TYP_VEH_CATEG_USA_2017,MARQ_VEH_2017,MODEL_VEH_2017,ANNEE_MOD_2017,MASSE_NETTE_2017,NB_CYL_2017,CYL_VEH_2017,NB_ESIEU_MAX_2017,COUL_ORIG_2017,TYP_CARBU_2017,TYP_DOSS_PERS_2017,PHYS_SEX_2017,PHYS_AGE_2017,REG_ADM_2017,MRC_2017,CG_FIXE_2017,key,AN_2018,NOSEQ_VEH_2018,CLAS_2018,TYP_VEH_CATEG_USA_2018,MARQ_VEH_2018,MODEL_VEH_2018,ANNEE_MOD_2018,MASSE_NETTE_2018,NB_CYL_2018,CYL_VEH_2018,NB_ESIEU_MAX_2018,COUL_ORIG_2018,TYP_CARBU_2018,TYP_DOSS_PERS_2018,PHYS_SEX_2018,PHYS_AGE_2018,REG_ADM_2018,MRC_2018,CG_FIXE_2018,_merge
0,2017.0,2017_0000000016,PAU,AU,FORD,ESCAP,2017.0,1662.0,4.0,2001.0,,ROU,E,P,F,76.0,Centre-du-Québec (17),Drummond (49 ),49030.0,PAU_FORD_FORD_FORD_1662.0_4.0_2001.0_ROU_E,,,,,,,,,,,,,,,,,,,,left_only
1,2017.0,2017_0000000022,PAU,AU,CHEVR,VOLT,2017.0,1578.0,4.0,1509.0,,NOI,W,P,M,33.0,Saguenay–Lac-Saint-Jean (02),Saguenay (941),94068.0,PAU_CHEVR_CHEVR_CHEVR_1578.0_4.0_1509.0_NOI_W,,,,,,,,,,,,,,,,,,,,left_only
2,2017.0,2017_0000000041,PAU,AU,NISSA,VERSA,2017.0,1113.0,4.0,1607.0,,GRI,E,P,F,87.0,Laurentides (15),Les Laurentides (78 ),78010.0,PAU_NISSA_NISSA_NISSA_1113.0_4.0_1607.0_GRI_E,,,,,,,,,,,,,,,,,,,,left_only
3,2017.0,2017_0000000044,PAU,AU,TOYOT,YARIS,2018.0,1050.0,4.0,1492.0,,ROU,E,P,M,57.0,Lanaudière (14),Les Moulins (64 ),64008.0,PAU_TOYOT_TOYOT_TOYOT_1050.0_4.0_1492.0_ROU_E,,,,,,,,,,,,,,,,,,,,left_only
4,2017.0,2017_0000000071,PAU,AU,NISSA,PATHF,2017.0,2018.0,6.0,3509.0,,BLA,E,P,M,24.0,Mauricie (04),Maskinongé (51 ),51015.0,PAU_NISSA_NISSA_NISSA_2018.0_6.0_3509.0_BLA_E,,,,,,,,,,,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,,,,,,,,,,,,,,,,,,,,PAU_MAZDA_MAZDA_MAZDA_1621.0_4.0_2493.0_NOI_E,2018.0,2018_0003580982,PAU,AU,MAZDA,CX-5,2018.0,1621.0,4.0,2493.0,,NOI,E,P,F,52.0,Saguenay–Lac-Saint-Jean (02),Saguenay (941),94068.0,right_only
140,,,,,,,,,,,,,,,,,,,,PAU_LEXUS_LEXUS_LEXUS_1755.0_4.0_2001.0_NOI_E,2018.0,2018_0003580993,PAU,AU,LEXUS,NX,2017.0,1755.0,4.0,2001.0,,NOI,E,P,M,62.0,Estrie (05),Sherbrooke (43 ),43027.0,right_only
141,,,,,,,,,,,,,,,,,,,,PAU_TOYOT_TOYOT_TOYOT_1050.0_4.0_1492.0_ARG_E,2018.0,2018_0003581005,PAU,AU,TOYOT,YARIS,2018.0,1050.0,4.0,1492.0,,ARG,E,P,M,66.0,Montréal (06),Montréal (66 ),66058.0,right_only
142,,,,,,,,,,,,,,,,,,,,PAU_FORD_FORD_FORD_1892.0_6.0_2706.0_NOI_E,2018.0,2018_0003581008,PAU,AU,FORD,EDGE,2018.0,1892.0,6.0,2706.0,,NOI,E,P,M,61.0,Bas-Saint-Laurent (01),Témiscouata (13 ),13090.0,right_only


In [45]:
vente2018 = merged_results.loc[merged_results['_merge'] == 'right_only']

In [55]:
columns_to_keep = vente2018.columns[~vente2018.columns.str.endswith('_2017')]
vente2018_filtered = vente2018[columns_to_keep].drop(columns='key').reset_index(drop=True)

In [56]:
vente2018_filtered

Unnamed: 0,AN_2018,NOSEQ_VEH_2018,CLAS_2018,TYP_VEH_CATEG_USA_2018,MARQ_VEH_2018,MODEL_VEH_2018,ANNEE_MOD_2018,MASSE_NETTE_2018,NB_CYL_2018,CYL_VEH_2018,NB_ESIEU_MAX_2018,COUL_ORIG_2018,TYP_CARBU_2018,TYP_DOSS_PERS_2018,PHYS_SEX_2018,PHYS_AGE_2018,REG_ADM_2018,MRC_2018,CG_FIXE_2018,_merge
0,2018.0,2018_0000000011,PAU,AU,BMW,X3,2018.0,1885.0,4.0,2001.0,,BLA,E,P,F,50.0,Capitale-Nationale (03),La Jacques-Cartier (22 ),22015.0,right_only
1,2018.0,2018_0000000020,PAU,AU,FORD,F150,2018.0,2150.0,6.0,2706.0,,NOI,E,P,M,34.0,Lanaudière (14),Joliette (61 ),61050.0,right_only
2,2018.0,2018_0000000023,PAU,AU,NISSA,KICKS,2019.0,1430.0,4.0,1607.0,,ROU,E,P,M,48.0,Estrie (05),Sherbrooke (43 ),43027.0,right_only
3,2018.0,2018_0000000030,PAU,AU,ACURA,RDX,2018.0,1703.0,6.0,3509.0,,NOI,E,P,M,28.0,Capitale-Nationale (03),Québec (23 ),23027.0,right_only
4,2018.0,2018_0000000033,PAU,AU,HYUND,SANTA,2017.0,1681.0,4.0,2361.0,,GRI,E,P,F,33.0,Bas-Saint-Laurent (01),Rimouski-Neigette (10 ),10043.0,right_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471068,2018.0,2018_0003580982,PAU,AU,MAZDA,CX-5,2018.0,1621.0,4.0,2493.0,,NOI,E,P,F,52.0,Saguenay–Lac-Saint-Jean (02),Saguenay (941),94068.0,right_only
471069,2018.0,2018_0003580993,PAU,AU,LEXUS,NX,2017.0,1755.0,4.0,2001.0,,NOI,E,P,M,62.0,Estrie (05),Sherbrooke (43 ),43027.0,right_only
471070,2018.0,2018_0003581005,PAU,AU,TOYOT,YARIS,2018.0,1050.0,4.0,1492.0,,ARG,E,P,M,66.0,Montréal (06),Montréal (66 ),66058.0,right_only
471071,2018.0,2018_0003581008,PAU,AU,FORD,EDGE,2018.0,1892.0,6.0,2706.0,,NOI,E,P,M,61.0,Bas-Saint-Laurent (01),Témiscouata (13 ),13090.0,right_only


In [58]:
vente2018_filtered['NOSEQ_VEH_2018'].duplicated

<bound method Series.duplicated of 0         2018_0000000011
1         2018_0000000020
2         2018_0000000023
3         2018_0000000030
4         2018_0000000033
               ...       
471068    2018_0003580982
471069    2018_0003580993
471070    2018_0003581005
471071    2018_0003581008
471072    2018_0003581013
Name: NOSEQ_VEH_2018, Length: 471073, dtype: object>

In [None]:
saaq_years = range(2016,2021 + 1)

for year in saaq_years: 
    sales.to_hdf(path_df + f'sales{year}')