# Packages

In [1]:
import pandas as pd
import wandb




# Weights and Biases

In [2]:
def log_dataset_to_wandb(dataset_name, file_path):
    """
    Logs a dataset file to Weights and Biases as an artifact.
    Args:
        dataset_name (str): Name of the dataset artifact to create in Weights and Biases.
        file_path (str): Path to the local dataset file.
    """
    WndbSwitch = False
    if not WndbSwitch:
        return
    run = wandb.init(project="GNNDemandForecasting", job_type="dataset-creation")

    # Create a new artifact for a dataset
    artifact_log = wandb.Artifact(dataset_name, type='dataset')
    artifact_log.add_file(file_path)

    run.log_artifact(artifact_log)

    wandb.finish()

# Merge Files

In [3]:
holiday_events_path = "../source_files/holidays_events.csv"
oil_path = "../source_files/oil.csv"
stores_path = "../source_files/stores.csv"
test_path = "../source_files/test.csv"
train_path = "../source_files/train.csv"

In [4]:
holiday_events_df = pd.read_csv(holiday_events_path)
holiday_events_df = holiday_events_df.rename(columns={'type': 'day_type'})
holiday_events_df = holiday_events_df.rename(columns={'locale': 'day_locale'})
holiday_events_df = holiday_events_df.rename(columns={'locale_name': 'day_locale_name'})
holiday_events_df = holiday_events_df.rename(columns={'description': 'day_description'})
holiday_events_df = holiday_events_df.rename(columns={'transferred': 'day_transferred'})

holiday_events_df.head()

Unnamed: 0,date,day_type,day_locale,day_locale_name,day_description,day_transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


In [5]:
oil_df = pd.read_csv(oil_path)
oil_df = oil_df.rename(columns={'dcoilwtico': 'oil_price'})

oil_df.head()

Unnamed: 0,date,oil_price
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [6]:
stores_df = pd.read_csv(stores_path)
stores_df = stores_df.rename(columns={'city': 'store_city'})
stores_df = stores_df.rename(columns={'state': 'store_state'})
stores_df = stores_df.rename(columns={'type': 'store_type'})
stores_df = stores_df.rename(columns={'cluster': 'store_cluster'})

stores_df.head()

Unnamed: 0,store_nbr,store_city,store_state,store_type,store_cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [7]:
test_df = pd.read_csv(test_path)
test_df.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


In [8]:
train_df = pd.read_csv(train_path)
train_df

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [9]:
def merge_dfs(df):
    df = df.merge(holiday_events_df, how='left', on='date')
    df = df.merge(oil_df, how='left', on='date')
    df = df.merge(stores_df, how='left', on='store_nbr')
    return df
    

In [10]:
train_merged = merge_dfs(train_df)
train_merged.to_csv("../processed_files/train_merged.csv", index=False)
# log_dataset_to_wandb("train_merged", "../processed_files/train_merged.csv")
train_merged.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,day_type,day_locale,day_locale_name,day_description,day_transferred,oil_price,store_city,store_state,store_type,store_cluster
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,Holiday,National,Ecuador,Primer dia del ano,False,,Quito,Pichincha,D,13
1,1,2013-01-01,1,BABY CARE,0.0,0,Holiday,National,Ecuador,Primer dia del ano,False,,Quito,Pichincha,D,13
2,2,2013-01-01,1,BEAUTY,0.0,0,Holiday,National,Ecuador,Primer dia del ano,False,,Quito,Pichincha,D,13
3,3,2013-01-01,1,BEVERAGES,0.0,0,Holiday,National,Ecuador,Primer dia del ano,False,,Quito,Pichincha,D,13
4,4,2013-01-01,1,BOOKS,0.0,0,Holiday,National,Ecuador,Primer dia del ano,False,,Quito,Pichincha,D,13


In [11]:
test_merged = merge_dfs(test_df)
test_merged.to_csv("../processed_files/test_merged.csv", index=False)
# log_dataset_to_wandb("test_merged", "../processed_files/test_merged.csv")
test_merged.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion,day_type,day_locale,day_locale_name,day_description,day_transferred,oil_price,store_city,store_state,store_type,store_cluster
0,3000888,2017-08-16,1,AUTOMOTIVE,0,,,,,,46.8,Quito,Pichincha,D,13
1,3000889,2017-08-16,1,BABY CARE,0,,,,,,46.8,Quito,Pichincha,D,13
2,3000890,2017-08-16,1,BEAUTY,2,,,,,,46.8,Quito,Pichincha,D,13
3,3000891,2017-08-16,1,BEVERAGES,20,,,,,,46.8,Quito,Pichincha,D,13
4,3000892,2017-08-16,1,BOOKS,0,,,,,,46.8,Quito,Pichincha,D,13


# Preprocess
- Remove nan

## Impute NaN

In [None]:
train_merged.dropna(inplace=True)
train_merged.to_csv("../processed_files/train_merged.csv", index=False)
log_dataset_to_wandb("train_merged", "../processed_files/train_merged.csv")
train_merged

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,day_type,day_locale,day_locale_name,day_description,day_transferred,oil_price,store_city,store_state,store_type,store_cluster
73062,73062,2013-02-11,1,AUTOMOTIVE,0.000,0,Holiday,National,Ecuador,Carnaval,False,97.01,Quito,Pichincha,D,13
73063,73063,2013-02-11,1,BABY CARE,0.000,0,Holiday,National,Ecuador,Carnaval,False,97.01,Quito,Pichincha,D,13
73064,73064,2013-02-11,1,BEAUTY,0.000,0,Holiday,National,Ecuador,Carnaval,False,97.01,Quito,Pichincha,D,13
73065,73065,2013-02-11,1,BEVERAGES,172.000,0,Holiday,National,Ecuador,Carnaval,False,97.01,Quito,Pichincha,D,13
73066,73066,2013-02-11,1,BOOKS,0.000,0,Holiday,National,Ecuador,Carnaval,False,97.01,Quito,Pichincha,D,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3054343,3000883,2017-08-15,9,POULTRY,438.133,0,Holiday,Local,Riobamba,Fundacion de Riobamba,False,47.57,Quito,Pichincha,B,6
3054344,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,Holiday,Local,Riobamba,Fundacion de Riobamba,False,47.57,Quito,Pichincha,B,6
3054345,3000885,2017-08-15,9,PRODUCE,2419.729,148,Holiday,Local,Riobamba,Fundacion de Riobamba,False,47.57,Quito,Pichincha,B,6
3054346,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,Holiday,Local,Riobamba,Fundacion de Riobamba,False,47.57,Quito,Pichincha,B,6


In [16]:
test_merged.dropna(inplace=True)
test_merged.to_csv("../processed_files/test_merged.csv", index=False)
log_dataset_to_wandb("test_merged", "../processed_files/test_merged.csv")
test_merged

Unnamed: 0,id,date,store_nbr,family,onpromotion,day_type,day_locale,day_locale_name,day_description,day_transferred,oil_price,store_city,store_state,store_type,store_cluster
14256,3015144,2017-08-24,1,AUTOMOTIVE,0,Holiday,Local,Ambato,Fundacion de Ambato,False,47.24,Quito,Pichincha,D,13
14257,3015145,2017-08-24,1,BABY CARE,0,Holiday,Local,Ambato,Fundacion de Ambato,False,47.24,Quito,Pichincha,D,13
14258,3015146,2017-08-24,1,BEAUTY,0,Holiday,Local,Ambato,Fundacion de Ambato,False,47.24,Quito,Pichincha,D,13
14259,3015147,2017-08-24,1,BEVERAGES,26,Holiday,Local,Ambato,Fundacion de Ambato,False,47.24,Quito,Pichincha,D,13
14260,3015148,2017-08-24,1,BOOKS,0,Holiday,Local,Ambato,Fundacion de Ambato,False,47.24,Quito,Pichincha,D,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16033,3016921,2017-08-24,9,POULTRY,0,Holiday,Local,Ambato,Fundacion de Ambato,False,47.24,Quito,Pichincha,B,6
16034,3016922,2017-08-24,9,PREPARED FOODS,0,Holiday,Local,Ambato,Fundacion de Ambato,False,47.24,Quito,Pichincha,B,6
16035,3016923,2017-08-24,9,PRODUCE,3,Holiday,Local,Ambato,Fundacion de Ambato,False,47.24,Quito,Pichincha,B,6
16036,3016924,2017-08-24,9,SCHOOL AND OFFICE SUPPLIES,9,Holiday,Local,Ambato,Fundacion de Ambato,False,47.24,Quito,Pichincha,B,6


## Encode the string values

In [25]:
product_groups = train_merged.groupby("family")
product_groups.get_group("DAIRY").head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,day_type,day_locale,day_locale_name,day_description,day_transferred,oil_price,store_city,store_state,store_type,store_cluster
73070,73070,2013-02-11,1,DAIRY,151.0,0,Holiday,National,Ecuador,Carnaval,False,97.01,Quito,Pichincha,D,13
73103,73103,2013-02-11,10,DAIRY,72.0,0,Holiday,National,Ecuador,Carnaval,False,97.01,Quito,Pichincha,C,15
73136,73136,2013-02-11,11,DAIRY,218.0,0,Holiday,National,Ecuador,Carnaval,False,97.01,Cayambe,Pichincha,B,6
73169,73169,2013-02-11,12,DAIRY,141.0,0,Holiday,National,Ecuador,Carnaval,False,97.01,Latacunga,Cotopaxi,C,15
73202,73202,2013-02-11,13,DAIRY,82.0,0,Holiday,National,Ecuador,Carnaval,False,97.01,Latacunga,Cotopaxi,C,15
