# Preprocessing the Data

# Imports

## Modules and Packages

In [3]:
import pandas as pd

## Data

In [55]:
# Load the ice cream dataset exactly as stored on disk.
ice_cream_data = pd.read_csv('../data/ice_cream_data.csv')

# The CPIH file takes its column names from row index 7 (header=7); the two
# columns used later are then renamed to 'date' and 'cpih'.
cpih_column_names = {'Important notes': 'date', 'Unnamed: 1': 'cpih'}
cpih_data = pd.read_csv('../data/cpih_data.csv', header=7)
cpih_data = cpih_data.rename(columns=cpih_column_names)

In [56]:
# Keep only the cpih rows whose date label contains "202x MON"
# (four characters starting with 202, whitespace, three word characters).
# NOTE(review): the pattern accepts any 202x year; the intended 2020 - 2023
# range relies on the extent of the source file -- confirm.

# Raw string so the regex escapes (\w, \s) are not read as invalid Python
# string escapes. na=False treats missing labels as "no match" so the mask
# stays purely boolean and can be used for indexing.
monthly_mask = cpih_data['date'].str.contains(r'202\w\s\w\w\w', na=False)

# .copy() makes the filtered frame independent of the original, so adding a
# column below does not raise SettingWithCopyWarning.
cpih_data = cpih_data[monthly_mask].copy()

# Parse the label to a datetime and reduce it to a monthly period
cpih_data['year_month'] = pd.to_datetime(cpih_data['date']).dt.to_period('M')

# Creating the same monthly period key in the ice cream data for joining

ice_cream_data['year_month'] = pd.to_datetime(ice_cream_data['DATE']).dt.to_period('M')

In [101]:
# Joining the data
# Each ice cream row picks up the cpih value of its month via the shared
# year_month key. The inner join keeps only months present in both frames;
# the key is dropped afterwards and all column names are lower-cased.

cpih_by_month = cpih_data[['cpih', 'year_month']]
df = (
    ice_cream_data
    .merge(cpih_by_month, on='year_month', how='inner')
    .drop(columns='year_month')
    .rename(columns=str.lower)
)

In [102]:
# Checking for nulls
# Prints 'Nulls!' if any cell of the joined frame is missing; silent otherwise.

has_missing_values = df.isna().to_numpy().any()
if has_missing_values:
    print('Nulls!')

# Feature Engineering

In [103]:
# Preview the first rows of the joined frame (head() defaults to 5 rows)
df.head()

Unnamed: 0,date,day_of_week,weather_region_cd,maximum_feels_like_temperature,rainfall,sunshine_td,avg_ice_cream_price,ice_cream_sold,cpih
0,2022-03-27,0,1,15.4,0.2,12.1,2.704702,1498.0,6.2
1,2022-03-27,0,10,12.7,0.0,10.6,2.704702,2285.0,6.2
2,2022-03-27,0,11,15.1,0.0,10.2,2.704702,5509.0,6.2
3,2022-03-27,0,12,10.7,0.0,5.3,2.704702,7776.0,6.2
4,2022-03-27,0,13,13.1,0.0,8.6,2.704702,4903.0,6.2


In [104]:
# Adding the month of year as a feature
# The date column is parsed to datetimes and its month number is stored
# in a new 'month' column.

parsed_dates = pd.to_datetime(df['date'])
df['month'] = parsed_dates.dt.month

In [105]:
# Casting every column except 'date' to float

# Column list without 'date' (remove() raises if 'date' is absent)
float_cols = df.columns.tolist()
float_cols.remove('date')

# One astype call with a per-column mapping; 'date' keeps its current dtype
df = df.astype(dict.fromkeys(float_cols, 'float'))

In [107]:
# Checking for nulls
# Re-run after the feature engineering steps: prints 'Nulls!' if any cell
# of df is missing, otherwise prints nothing.

any_nulls_present = df.isna().to_numpy().any()
if any_nulls_present:
    print('Nulls!')

# Exporting the Data as a CSV

In [108]:
# Write the processed frame to disk. No index argument is passed, so pandas'
# default applies and the row index is written as the first column.
causal_csv_path = '../data/causal_df.csv'
df.to_csv(causal_csv_path)