In [8]:
import pandas as pd
from datetime import datetime
import datetime as dt
filename = "indicator_data_raw/"

## GDP

In [9]:

# Read the GDP data
df = pd.read_csv(filename + "GDP_full.csv")

## select only all industries
df = df.loc[df['North American Industry Classification System (NAICS)'].values == 'All industries [T001]']
# Add dummy day as first day of the month
df['REF_DATE'] = (df['REF_DATE'] + "-01")
# Convert the column to datetime type from string type
df['REF_DATE'] = df['REF_DATE'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))
#Scale adjusted to millions (so multiply by 1 million)
df['VALUE'] = df['VALUE']*1000000
# Create new dataframe with only required columns : date and gdp value
gdp_df =  pd.concat([df['REF_DATE'], df['VALUE']], axis=1, keys=['date', 'value'])
# Create a new column for indicator
gdp_df.insert(loc=1, column='indicator', value="GDP")


# create a new_column and sort by date
gdp_df['percentage_change'] = 0.0
gdp_df = gdp_df.sort_values('date')

for i in range(0,gdp_df.shape[0]):
    # Dates where we want to consider the data
    start_date = datetime.strptime("2019-04-01", "%Y-%m-%d")
    end_date = datetime.strptime("2020-03-01", "%Y-%m-%d")
    date_column = gdp_df['date'][i]
    #print(date_column)
    if start_date <= date_column <= end_date:
        date_offset = date_column - pd.offsets.DateOffset(years=1)
#    one year ago date column  is  date_offset)
        filtered_df = gdp_df[(gdp_df.date == date_offset)]
        #print(filtered_df)
        gdp_df['percentage_change'][i] = ( (gdp_df['value'][i]-filtered_df['value'])/filtered_df['value'])*100

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [10]:
# Drop the data for less than 1 year
for i in range(gdp_df.shape[0]):
    # Dates where we want to consider the data
    start_date = datetime.strptime("2019-04-01", "%Y-%m-%d")
    end_date = datetime.strptime("2020-03-01", "%Y-%m-%d")
    date_column = gdp_df['date'][i]    
    if start_date <= date_column <= end_date:
        pass
    else:
        gdp_df = gdp_df.drop(gdp_df[(gdp_df.date == date_column)].index)

gdp_df = gdp_df.drop(columns=["value"])
gdp_df = gdp_df.rename(columns={"percentage_change": "value"})
gdp_df

Unnamed: 0,date,indicator,value
12,2019-04-01,GDP,1.864947
13,2019-05-01,GDP,1.697702
14,2019-06-01,GDP,1.881365
15,2019-07-01,GDP,1.644783
16,2019-08-01,GDP,1.690147
17,2019-09-01,GDP,1.684217
18,2019-10-01,GDP,1.474645
19,2019-11-01,GDP,1.774822
20,2019-12-01,GDP,2.061446
21,2020-01-01,GDP,1.858127


## TSX

In [11]:
df = pd.read_csv(filename + 'tsx.csv')
# Convert the column to datetime type from string type
df['Date'] = df['Date'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))
# Create new dataframe with only required columns : date and Close values
tsx_df = pd.concat([df['Date'], df['Close']], axis=1, keys=['date', 'value'])
# Create a new column for indicator
tsx_df.insert(loc=1, column='indicator', value="TSX")

In [12]:
tsx_df

Unnamed: 0,date,indicator,value
0,2019-01-01,TSX,15540.599609
1,2019-02-01,TSX,15999.0
2,2019-03-01,TSX,16102.099609
3,2019-04-01,TSX,16580.699219
4,2019-05-01,TSX,16037.5
5,2019-06-01,TSX,16382.200195
6,2019-07-01,TSX,16406.599609
7,2019-08-01,TSX,16442.099609
8,2019-09-01,TSX,16658.599609
9,2019-10-01,TSX,16483.199219


## Mortgage rates (%)

In [13]:
df = pd.read_csv(filename + 'mortgage_rates.csv')
## select only "Interest rates" and "Total, funds advanced, residential mortgages, insured"
df = df[(df['Unit of measure'] == "Interest rate" )& (df['Components'] == "Total, funds advanced, residential mortgages, insured")]
# Add dummy day as first day of the month
df['REF_DATE'] = (df['REF_DATE'] + "-01")
# Convert the column to datetime type from string type
df['REF_DATE'] = df['REF_DATE'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))
# Create new dataframe with only required columns : date and gdp value
mortgage_rate_df =  pd.concat([df['REF_DATE'], df['VALUE']], axis=1, keys=['date', 'value'])
# Create a new column for indicator
mortgage_rate_df.insert(loc=1, column='indicator', value="mortgage_rate")
mortgage_rate_df.shape

(15, 3)

In [14]:
mortgage_rate_df

Unnamed: 0,date,indicator,value
15,2019-01-01,mortgage_rate,3.84
16,2019-02-01,mortgage_rate,3.82
17,2019-03-01,mortgage_rate,3.76
18,2019-04-01,mortgage_rate,3.59
19,2019-05-01,mortgage_rate,3.45
20,2019-06-01,mortgage_rate,3.33
21,2019-07-01,mortgage_rate,3.22
22,2019-08-01,mortgage_rate,3.16
23,2019-09-01,mortgage_rate,3.12
24,2019-10-01,mortgage_rate,3.12


## Interest rates (%)

In [16]:
#USE FOR OVERNIGHT MARKET FINANCING - Jan 2019 - May 2020

df = pd.read_csv(filename + "interest_rates_statscan.csv")
df = df[(df["Financial market statistics"] == "Overnight money market financing" ) ]
df['REF_DATE'] = df['REF_DATE'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))
# Replace NaNs with 0
df = df.fillna(0)
# Create new dataframe with only required columns 
omm_interest_rates_df =  pd.concat([df['REF_DATE'], df['VALUE']], axis=1, keys=['date', 'value'])
omm_interest_rates_df.head()
# Create a new column for indicator
omm_interest_rates_df.insert(loc=1, column='indicator', value="interest_rate")
omm_interest_rates_df

Unnamed: 0,date,indicator,value
0,2019-01-01,interest_rate,0.0000
1,2019-01-02,interest_rate,1.7529
2,2019-01-03,interest_rate,1.7518
3,2019-01-04,interest_rate,1.7500
4,2019-01-05,interest_rate,0.0000
...,...,...,...
510,2020-05-25,interest_rate,0.2402
511,2020-05-26,interest_rate,0.2393
512,2020-05-27,interest_rate,0.2386
513,2020-05-28,interest_rate,0.2358


In [17]:
#USE FOR BANK OF CANADA TARGET RATES - end of May 2019 - June 2020

df = pd.read_csv(filename + 'interest-rates_bank-of-canada_2019-may-to-2020-june.csv',
                skiprows=11)
df.head()
boc_interest_rates_df = df
boc_interest_rates_df.insert(loc=1, column='indicator', value="interest_rate")
boc_interest_rates_df["Date"] = pd.to_datetime(boc_interest_rates_df["Date"], format="%Y-%m-%d")
boc_interest_rates_df = boc_interest_rates_df.sort_values(by='Date')
boc_interest_rates_df

Unnamed: 0,Date,indicator,V39079
261,2019-05-31,interest_rate,1.75
260,2019-06-03,interest_rate,1.75
259,2019-06-04,interest_rate,1.75
258,2019-06-05,interest_rate,1.75
257,2019-06-06,interest_rate,1.75
...,...,...,...
4,2020-05-26,interest_rate,0.25
3,2020-05-27,interest_rate,0.25
2,2020-05-28,interest_rate,0.25
1,2020-05-29,interest_rate,0.25


## Employment rate(%)

In [18]:
df = pd.read_csv(filename + 'employment.csv')
df = df[(df["Sex"] == "Both sexes" )& (df['Age group'] == "15 years and over") & (df['GEO'] == "Canada")]

## select only "Both sexes" 
# df = df.loc[df['Sex'].values == "Both sexes" ]
# Add dummy day as first day of the month
df['REF_DATE'] = (df['REF_DATE'] + "-01")
# Convert the column to datetime type from string type
df['REF_DATE'] = df['REF_DATE'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))
# Create new dataframe with only required columns : date and gdp value
employment_df =  pd.concat([df['REF_DATE'], df['VALUE']], axis=1, keys=['date', 'value'])
# Create a new column for indicator
employment_df.insert(loc=1, column='indicator', value="employment")
employment_df.shape

(16, 3)

In [19]:
employment_df

Unnamed: 0,date,indicator,value
0,2019-01-01,employment,60.8
1,2019-02-01,employment,61.1
2,2019-03-01,employment,61.0
3,2019-04-01,employment,61.6
4,2019-05-01,employment,62.7
5,2019-06-01,employment,63.2
6,2019-07-01,employment,62.7
7,2019-08-01,employment,62.7
8,2019-09-01,employment,62.4
9,2019-10-01,employment,62.3


## Housing

In [20]:
df = pd.read_csv(filename + 'housing_prices.csv')

df['Date'] = pd.to_datetime(df['Date'], format='%b %Y')
housing_price_df =  pd.concat([df['Date'], df['Composite_HPI']], axis=1, keys=['date', 'value'])
housing_price_df.insert(loc=1, column='indicator', value="housing_price")
housing_price_df['year'] = housing_price_df['date'].apply(lambda x: x.year)

housing_price_df = housing_price_df[(housing_price_df["year"] == 2019 ) | (housing_price_df["year"] == 2020 ) ]

housing_price_df =housing_price_df.drop(columns = ['year'])

housing_price_df.reset_index(drop=True, inplace=True)

housing_price_df['percentage_change'] = 0.0

for i in range(1,housing_price_df.shape[0]):
    housing_price_df['percentage_change'][i] = ((housing_price_df['value'][i] - housing_price_df['value'][i-1])/housing_price_df['value'][i])*100
    
housing_price_df = housing_price_df.drop(columns=["value"])
housing_price_df = housing_price_df.rename(columns={"percentage_change": "value"})
housing_price_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,date,indicator,value
0,2019-01-01,housing_price,0.0
1,2019-02-01,housing_price,0.35057
2,2019-03-01,housing_price,0.696258
3,2019-04-01,housing_price,0.648508
4,2019-05-01,housing_price,0.301724
5,2019-06-01,housing_price,0.215054
6,2019-07-01,housing_price,0.042992
7,2019-08-01,housing_price,0.085911
8,2019-09-01,housing_price,0.214316
9,2019-10-01,housing_price,0.341734


## CODE TO COMBINE DATAFRAME

In [22]:
def df_to_merge(df, indicator_name):
    """returns a two column dataframe from a three column dataframe
    
    order of columns -input: date, type of indicator, value of indicator
    order of columns - output: date, value of indicator w/ indicator in column name
    
    """

    df_to_merge = df
    val_name = "value_" + indicator_name
    df_to_merge.columns = ['date', 'ind', val_name]
    df_to_merge = df_to_merge.drop(columns=["ind"])
    
    return df_to_merge


In [23]:
#MONTHLY DATA 
gdp_to_merge = df_to_merge(gdp_df, "GDP")
tsx_to_merge = df_to_merge(tsx_df, "TSX")
mort_to_merge = df_to_merge(mortgage_rate_df, "mortgage_rates")
employment_to_merge = df_to_merge(employment_df, "employment")
housing_to_merge = df_to_merge(housing_price_df, "housing_prices")

In [24]:
#DAILY DATA - interest rate data

intr_to_merge = df_to_merge(boc_interest_rates_df, "interest_rates")

#FOR overnight money market values (2019 Jan - 2020 May, plz uncomment following line
#intr_to_merge = df_to_merge(omm_interest_rates_df, "interest_rates") 

In [25]:
#MERGE ALL DATA
total_df = None
total_df = gdp_to_merge.merge(tsx_to_merge, how="outer")
total_df = total_df.merge(mort_to_merge, how="outer")
total_df = total_df.merge(employment_to_merge, how="outer")
total_df = total_df.merge(housing_to_merge, how="outer")
total_df = total_df.merge(intr_to_merge, how="outer")
total_df = total_df.sort_values(by='date')
total_df

Unnamed: 0,date,value_GDP,value_TSX,value_mortgage_rates,value_employment,value_housing_prices,value_interest_rates
12,2019-01-01,,15540.599609,3.84,60.8,0.000000,
13,2019-02-01,,15999.000000,3.82,61.1,0.350570,
14,2019-03-01,,16102.099609,3.76,61.0,0.696258,
0,2019-04-01,1.864947,16580.699219,3.59,61.6,0.648508,
1,2019-05-01,1.697702,16037.500000,3.45,62.7,0.301724,
...,...,...,...,...,...,...,...
267,2020-05-26,,,,,,0.25
268,2020-05-27,,,,,,0.25
269,2020-05-28,,,,,,0.25
270,2020-05-29,,,,,,0.25


In [30]:
out_filename= "combined_indicator_data.csv"
total_df.to_csv(out_filename, index = False)

In [None]:
#GDP RANGE: 2019-04-01 - 2020-03-01
#TSX RANGE: 2019-01-01- 2020-05-01
#MORTGAGE RANGE: 2019-01-01- 2020-03-01
#EMPLOYMENT RANGE: 2019-01-01- 2020-04-01
#HOUSING PRICE RANGE: 2019-01-01- 2020-04-01
#INTEREST RATE: 2019-05-31 - 2020-06-01