In [None]:
### NOTE: functions to get indicator data as a dataframe 
#were written based on indicator data inputs - please check formatting 
#of input csv to 

In [59]:
import pandas as pd
from datetime import datetime
import datetime as dt
filename = "financial_indicator_data/"

## GDP

In [60]:
# Read the GDP data
def get_gdp_df(path, start_date, end_date):
    """
    Returns a dataframe with date, indicator, and value columns:
    date: YYYY-MM-DD 
    indicator: GDP
    value: year-over-year percent change of monthly GDP
    
    input:
    path: filepath to input csv file containing GDP data
    start_date: string in YYYY-MM-DD format
    end_date:  string in YYYY-MM-DD format
    
    Assume start_date and end_date are contained within the input csv file

    
    """
    df = pd.read_csv(path)
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")
    

    ## select only all industries
    df = df.loc[df['North American Industry Classification System (NAICS)'].values == 'All industries [T001]']
    # Add dummy day as first day of the month
    df['REF_DATE'] = (df['REF_DATE'] + "-01")
    # Convert the column to datetime type from string type
    df['REF_DATE'] = df['REF_DATE'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))
    #Scale adjusted to millions (so multiply by 1 million)
    df['VALUE'] = df['VALUE']*1000000
    # Create new dataframe with only required columns : date and gdp value
    gdp_df =  pd.concat([df['REF_DATE'], df['VALUE']], axis=1, keys=['date', 'value'])
    # Create a new column for indicator
    gdp_df.insert(loc=1, column='indicator', value="GDP")


    # create a new_column and sort by date
    gdp_df['percentage_change'] = 0.0
    gdp_df = gdp_df.sort_values('date')

    for i in range(0,gdp_df.shape[0]):
        date_column = gdp_df['date'][i]
        #print(date_column)
        if start_date <= date_column <= end_date:
            date_offset = date_column - pd.offsets.DateOffset(years=1)
    #    one year ago date column  is  date_offset)
            filtered_df = gdp_df[(gdp_df.date == date_offset)]
            #print(filtered_df)
            gdp_df['percentage_change'][i] = ( (gdp_df['value'][i]-filtered_df['value'])/filtered_df['value'])*100

    # Drop the data for less than 1 year
    for i in range(gdp_df.shape[0]):
        # Dates where we want to consider the data
        date_column = gdp_df['date'][i]    
        if start_date <= date_column <= end_date:
            pass
        else:
            gdp_df = gdp_df.drop(gdp_df[(gdp_df.date == date_column)].index)

    gdp_df = gdp_df.drop(columns=["value"])
    gdp_df = gdp_df.rename(columns={"percentage_change": "value"})
    
    #tests
    assert(gdp_df["date"].dtype == "datetime64[ns]")
    assert(gdp_df["indicator"].dtype == "object")
    assert(gdp_df["value"].dtype == "float64")
    
    return gdp_df



## TSX

In [62]:
def get_tsx_df(path):
    """
    Returns a dataframe with date, indicator, and value columns:
    date: YYYY-MM-DD 
    indicator: TSX
    value: TSX close value of that date
    
    input:
    path: filepath to input csv file containing TSX data

    Assume input csv starts and ends with the desired values
    Assume input csv has a 'Close' column
    """
    df = pd.read_csv(path)
    # Convert the column to datetime type from string type
    df['Date'] = df['Date'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))
    # Create new dataframe with only required columns : date and Close values
    tsx_df = pd.concat([df['Date'], df['Close']], axis=1, keys=['date', 'value'])
    # Create a new column for indicator
    tsx_df.insert(loc=1, column='indicator', value="TSX")
    
    #tests
    assert(tsx_df["date"].dtype == "datetime64[ns]")
    assert(tsx_df["indicator"].dtype == "object")
    assert(tsx_df["value"].dtype == "float64")
    
    return tsx_df
    


## Mortgage rates (%)

In [64]:
def get_mortgage_df(path):
    """
    Returns a dataframe with date, indicator, and value columns:
    date: YYYY-MM-DD 
    indicator: TSX
    value: Total, funds advanced, residential mortgages, insured
    
    input:
    path: filepath to input csv file containing mortgage rate data

    Assume relevant information is contained within input csv rows where:
        "Unit of measure" column has value "Interest rate"
        "Components" column has value "Total, funds advanced, residential mortgages, insured"
    """
    
    df = pd.read_csv(path)
    ## select only "Interest rates" and "Total, funds advanced, residential mortgages, insured"
    df = df[(df['Unit of measure'] == "Interest rate" )& (df['Components'] == "Total, funds advanced, residential mortgages, insured")]
    # Add dummy day as first day of the month
    df['REF_DATE'] = (df['REF_DATE'] + "-01")
    # Convert the column to datetime type from string type
    df['REF_DATE'] = df['REF_DATE'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))
    # Create new dataframe with only required columns : date and gdp value
    mortgage_rate_df =  pd.concat([df['REF_DATE'], df['VALUE']], axis=1, keys=['date', 'value'])
    # Create a new column for indicator
    mortgage_rate_df.insert(loc=1, column='indicator', value="mortgage_rate")
    
    #tests
    assert(mortgage_rate_df["date"].dtype == "datetime64[ns]")
    assert(mortgage_rate_df["indicator"].dtype == "object")
    assert(mortgage_rate_df["value"].dtype == "float64")
    
    return mortgage_rate_df

## Interest rates (%)

In [75]:
def get_interest_df(path):
    """
    Returns a dataframe with date, indicator, and value columns:
    Date: YYYY-MM-DD 
    indicator: interest_rate
    value: value of overnight target interest rate
    
    input:
    path: filepath to input csv file containing interest rate data

    Assume all rows in input csv are relevant
    Assume header is 11 lines
    """
    
    df = pd.read_csv(path, skiprows=11)
    df.head()
    boc_interest_rates_df = df
    boc_interest_rates_df.insert(loc=1, column='indicator', value="interest_rate")
    boc_interest_rates_df["Date"] = pd.to_datetime(boc_interest_rates_df["Date"], format="%Y-%m-%d")
    boc_interest_rates_df = boc_interest_rates_df.sort_values(by='Date')
    boc_interest_rates_df =boc_interest_rates_df.rename(columns={"V39079": "value"})
    
    #tests
    assert(boc_interest_rates_df["Date"].dtype == "datetime64[ns]")
    assert(boc_interest_rates_df["indicator"].dtype == "object")
    assert(boc_interest_rates_df["value"].dtype == "float64")
    return boc_interest_rates_df

## Employment rate(%)

In [77]:
def get_employment_df(path):
    """
    Returns a dataframe with date, indicator, and value columns:
    date: YYYY-MM-DD 
    indicator: interest_rate
    value: employment rates
    
    input:
    path: filepath to input csv file containing employment data

    Assume relevant information is contained within input csv rows where:
        "Sex" column has value "Both sexes"
        "Age group" has value "15 years and over"
        "GEO" column has value "Canada"
    """
    df = pd.read_csv(path)
    df = df[(df["Sex"] == "Both sexes" )& (df['Age group'] == "15 years and over") & (df['GEO'] == "Canada")]

    ## select only "Both sexes" 
    # df = df.loc[df['Sex'].values == "Both sexes" ]
    # Add dummy day as first day of the month
    df['REF_DATE'] = (df['REF_DATE'] + "-01")
    # Convert the column to datetime type from string type
    df['REF_DATE'] = df['REF_DATE'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))
    # Create new dataframe with only required columns : date and gdp value
    employment_df =  pd.concat([df['REF_DATE'], df['VALUE']], axis=1, keys=['date', 'value'])
    # Create a new column for indicator
    employment_df.insert(loc=1, column='indicator', value="employment")
    
    #tests
    assert(employment_df["date"].dtype == "datetime64[ns]")
    assert(employment_df["indicator"].dtype == "object")
    assert(employment_df["value"].dtype == "float64")
    
    return employment_df

## Housing

In [79]:
def get_housing_df(path):
    """
    Returns a dataframe with date, indicator, and value columns:
    date: YYYY-MM-DD 
    indicator: interest_rate
    value: percentage growth in the composite HPI value
    
    input:
    path: filepath to input csv file containing housing price data

    Assume relevant information is contained within rows that either:
        "year" column has value '2019', or 
        "year" column has value '2020'
    """
    
    df = pd.read_csv(path)

    df['Date'] = pd.to_datetime(df['Date'], format='%b %Y')
    housing_price_df =  pd.concat([df['Date'], df['Composite_HPI']], axis=1, keys=['date', 'value'])
    housing_price_df.insert(loc=1, column='indicator', value="housing_price")
    housing_price_df['year'] = housing_price_df['date'].apply(lambda x: x.year)

    housing_price_df = housing_price_df[(housing_price_df["year"] == 2019 ) | (housing_price_df["year"] == 2020 ) ]

    housing_price_df =housing_price_df.drop(columns = ['year'])

    housing_price_df.reset_index(drop=True, inplace=True)

    housing_price_df['percentage_change'] = 0.0

    for i in range(1,housing_price_df.shape[0]):
        housing_price_df['percentage_change'][i] = ((housing_price_df['value'][i] - housing_price_df['value'][i-1])/housing_price_df['value'][i])*100

    housing_price_df = housing_price_df.drop(columns=["value"])
    housing_price_df = housing_price_df.rename(columns={"percentage_change": "value"})
    
    #tests
    assert(housing_price_df["date"].dtype == "datetime64[ns]")
    assert(housing_price_df["indicator"].dtype == "object")
    assert(housing_price_df["value"].dtype == "float64")
    
    return housing_price_df

In [81]:
def df_to_merge(df, indicator_name):
    """returns a two column dataframe from a three column dataframe
    
    order of columns -input: date, type of indicator, value of indicator
    order of columns - output: date, value of indicator w/ indicator in column name
    
    """

    df_to_merge = df
    val_name = "value_" + indicator_name
    df_to_merge.columns = ['date', 'ind', val_name]
    df_to_merge = df_to_merge.drop(columns=["ind"])
    
    return df_to_merge


## CODE TO COMBINE DATAFRAME

In [83]:
def main():
    
    #create six individual dataframes
    gdp_path = filename + "gdp.csv"
    gdp_start = "2019-04-01"
    gdp_end = "2020-03-01"
    gdp_df = get_gdp_df(gdp_path, gdp_start, gdp_end)
    
    tsx_path = filename + 'tsx.csv'
    tsx_df = get_tsx_df(tsx_path)
    
    mortgage_rate_path = filename + 'mortgage_rates.csv'
    mortgage_rate_df = get_mortgage_df(mortgage_rate_path)
    
    employment_path = filename + 'employment.csv'
    employment_df = get_employment_df(employment_path)
    
    interest_rates_path = filename + 'interest_rates.csv'
    boc_interest_rates_df = get_interest_df(interest_rates_path)
    
    housing_path = filename + 'housing_prices.csv'
    housing_price_df = get_housing_df(housing_path)
    
    #create dataframes with value_indicator column name
    gdp_to_merge = df_to_merge(gdp_df, "GDP")
    tsx_to_merge = df_to_merge(tsx_df, "TSX")
    mort_to_merge = df_to_merge(mortgage_rate_df, "mortgage_rates")
    employment_to_merge = df_to_merge(employment_df, "employment")
    housing_to_merge = df_to_merge(housing_price_df, "housing_prices")
    intr_to_merge = df_to_merge(boc_interest_rates_df, "interest_rates")
    
    #merge dataframe into full dataframe
    total_df = None
    total_df = gdp_to_merge.merge(tsx_to_merge, how="outer")
    total_df = total_df.merge(mort_to_merge, how="outer")
    total_df = total_df.merge(employment_to_merge, how="outer")
    total_df = total_df.merge(housing_to_merge, how="outer")
    total_df = total_df.merge(intr_to_merge, how="outer")
    total_df = total_df.sort_values(by='date')
    
    
    #export_to_csv
    out_filename= "combined_indicator_data.csv"
    total_df.to_csv(out_filename, index = False)
    print("FINANCIAL INDICATOR FILE CREATED: ", out_filename)
    print("ROWS, COLUMNS IN FILE: ", total_df.shape)
    
    
    
main()

FINANCIAL INDICATOR FILE CREATED:  combined_indicator_data.csv
ROWS, COLUMNS IN FILE:  (272, 7)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [24]:
#MERGE ALL DATA
total_df = None
total_df = gdp_to_merge.merge(tsx_to_merge, how="outer")
total_df = total_df.merge(mort_to_merge, how="outer")
total_df = total_df.merge(employment_to_merge, how="outer")
total_df = total_df.merge(housing_to_merge, how="outer")
total_df = total_df.merge(intr_to_merge, how="outer")
total_df = total_df.sort_values(by='date')
total_df

Unnamed: 0,date,value_GDP,value_TSX,value_mortgage_rates,value_employment,value_housing_prices,value_interest_rates
12,2019-01-01,,15540.599609,3.84,60.8,0.000000,
13,2019-02-01,,15999.000000,3.82,61.1,0.350570,
14,2019-03-01,,16102.099609,3.76,61.0,0.696258,
0,2019-04-01,1.864947,16580.699219,3.59,61.6,0.648508,
1,2019-05-01,1.697702,16037.500000,3.45,62.7,0.301724,
...,...,...,...,...,...,...,...
267,2020-05-26,,,,,,0.25
268,2020-05-27,,,,,,0.25
269,2020-05-28,,,,,,0.25
270,2020-05-29,,,,,,0.25


In [30]:
out_filename= "combined_indicator_data.csv"
total_df.to_csv(out_filename, index = False)

In [None]:
#GDP RANGE: 2019-04-01 - 2020-03-01
#TSX RANGE: 2019-01-01- 2020-05-01
#MORTGAGE RANGE: 2019-01-01- 2020-03-01
#EMPLOYMENT RANGE: 2019-01-01- 2020-04-01
#HOUSING PRICE RANGE: 2019-01-01- 2020-04-01
#INTEREST RATE: 2019-05-31 - 2020-06-01