In [1]:
# UT-TOR-DATA-PT-01-2020-U-C Group Project 3
# Final project
# Import unemployment stats from 2000 to 2019
# (c) Boris Smirnov

In [2]:
# Dependencies
import pandas as pd

In [3]:
# Input / output file locations used by the notebook
input_csv = "src/1410028701-noSymbol.csv"  # source CSV that gets loaded
output_csv = "unemployment_202006.csv"     # destination of the cleaned table

In [4]:
# Load the raw unemployment statistics from the source CSV into a DataFrame
df = pd.read_csv(input_csv)

In [7]:
# Save Canada wide stats (kept untouched so they can be reused as defaults)
canada_df = df[df['Geography'] == 'Canada'].copy()

# Append missing data using Canada wide values as default.
# DataFrame.append was removed in pandas 2.0, so build one frame per
# territory and concatenate everything in a single call.
territory_frames = [
    canada_df.assign(Geography=prov)
    for prov in ['Yukon', 'Northwest Territories', 'Nunavut']
]
df = pd.concat([df, *territory_frames], ignore_index=True)

# Remove Canada wide stats: select them by name instead of assuming
# they always sit in the row labelled 0
df = df[df['Geography'] != 'Canada']

# Province name -> province Id
prov_name_id = {
    'Alberta': 48,
    'British Columbia': 59,
    'Nunavut': 62,
    'Manitoba': 46,
    'New Brunswick': 13,
    'Newfoundland and Labrador': 10,
    'Nova Scotia': 12,
    'Ontario': 35,
    'Prince Edward Island': 11,
    'Quebec': 24,
    'Saskatchewan': 47,
    'Yukon': 60,
    'Northwest Territories': 61
}

# Fail loudly on names missing from the mapping; otherwise .map() would
# silently write NaN ids into the output file
unknown_names = df.loc[~df['Geography'].isin(list(prov_name_id)), 'Geography'].unique().tolist()
if unknown_names:
    raise ValueError(f"No province Id defined for: {unknown_names}")

# Remove 'Reference period', replace province name with province Id
# and rename the column accordingly
df = (
    df.drop(columns='Reference period')
      .assign(Geography=lambda frame: frame['Geography'].map(prov_name_id))
      .rename(columns={'Geography': 'Prov Id'})
)

# Save the result
df.to_csv(output_csv, index=False)

df

Unnamed: 0,Prov Id,15 to 24 years,25 to 54 years,55 years and over
1,11,22.4,8.3,5.6
2,12,25.0,9.5,10.3
3,13,20.2,7.0,6.6
4,24,24.0,7.3,8.7
5,35,30.5,9.5,7.9
6,46,19.5,8.2,5.5
7,47,21.1,9.1,7.2
8,48,29.7,11.9,13.4
9,59,27.8,10.0,8.4
10,60,27.5,9.3,8.8


In [8]:
import numpy as np

# Returns a string with a generated SQL "create table" script for a given data frame
def gen_create_table(df, table_name):
    """Build a SQL ``create table`` script whose columns mirror ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names and dtypes define the table columns.
    table_name : str
        Name used in the ``create table`` statement and in the
        commented-out ``drop table`` line that precedes it.

    Returns
    -------
    str
        The script text. Column names are emitted as double-quoted
        identifiers, one column per line, separated by commas.

    Raises
    ------
    TypeError
        If a column has a dtype with no SQL mapping. Raising avoids
        returning a truncated, syntactically invalid script.
    """
    dtype_checks = pd.api.types
    column_defs = []

    for col in df.columns:
        col_type = df[col].dtype

        # Dtype predicates are used instead of comparing against np.object /
        # np.int64 / np.float64: np.object was removed in NumPy 1.24, and the
        # predicates also accept other integer / float widths.
        if dtype_checks.is_bool_dtype(col_type):
            sql_type = 'boolean'
        elif dtype_checks.is_integer_dtype(col_type):
            sql_type = 'integer'
        elif dtype_checks.is_float_dtype(col_type):
            sql_type = 'float'
        elif dtype_checks.is_datetime64_any_dtype(col_type):
            sql_type = 'timestamp'
        elif dtype_checks.is_object_dtype(col_type) or dtype_checks.is_string_dtype(col_type):
            sql_type = 'varchar'
        else:
            raise TypeError(f"Unknown type: {col_type} (column {col!r})")

        # A double quote inside a quoted SQL identifier is escaped by doubling it
        quoted_name = str(col).replace('"', '""')
        column_defs.append(f"\t\"{quoted_name}\" {sql_type}")

    header = f"-- drop table {table_name};\n\ncreate table {table_name} (\n"
    return header + ",\n".join(column_defs) + "\n);\n"

In [10]:
# Build the "create table" script for the cleaned frame and store it under sql/
table_name = "unemployment_202006"
query = gen_create_table(df, table_name)
with open(f"sql/{table_name}.sql", mode="w") as sql_file:
    sql_file.write(query)
