In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import sklearn
from scipy.stats import randint, uniform
import os

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None
# Display the current working directory; the CSV files below are read relative to it.
os.getcwd()

'c:\\Users\\parke\\data_work\\datascience\\machine_learning\\microbusiness_forecasting'

In [3]:
# model selection, cross validation and hyperparam tuning
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, GridSearchCV, RandomizedSearchCV

# preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer, RobustScaler

# imputation
from sklearn.impute import SimpleImputer, KNNImputer

# pipelines
from sklearn.pipeline import make_pipeline

# custom transformers 
from sklearn.compose import make_column_selector, make_column_transformer, ColumnTransformer

# dummy model for baseline comparison
from sklearn.dummy import DummyRegressor 

# regression model metrics
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

# GoDaddy - Microbusiness Density Forecasting
__[Kaggle Competition Link](https://www.kaggle.com/competitions/godaddy-microbusiness-density-forecasting)__


## Data Description
train.csv

- row_id - An ID code for the row.
- cfips - A unique identifier for each county using the Federal Information Processing System. The first two digits correspond to the state FIPS code, while the following 3 represent the county.
- county_name - The written name of the county.
- state_name - The name of the state.
- first_day_of_month - The date of the first day of the month.
- microbusiness_density - Microbusinesses per 100 people over the age of 18 in the given county. This is the target variable. The population figures used to calculate the density are on a two-year lag due to the pace of update provided by the U.S. Census Bureau, which provides the underlying population data annually. 2021 density figures are calculated using 2019 population figures, etc.
- active - The raw count of microbusinesses in the county. Not provided for the test set.


sample_submission.csv: A valid sample submission. This file will remain unchanged throughout the competition.
- row_id - An ID code for the row.
- microbusiness_density - The target variable.


test.csv: Metadata for the submission rows. This file will remain unchanged throughout the competition.
- row_id - An ID code for the row.
- cfips - A unique identifier for each county using the Federal Information Processing System. The first two digits correspond to the state FIPS code, while the following 3 represent the county.
- first_day_of_month - The date of the first day of the month.

revealed_test.csv: During the submission period, only the most recent month of data will be used for the public leaderboard. Any test set data older than that will be published in revealed_test.csv, closely following the usual data release cycle for the microbusiness report. We expect to publish one copy of revealed_test.csv in mid February. This file's schema will match train.csv.

census_starter.csv: Examples of useful columns from the Census Bureau's American Community Survey (ACS) at data.census.gov. The percentage fields were derived from the raw counts provided by the ACS. All fields have a two year lag to match what information was available at the time a given microbusiness data update was published.

## Data import and initial look

The goal is to predict microbusiness density for each of the 3135 counties, for every month from 1 to 8 months into the future. We have 39 months of training data (3.25 years).

In [85]:
# Read the raw competition files (paths are relative to the working directory).
train_0 = pd.read_csv('train.csv')
test_0 = pd.read_csv('test.csv')

census_starter = pd.read_csv('census_starter.csv')
revealed_test = pd.read_csv('revealed_test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Shorten the verbose column names and set dtypes early.
# The datetime unit is spelled out ('datetime64[ns]') because pandas >= 2.0
# rejects casting to the unit-less 'datetime64' dtype.
train_0 = (train_0.rename(columns = {'first_day_of_month':'month', 'microbusiness_density':'mbd'})
                    .astype({'state':'category', 'month':'datetime64[ns]'})
            )

test_0 = (test_0.rename(columns = {'first_day_of_month':'month'})
                    .astype({'month':'datetime64[ns]'})
            )

train_0.head()

Unnamed: 0,row_id,cfips,county,state,month,mbd,active
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243


The goal is to make 3135 individual predictions (one per county) for each month, from 1 to 8 months into the future.

In [67]:
# Number of distinct submission rows in each forecast month of the test set.
test_0.groupby('month')['row_id'].nunique().reset_index()

Unnamed: 0,month,row_id
0,2022-11-01,3135
1,2022-12-01,3135
2,2023-01-01,3135
3,2023-02-01,3135
4,2023-03-01,3135
5,2023-04-01,3135
6,2023-05-01,3135
7,2023-06-01,3135


In [107]:
# Time span and number of counties covered by the training data.
print(f"First month of training data: {train_0['month'].min()}")
print(f"Last month of training data: {train_0['month'].max()}")
print(f"Months of training data: {train_0['month'].nunique()}")
print(f"Number of unique counties: {train_0['cfips'].nunique()}")

First month of training data: 2019-08-01 00:00:00
Last month of training data: 2022-10-01 00:00:00
Months of training data: 39
Number of unique counties: 3135


### Taking a look at a random sample of counties

In [113]:
# Draw the county samples from a seeded generator so that the sampled counties,
# and every plot built on them, are identical after Restart & Run All.
rng = np.random.default_rng(42)

counties = train_0['cfips'].unique()
sample_counties_30 = rng.choice(counties, size = 30, replace = False)
sample_counties_500 = rng.choice(counties, size = 500, replace = False)

# Training rows restricted to the sampled counties.
train_sample_30 = train_0.query("cfips in @sample_counties_30")
train_sample_500 = train_0.query("cfips in @sample_counties_500")


In [98]:
# Training rows for the 30 sampled counties.
train_sample_30 = train_0[train_0['cfips'].isin(sample_counties_30)]

# One line per county: microbusiness density over the training period.
mbd_trajectories = px.line(
    train_sample_30,
    x = 'month',
    y = 'mbd',
    color = 'cfips',
    title = "Target Variable trajectory over training period for 30 random counties",
    width = 1000,
    height = 600,
)
mbd_trajectories.show()

In [97]:
# One box per county: spread of microbusiness density over the training period.
mbd_spread = px.box(
    train_sample_30,
    x = 'cfips',
    y = 'mbd',
    title = "Variation in Target Variable over training period for 30 random counties (cfips)",
    width = 1000,
    height = 600,
)
mbd_spread.show()

In [89]:
def get_autocorr(df, lag):
    """Return the lag-`lag` autocorrelation of `mbd` for every county in `df`.

    `df` must have `cfips` and `mbd` columns. The result is a Series named
    `lag` holding one value per `cfips` group; the county identifier itself
    is not part of the returned Series.
    """
    per_county = df.groupby('cfips', as_index = False)['mbd'].apply(
        lambda series: series.autocorr(lag=lag)
    )
    return per_county['mbd'].rename(lag)


In [121]:
# One column per lag (1-17); each column holds one autocorrelation per sampled county.
# Build all columns first and concatenate once instead of growing a frame in a loop.
autocorrs_30 = pd.concat(
    [get_autocorr(df = train_sample_30, lag = lag) for lag in range(1, 18)],
    axis = 1,
)

# get_autocorr groups by cfips, and groupby returns its groups in ascending key
# order. sample_counties_30 is in random draw order, so using it as the label
# would attach the wrong county to each row; label with the sorted ids instead.
autocorrs_30['cfips'] = np.sort(train_sample_30['cfips'].unique())
autocorrs_30_long = autocorrs_30.melt(id_vars = 'cfips', var_name = 'lag', value_name='autocorrelation')

px.line(autocorrs_30_long,
        x = 'lag',
        y = 'autocorrelation',
        color = 'cfips',
        title = "Autocorrelation for differnet lags for 30 random counties",
        height= 500,
        width = 1200).show()

In [119]:
# One column per lag (1-11); each column holds one autocorrelation per sampled county.
# Build all columns first and concatenate once instead of growing a frame in a loop.
autocorrs_500 = pd.concat(
    [get_autocorr(df = train_sample_500, lag = lag) for lag in range(1, 12)],
    axis = 1,
)

# get_autocorr groups by cfips, and groupby returns its groups in ascending key
# order. sample_counties_500 is in random draw order, so using it as the label
# would attach the wrong county to each row; label with the sorted ids instead.
autocorrs_500['cfips'] = np.sort(train_sample_500['cfips'].unique())
autocorrs_500_long = autocorrs_500.melt(id_vars = 'cfips', var_name = 'lag', value_name='autocorrelation')

px.box(autocorrs_500_long,
        x = 'lag',
        y = 'autocorrelation',
        title = "Variation in the degree of autocorrelation of MBD for different lags for 500 random counties",
        height= 500,
        width = 1200).add_hline(y=0.0).show()

In [146]:
# Per-county variance and standard deviation of mbd over the training period.
mbd_variance = train_0.groupby('cfips')['mbd'].agg(['var','std']).reset_index()

# Distribution of the per-county standard deviation (one observation per county).
# The former mid-cell `mbd_variance.head()` was removed: only the last expression
# of a cell is displayed, so it had no effect.
px.histogram(mbd_variance, 
                x = 'std',
                nbins = 300,
                width = 1000,
                title = "Histogram of the variance in MBD: each observation is the std_dev of MDB of a county during the training period").show()

## Looking at the census data provided
- pct_bb_[year] - The percentage of households in the county with access to broadband of any type. Derived from ACS table B28002: PRESENCE AND TYPES OF INTERNET SUBSCRIPTIONS IN HOUSEHOLD.
- cfips - The CFIPS code.
- pct_college_[year] - The percent of the population in the county over age 25 with a 4-year college degree. Derived from ACS table S1501: EDUCATIONAL ATTAINMENT.
- pct_foreign_born_[year] - The percent of the population in the county born outside of the United States. Derived from ACS table DP02: SELECTED SOCIAL CHARACTERISTICS IN THE UNITED STATES.
- pct_it_workers_[year] - The percent of the workforce in the county employed in information related industries. Derived from ACS table S2405: INDUSTRY BY OCCUPATION FOR THE CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER.
- median_hh_inc_[year] - The median household income in the county. Derived from ACS table S1901: INCOME IN THE PAST 12 MONTHS (IN 2021 INFLATION-ADJUSTED DOLLARS).


In [5]:
# Preview the first five rows of the wide census table.
census_starter.iloc[:5]

Unnamed: 0,pct_bb_2017,pct_bb_2018,pct_bb_2019,pct_bb_2020,pct_bb_2021,cfips,pct_college_2017,pct_college_2018,pct_college_2019,pct_college_2020,pct_college_2021,pct_foreign_born_2017,pct_foreign_born_2018,pct_foreign_born_2019,pct_foreign_born_2020,pct_foreign_born_2021,pct_it_workers_2017,pct_it_workers_2018,pct_it_workers_2019,pct_it_workers_2020,pct_it_workers_2021,median_hh_inc_2017,median_hh_inc_2018,median_hh_inc_2019,median_hh_inc_2020,median_hh_inc_2021
0,76.6,78.9,80.6,82.7,85.5,1001,14.5,15.9,16.1,16.7,16.4,2.1,2.0,2.3,2.3,2.1,1.3,1.1,0.7,0.6,1.1,55317,58786.0,58731,57982.0,62660.0
1,74.5,78.1,81.8,85.1,87.9,1003,20.4,20.7,21.0,20.2,20.6,3.2,3.4,3.7,3.4,3.5,1.4,1.3,1.4,1.0,1.3,52562,55962.0,58320,61756.0,64346.0
2,57.2,60.4,60.5,64.6,64.6,1005,7.6,7.8,7.6,7.3,6.7,2.7,2.5,2.7,2.6,2.6,0.5,0.3,0.8,1.1,0.8,33368,34186.0,32525,34990.0,36422.0
3,62.0,66.1,69.2,76.1,74.6,1007,8.1,7.6,6.5,7.4,7.9,1.0,1.4,1.5,1.6,1.1,1.2,1.4,1.6,1.7,2.1,43404,45340.0,47542,51721.0,54277.0
4,65.8,68.5,73.0,79.6,81.0,1009,8.7,8.1,8.6,8.9,9.3,4.5,4.4,4.5,4.4,4.5,1.3,1.4,0.9,1.1,0.9,47412,48695.0,49358,48922.0,52830.0


In [54]:
# Count of missing values in each census column.
census_starter.isnull().sum()

pct_bb_2017              0
pct_bb_2018              0
pct_bb_2019              0
pct_bb_2020              1
pct_bb_2021              1
cfips                    0
pct_college_2017         0
pct_college_2018         0
pct_college_2019         0
pct_college_2020         1
pct_college_2021         1
pct_foreign_born_2017    0
pct_foreign_born_2018    0
pct_foreign_born_2019    0
pct_foreign_born_2020    1
pct_foreign_born_2021    1
pct_it_workers_2017      0
pct_it_workers_2018      1
pct_it_workers_2019      0
pct_it_workers_2020      1
pct_it_workers_2021      1
median_hh_inc_2017       0
median_hh_inc_2018       1
median_hh_inc_2019       0
median_hh_inc_2020       2
median_hh_inc_2021       2
dtype: int64

#### Convert to tidy dataset

In [49]:
def census_fixer(metric_string, census=None):
    """Reshape one census metric from wide (one column per year) to tidy form.

    Parameters
    ----------
    metric_string : str
        Column-name pattern of the metric, e.g. 'pct_bb'. Every column whose
        name matches it (as a regex) is selected, and the last four characters
        of each matching column name are used as the year.
    census : DataFrame, optional
        Wide census table containing a 'cfips' column and the metric columns.
        Defaults to the notebook-level `census_starter`.

    Returns
    -------
    DataFrame
        Columns 'cfips', 'year' (string) and `metric_string`, one row per
        county and year.
    """
    if census is None:
        census = census_starter
    # grab the relevant columns
    wide = census.filter(regex = metric_string).copy()
    # rename for the year of the observation
    wide.columns = wide.columns.str[-4:]
    # add on the cfips identifier, taken from the same frame so rows stay aligned
    # (the original read an undefined global named `cfips`)
    wide['cfips'] = census['cfips']
    # convert to a tidy dataset
    return wide.melt(id_vars='cfips', var_name='year', value_name=metric_string)



In [79]:
# Tidy (cfips, year, value) frame for each census metric.
pct_bb = census_fixer(metric_string='pct_bb')
pct_college = census_fixer('pct_college')
pct_foreign_born = census_fixer('pct_foreign_born')
pct_it_workers = census_fixer('pct_it_workers')
median_hh_inc = census_fixer('median_hh_inc')

# Sanity check: every tidy frame should report the same size.
print(*(frame.size for frame in (pct_bb, pct_college, pct_foreign_born, pct_it_workers, median_hh_inc)))

# Join the metrics one after another on the county/year key.
census_yearly = pct_bb
for frame in (pct_college, pct_foreign_born, pct_it_workers, median_hh_inc):
    census_yearly = census_yearly.merge(frame, on = ['cfips','year'])

census_yearly.head()

47130 47130 47130 47130 47130


Unnamed: 0,cfips,year,pct_bb,pct_college,pct_foreign_born,pct_it_workers,median_hh_inc
0,1001,2017,76.6,14.5,2.1,1.3,55317.0
1,1003,2017,74.5,20.4,3.2,1.4,52562.0
2,1005,2017,57.2,7.6,2.7,0.5,33368.0
3,1007,2017,62.0,8.1,1.0,1.2,43404.0
4,1009,2017,65.8,8.7,4.5,1.3,47412.0


In [81]:
# Census metrics for the 30 sampled counties. The original referenced
# `sample_counties`, which is never defined in this notebook.
census_sample = census_yearly.query("cfips in @sample_counties_30")

# One facet per census metric; y-axes are left independent because the metrics
# are on different scales (percentages vs. dollars).
px.line(census_sample.melt(id_vars=['cfips','year']),
    x = 'year',
    y = 'value',
    color = 'cfips',
    facet_col= 'variable',
    title = "Census metrics by year for 30 random counties").update_yaxes(matches=None).show()