In [None]:
import os
import gc
from glob import glob
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
class Config:
    """Locations of the competition CSV files used by this notebook."""

    # Root folder of the competition dataset.
    _base_dir = '/kaggle/input/godaddy-microbusiness-density-forecasting'

    # Every file below lives directly under the dataset root.
    (
        _train_dir,
        _test_dir,
        _rev_test_dir,
        _cen_dir,
        _sample_submission,
    ) = (
        os.path.join(_base_dir, 'train.csv'),
        os.path.join(_base_dir, 'test.csv'),
        os.path.join(_base_dir, 'revealed_test.csv'),
        os.path.join(_base_dir, 'census_starter.csv'),
        os.path.join(_base_dir, 'sample_submission.csv'),
    )

In [None]:
# --- Training rows: train.csv stacked with revealed_test.csv, ordered per county by date.
train = pd.read_csv(Config._train_dir, parse_dates=['first_day_of_month'])
reaveal_test = pd.read_csv(Config._rev_test_dir, parse_dates=['first_day_of_month'])
train = (
    pd.concat([train, reaveal_test])
    .sort_values(by=['cfips', 'first_day_of_month'])
    .reset_index(drop=True)
    .assign(
        month=lambda frame: frame.first_day_of_month.dt.month,
        year=lambda frame: frame.first_day_of_month.dt.year,
    )
)
train['tr_te_spl'] = 0  # 0 marks a training row

# --- Test rows: 2022-11 and 2022-12 are removed
# (presumably because revealed_test.csv already supplies them -- confirm).
test = pd.read_csv(Config._test_dir, parse_dates=['first_day_of_month'])
test['tr_te_spl'] = 1  # 1 marks a row to be predicted
drop_index = test.first_day_of_month.isin(pd.to_datetime(['2022-11-01', '2022-12-01']))
test = test.loc[~drop_index].assign(
    month=lambda frame: frame.first_day_of_month.dt.month,
    year=lambda frame: frame.first_day_of_month.dt.year,
)

# --- One combined table; county/state are missing on test rows, so they are
# forward-filled within each cfips.
train_test = (
    pd.concat((train, test))
    .sort_values(['cfips', 'row_id'])
    .reset_index(drop=True)
    .drop(columns=['first_day_of_month'])
)
for col in ('county', 'state'):
    train_test[col] = train_test.groupby(by='cfips')[col].ffill()

# --- Submission template, pre-filled with the revealed values.
sample_submission = pd.read_csv(Config._sample_submission).set_index('row_id')
m = reaveal_test['row_id'].tolist()
sample_submission.loc[m, 'microbusiness_density'] = (
    reaveal_test.set_index('row_id').loc[m, 'microbusiness_density']
)

In [None]:
def stats(df):
    """Summarise `microbusiness_density` per `cfips`.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'cfips' and 'microbusiness_density'.

    Returns
    -------
    pd.DataFrame
        One row per cfips with the columns: cfips, max, min, count, median,
        mean, std, var, first, last, mode.
    """

    def mode(values):
        # Series.mode() drops NaN, so an all-NaN group yields an empty result;
        # report NaN for it instead of failing on a missing first element.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else np.nan

    # Passing a *named* function makes pandas label the column 'mode' directly,
    # so nothing depends on the auto-generated '<lambda_0>' name.
    return (
        df
        .groupby('cfips')['microbusiness_density']
        .agg(['max', 'min', 'count', 'median', 'mean', 'std', 'var', 'first', 'last', mode])
        .reset_index()
    )

# Per-county summary statistics, most volatile counties (largest std) first.
map_vls = (
    stats(train)
    .sort_values(by='std', ascending=False)
    .reset_index(drop=True)
)
map_vls.head()

In [None]:
# Keep the first `sample` fraction of the rows of map_vls (in its current order).
n_cfips = len(map_vls)
sample = .5
filter_len = int(n_cfips * sample)

# Positional slice: `.loc[:filter_len]` is label-based and inclusive, so it
# would return filter_len + 1 rows; `.iloc` returns exactly filter_len.
hv_cfips = list(map_vls['cfips'].iloc[:filter_len].values)

In [None]:
def normalizer(series: pd.Series = None):
    """Min-max scale `series`: its minimum maps to 0 and its maximum to 1."""
    lowest = series.min()
    spread = series.max() - lowest
    return (series - lowest) / spread
def denormalizer(norm_series: pd.Series = None, lower=None, upper=None):
    """Invert min-max scaling: ``norm_series * (upper - lower) + lower``.

    Parameters
    ----------
    norm_series : pd.Series
        Values on the normalised scale.
    lower, upper : optional
        Bounds of the original scale. When omitted, the module-level variables
        `min_val` / `max_val` are used, so existing one-argument calls keep
        working as long as those globals are set beforehand.
    """
    if lower is None:
        lower = min_val
    if upper is None:
        upper = max_val
    return norm_series * (upper - lower) + lower
def flatten(l: list = None):
    """Concatenate the items of every iterable in `l` into one flat list."""
    flat = []
    for chunk in l:
        for element in chunk:
            flat.append(element)
    return flat

In [None]:
# Min-max scale the density separately inside each cfips.
# `transform` always returns a result indexed like `train_test`, whereas
# `groupby(...).apply` prepends the group key to the index on pandas >= 2.0,
# which would break this column assignment.
train_test['microbusiness_density_normalized'] = (
    train_test.groupby(by='cfips')['microbusiness_density'].transform(normalizer)
)

In [None]:
#train = train[train['cfips'].isin(hv_cfips)]
#test = test[test['cfips'].isin(hv_cfips)]
#train_test = train_test[train_test['cfips'].isin(hv_cfips)]

In [None]:
from sklearn.preprocessing import LabelEncoder

label_enc_cols = ['county', 'state']

# Replace each text column by integer codes; a fresh encoder is fitted per column.
for catcol in label_enc_cols:
    encoder = LabelEncoder()
    train_test.loc[:, catcol] = encoder.fit_transform(train_test[catcol])

train_test.head()

In [None]:
# Row offsets used for the lag/lead columns: -4, -3, ..., 6, 7.
lags = list(range(-4, 8))

In [None]:
def Features(df:pd.DataFrame = None, lags = lags):
    """Add a row counter, anchor values and lag/lead columns to one group of rows.

    The notebook applies this per county through ``groupby('cfips').apply``;
    the rows are presumably already in chronological order -- confirm at the
    call site.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns 'microbusiness_density_normalized' and 'tr_te_spl'.
    lags : iterable of int
        Row offsets. ``i > 0`` creates 'mbd_t+{i}' holding the value i rows
        ahead; ``i <= 0`` creates 'mbd_t{i}' holding the value |i| rows back
        ('mbd_t0' is the current value).

    Returns
    -------
    pd.DataFrame
        A new frame (the input is left untouched) with the added columns
        'dcount', 'first', 'last' and the lag/lead columns, appended in that
        order and, for the lags, in the order of `lags`. +/-inf is replaced by NaN.
    """
    target = 'microbusiness_density_normalized'

    # Work on a copy so the caller's frame is never modified in place.
    out = df.copy()

    # 0-based position of each row inside the group.
    out['dcount'] = np.arange(len(out))
    # Value of the first row, and value of the last row flagged tr_te_spl == 0.
    out['first'] = out[target].iloc[0]
    out['last'] = out.loc[out['tr_te_spl'] == 0, target].iloc[-1]

    for i in lags:
        column = f'mbd_t+{i}' if i > 0 else f'mbd_t{i}'
        out[column] = out[target].shift(-i)

    out = out.replace([np.inf, -np.inf], np.nan)
    return out.astype({'dcount': int})

In [None]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    elements where both values are exactly 0 contribute 0.

    Parameters
    ----------
    y_true, y_pred : array-like
        Same number of elements. Lists, arrays, Series and single-column
        frames are accepted; values are compared by position.

    Returns
    -------
    float
        100 * mean of the per-element terms.
    """
    # Flatten to plain float arrays so lists work and pandas objects are not
    # aligned on their index labels.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    num = np.abs(y_true - y_pred)
    dem = (np.abs(y_true) + np.abs(y_pred)) / 2

    smap = np.zeros(num.shape)
    # Only divide where the denominator is non-zero; the rest stays at 0.
    pos_ind = (y_true != 0) | (y_pred != 0)
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]

    return 100 * np.mean(smap)

def clip_iqr(df: pd.DataFrame = None):
    """Return the 1.5 * IQR outlier fences of 'microbusiness_density_normalized'.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the column 'microbusiness_density_normalized'.

    Returns
    -------
    tuple
        ``(lower, upper)`` where lower = Q1 - 1.5 * IQR and upper = Q3 + 1.5 * IQR.
    """
    values = df.loc[:, 'microbusiness_density_normalized']
    q1 = values.quantile(.25)
    q3 = values.quantile(.75)
    iqr = q3 - q1

    lower_threshold = q1 - (1.5 * iqr)
    upper_threshold = q3 + (1.5 * iqr)

    return lower_threshold, upper_threshold

In [None]:
import warnings
# NOTE(review): this silences *every* warning for the rest of the session.
warnings.filterwarnings('ignore')

df = train_test.set_index('row_id')[['cfips', 'county', 'state', 'tr_te_spl', 'microbusiness_density_normalized', 'active']]

# Build the lag/lead features county by county.
# group_keys=False keeps `row_id` as the only index level; on pandas >= 2.0 the
# default would prepend `cfips` to the index, which clashes with the `cfips`
# column when the index is reset later on.
df = df.groupby('cfips', group_keys=False).apply(Features)

# Identifier-like columns are stored as pandas categoricals.
df = df.astype({
    'cfips': 'category',
    'county': 'category',
    'state': 'category',
})

In [None]:
import catboost as cat

def get_model():
    """Create a new, unfitted CatBoost regressor with the notebook's fixed settings.

    The categorical feature list is read from the module-level `df`: every
    column whose dtype is 'category'.
    """
    categorical_columns = df.select_dtypes(include = ['category']).columns
    # Disabled alternatives kept for reference:
    #   task_type='GPU', bootstrap_type='Poisson'
    return cat.CatBoostRegressor(
        iterations=1200,
        loss_function='MAPE',
        verbose=0,
        learning_rate=.075,
        l2_leaf_reg=.2,
        subsample=.5,
        max_bin=4096,
        cat_features=list(categorical_columns),
    )

In [None]:
# One row per county; the forecast columns are added to this frame later.
predictions = pd.DataFrame({'cfips': df.cfips.unique()})

# Lead (future) columns are the ones whose name contains a literal '+'.
# regex=False matches the character itself and avoids the invalid '\+' escape
# sequence in a non-raw string.
target_columns = df.columns.str.contains('+', regex=False)

# X: everything except the lead columns.  Y: lead columns plus dcount / cfips.
X = df.iloc[:, ~target_columns]
Y = df.iloc[:, (target_columns | df.columns.str.contains('dcount') | df.columns.str.contains('cfips'))]

In [None]:
# Largest 0-based row position of a training row within a county, and the
# largest position present in the feature table.
n_ts = train.groupby(by = 'cfips').cumcount().max()
last_ts = df['dcount'].max()

# Train on rows strictly before n_ts with the next row's value ('mbd_t+1') as
# target; the row at n_ts is the starting point of the recursive forecast.
X_tr = X[X.dcount < n_ts]
y_tr = Y[Y.dcount < n_ts][['mbd_t+1']]
X_te = X[X.dcount == n_ts]

model = get_model()
model.fit(X_tr, y_tr)
y_pred = model.predict(X_te)  # one-step-ahead forecast, i.e. for step n_ts + 1

# Columns holding past/current values, oldest first ('mbd_t-4', ..., 'mbd_t0').
lag_cols = [f'mbd_t{i}' for i in sorted(i for i in lags if i <= 0)]

X_te_aux = X_te.copy()
for ts in range(n_ts + 1, last_ts + 1):
    # y_pred is the forecast for step `ts`: store it BEFORE rolling forward.
    # (Predicting first and storing afterwards would file every forecast one
    # step too early and drop the very first one.)
    predictions[f't+{ts}'] = y_pred

    if ts == last_ts:
        break  # nothing further to forecast

    # Roll the feature row forward so that it describes step `ts`:
    # every lag moves one slot back and the newest forecast becomes "current".
    X_te_aux[lag_cols[:-1]] = X_te_aux[lag_cols[1:]].values
    X_te_aux['mbd_t0'] = y_pred
    # In training this column is identical to 'mbd_t0' and dcount is the row
    # position, so keep both in step with the rolled-forward state.
    X_te_aux['microbusiness_density_normalized'] = y_pred
    X_te_aux['dcount'] = ts

    y_pred = model.predict(X_te_aux)

In [None]:
# Rows to be predicted. Take an explicit copy so the assignments below write
# to an independent frame rather than to a filtered slice of `df`.
submission = df[df.tr_te_spl == 1].copy()

# Column 't+{ts}' of `predictions` has one value per county; it is written
# positionally into the rows of step `ts`, so both are expected to list the
# counties in the same order.
for ts in range(n_ts + 1, df['dcount'].max() + 1):
    submission.loc[submission.dcount == ts, 'microbusiness_density_normalized'] = predictions[f't+{ts}'].to_numpy()

In [None]:
# Per-county min / max of the raw density, computed in one pass instead of
# re-filtering the whole of train_test once per county.
bounds = train_test.groupby('cfips')['microbusiness_density'].agg(['min', 'max'])

# `cfips` may be categorical in `submission`; cast it back to the key dtype so
# the lookups below return plain numeric Series aligned with `submission`.
county_keys = submission['cfips'].astype(bounds.index.dtype)

# `denormalizer` reads the module-level min_val / max_val; here they are
# Series aligned row-by-row with `submission`, so every row is rescaled with
# the bounds of its own county.
min_val = county_keys.map(bounds['min'])
max_val = county_keys.map(bounds['max'])
submission['microbusiness_density'] = denormalizer(submission['microbusiness_density_normalized'])

In [None]:
# Bring row_id back as a column and copy the predictions into the submission
# template (which is indexed by row_id at this point).
submission = submission.reset_index()
m = submission['row_id'].tolist()
predicted_density = submission.set_index('row_id')['microbusiness_density']
sample_submission.loc[m, 'microbusiness_density'] = predicted_density.loc[m]
sample_submission.reset_index(inplace = True)

display(sample_submission.head())

column_names = ['GEO_ID','NAME','S0101_C01_026E']
df2021 = pd.read_csv('/kaggle/input/census-data-for-godaddy/ACSST5Y2021.S0101-Data.csv', usecols=column_names)
# Skip the first data row (presumably a second, descriptive header line --
# confirm against the file). Copy so the column assignments below do not
# target a slice of the frame that was read.
df2021 = df2021.iloc[1:].copy()
df2021['S0101_C01_026E'] = df2021['S0101_C01_026E'].astype('int')
# The county code is the part of GEO_ID that follows 'US'.
df2021['cfips'] = df2021.GEO_ID.apply(lambda x: int(x.split('US')[-1]))
# cfips -> S0101_C01_026E (presumably the adult population estimate -- confirm).
adult2021 = df2021.set_index('cfips').S0101_C01_026E.to_dict()

# row_id has the form '<cfips>_<...>'; the part before '_' is the county code.
sample_submission['cfips'] = sample_submission['row_id'].apply(lambda val: val.split('_')[0]).astype(int)
sample_submission['adult2021'] = sample_submission['cfips'].map(adult2021)

# Convert the density to a count (density * adults / 100), round it to a whole
# number and convert back.
density = sample_submission['microbusiness_density']
adults = sample_submission['adult2021']
rounded_density = np.round(density * adults / 100) / adults * 100
# Counties missing from the census table (NaN) or with zero adults would turn
# the prediction into NaN/inf; keep the unrounded value for those rows.
sample_submission['microbusiness_density'] = rounded_density.where(adults > 0, density)
sample_submission = sample_submission[['row_id', 'microbusiness_density']]

In [None]:
# Write the final table to 'submission.csv' without the DataFrame index.
sample_submission.to_csv('submission.csv', index=False)