In [246]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBClassifier

In [247]:
from k_means_constrained import KMeansConstrained

ValueError: numpy.ndarray size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [4]:
TRAIN_FILE = 'data/train.csv'
TEST_FILE = 'data/test.csv'
REVEALED_TEST_FILE = 'data/revealed_test.csv'
CENSUS_FILE = 'data/census_starter.csv'

In [5]:
def load_data(file_path):
    df = pd.read_csv(file_path)
    return df

def one_hot(df):
    columnsToEncode = list(df.select_dtypes(include=['category','object']))
    for each in columnsToEncode:
        df=pd.concat([df,pd.get_dummies(df[each],prefix=each, drop_first=True)],axis=1).drop([each],axis=1)
    return df

def fill_na(df):
    columnsToEncode = list(df.select_dtypes(include=['category','object']))
    for each in columnsToEncode:
        df[each] = df[each].fillna(df[each].mode().iloc[0])
    df = df.fillna(df.median())
    return df

In [233]:
train_raw = load_data(TRAIN_FILE)
test_raw = load_data(REVEALED_TEST_FILE)
census_raw = load_data(CENSUS_FILE)
all_data = pd.concat([train_raw, test_raw])
NUM_TRAIN = round(len(all_data)*0.8)
train_raw = all_data[:NUM_TRAIN]
test_raw = all_data[NUM_TRAIN:]

In [242]:
census_raw = pd.merge(left = all_data, right = census_raw, how = 'left')
census_raw = census_raw.drop(['row_id', 'first_day_of_month', 'microbusiness_density', 'active', 'state'], axis = 1).groupby(['county']).first()
census_raw = census_raw.fillna(0)
from sklearn.cluster import KMeans
N_CLUSTERS = 20
kmeans = KMeans(n_clusters=N_CLUSTERS).fit_predict(census_raw)
clusters = []
for cluster in range(N_CLUSTERS):
    clusters += [list(census_raw.index[np.where(kmeans == cluster)])]
print(clusters)

[['Acadia Parish', 'Alcona County', 'Alcorn County', 'Alpena County', 'Appanoose County', 'Arenac County', 'Aroostook County', 'Ashe County', 'Assumption Parish', 'Audrain County', 'Avery County', 'Avoyelles Parish', 'Baraga County', 'Barren County', 'Bath County', 'Blackford County', 'Bollinger County', 'Bourbon County', 'Bronx County', 'Caddo Parish', 'Caldwell Parish', 'Calloway County', 'Carlisle County', 'Carter County', 'Catahoula Parish', 'Catron County', 'Chautauqua County', 'Chouteau County', 'Cibola County', 'Clare County', 'Copiah County', 'De Soto Parish', 'Deer Lodge County', 'Delaware County', 'Dent County', 'DoÃ±a Ana County', 'Dunklin County', 'Duplin County', 'Edmonson County', 'Elk County', 'Fleming County', 'Forrest County', 'Garden County', 'Gladwin County', 'Gogebic County', 'Golden Valley County', 'Graves County', 'Grayson County', 'Greenwood County', 'Grenada County', 'Hickman County', 'Hidalgo County', 'Hinds County', 'Hooker County', 'Houghton County', 'Howell 

[['Acadia Parish', 'Alamosa County', 'Alcona County', 'Alcorn County', 'Alexander County', 'Alleghany County', 'Allen Parish', 'Allendale County', 'Alpena County', 'Amite County', 'Apache County', 'Appanoose County', 'Appling County', 'Arenac County', 'Aroostook County', 'Ashley County', 'Assumption Parish', 'Atkinson County', 'Attala County', 'Avoyelles Parish', 'Baca County', 'Bacon County', 'Barbour County', 'Barren County', 'Bath County', 'Baxter County', 'Bell County', 'Ben Hill County', 'Bent County', 'Berrien County', 'Bertie County', 'Bienville Parish', 'Blackford County', 'Bladen County', 'Bolivar County', 'Boone County', 'Boundary County', 'Bourbon County', 'Bradley County', 'Brantley County', 'Breathitt County', 'Brooks County', 'Bullock County', 'Burke County', 'Butler County', 'Caddo Parish', 'Caldwell Parish', 'Calloway County', 'Camas County', 'Candler County', 'Carlisle County', 'Carter County', 'Casey County', 'Catahoula Parish', 'Chambers County', 'Charlton County', '

In [243]:
for cluster in clusters:
    train_df = train_raw[train_raw['county'].isin(cluster)]
    train_df=train_df.groupby(['county', 'first_day_of_month']).first()
    train_mean = train_df['microbusiness_density'].mean()
    train_df['microbusiness_density-3'] = train_df['microbusiness_density'].shift(1, fill_value=train_mean)
    train_df['microbusiness_density-2'] = train_df['microbusiness_density'].shift(2, fill_value=train_mean)
    train_df['microbusiness_density-1'] = train_df['microbusiness_density'].shift(3, fill_value=train_mean)

    test_df = test_raw[test_raw['county'].isin(cluster)]
    test_df=test_df.groupby(['county', 'first_day_of_month']).first()
    test_mean = test_df['microbusiness_density'].mean()
    test_df['microbusiness_density-3'] = test_df['microbusiness_density'].shift(1, fill_value=test_mean)
    test_df['microbusiness_density-2'] = test_df['microbusiness_density'].shift(2, fill_value=test_mean)
    test_df['microbusiness_density-1'] = test_df['microbusiness_density'].shift(3, fill_value=test_mean)

    import numpy as np
    from sklearn.linear_model import LinearRegression
    X = np.zeros((len(train_df), 3))
    Y = np.zeros((len(train_df),))
    X[:, 0] = train_df['microbusiness_density-3']
    X[:, 1] = train_df['microbusiness_density-2']
    X[:, 2] = train_df['microbusiness_density-1']
    Y[:] = train_df['microbusiness_density']
    test_X = np.zeros((len(test_df), 3))
    test_Y = np.zeros((len(test_df),))
    test_X[:, 0] = test_df['microbusiness_density-3']
    test_X[:, 1] = test_df['microbusiness_density-2']
    test_X[:, 2] = test_df['microbusiness_density-1']
    test_Y[:] = test_df['microbusiness_density']
    reg = GradientBoostingRegressor(n_estimators = 10).fit(X, Y)
    print(reg.score(X, Y))
    print(reg.score(test_X, test_Y))

0.8248486509697642
0.6759524317186393
0.828402190056194
0.49466096248371794
0.8358358036994769
-0.7326765880559494
0.8687487133325877
0.49813242843819017
0.840638997188922
0.37807802240382504
0.8202647811369912
0.704387548520325
0.8458008708097438
0.2988347214295014
0.8418645536921793
0.5315525168587674
0.8151061065141901
0.36487893659584547
0.8355135301623865
0.09755723440264663
0.806936086554982
0.5061143280890023
0.7974808737101328
0.5914436660345268
0.8154202505369239
0.781714844237513
0.8185865784874707
0.7521978281229479
0.8299179307293104
0.6795517197041788
0.8382717178853412
0.7294718938781699
0.8444171625900075
0.7504629154612688


ValueError: Found array with 0 sample(s) (shape=(0, 3)) while a minimum of 1 is required.

In [241]:
train_df=train_raw.groupby(['county', 'first_day_of_month']).first()
train_mean = train_df['microbusiness_density'].mean()
train_df['microbusiness_density-3'] = train_df['microbusiness_density'].shift(1, fill_value=train_mean)
train_df['microbusiness_density-2'] = train_df['microbusiness_density'].shift(2, fill_value=train_mean)
train_df['microbusiness_density-1'] = train_df['microbusiness_density'].shift(3, fill_value=train_mean)
# c=set(train_raw.groupby(['county']).first().index)
test_df=test_raw.groupby(['county', 'first_day_of_month']).first()
test_mean = test_df['microbusiness_density'].mean()
test_df['microbusiness_density-3'] = test_df['microbusiness_density'].shift(1, fill_value=test_mean)
test_df['microbusiness_density-2'] = test_df['microbusiness_density'].shift(2, fill_value=test_mean)
test_df['microbusiness_density-1'] = test_df['microbusiness_density'].shift(3, fill_value=test_mean)

import numpy as np
from sklearn.linear_model import LinearRegression
X = np.zeros((62190, 3))
Y = np.zeros((62190,))
X[:, 0] = train_df['microbusiness_density-3']
X[:, 1] = train_df['microbusiness_density-2']
X[:, 2] = train_df['microbusiness_density-1']
Y[:] = train_df['microbusiness_density']
test_X = np.zeros((21229, 3))
test_Y = np.zeros((21229,))
test_X[:, 0] = test_df['microbusiness_density-3']
test_X[:, 1] = test_df['microbusiness_density-2']
test_X[:, 2] = test_df['microbusiness_density-1']
test_Y[:] = test_df['microbusiness_density']
reg = GradientBoostingRegressor(n_estimators = 30).fit(X, Y)
print(reg.score(X, Y))
print(reg.score(test_X, test_Y))

0.9490150207627007
0.8076974185143109


In [227]:
sum(train_raw['county'].isin(clusters[4]))

15546

In [171]:
import numpy as np
from sklearn.linear_model import LinearRegression
X = np.zeros((62190, 3))
Y = np.zeros((62190,))
X[:, 0] = train_df['microbusiness_density-3']
X[:, 1] = train_df['microbusiness_density-2']
X[:, 2] = train_df['microbusiness_density-1']
Y[:] = train_df['microbusiness_density']
test_X = np.zeros((21229, 3))
test_Y = np.zeros((21229,))
test_X[:, 0] = test_df['microbusiness_density-3']
test_X[:, 1] = test_df['microbusiness_density-2']
test_X[:, 2] = test_df['microbusiness_density-1']
test_Y[:] = test_df['microbusiness_density']
reg = GradientBoostingRegressor(n_estimators = 800).fit(X, Y)
print(reg.score(X, Y))
print(reg.score(test_X, test_Y))

0.9841958289091283
0.8087861628572404


In [117]:
for i in range(0, 72969, 39):
    a.loc[a.index[i+1:i+39]]['microbusiness_density-1'] = a.loc[a.index[i:i+38]]['microbusiness_density']

In [120]:
a['microbusiness_density-1']

county            first_day_of_month
Abbeville County  2019-08-01            1.335653
                  2019-09-01            1.197482
                  2019-10-01            1.258892
                  2019-11-01            1.228187
                  2019-12-01            1.217952
                                          ...   
Ziebach County    2022-06-01            0.354970
                  2022-07-01            0.304260
                  2022-08-01            0.304260
                  2022-09-01            0.304260
                  2022-10-01            0.304260
Name: microbusiness_density-1, Length: 72969, dtype: float64

In [23]:
counties = set(train_raw['county'])
county_statistics = {}
for c in counties:
    county_statistics[c] = {}
    mean = train_raw[c].mean()
    county_statistics[c]['mean'] = mean
    county_statistics[c]['var'] = train_raw[c].var()
    p95 = train_raw[c].quantile(0.95)
    p05 = train_raw[c].quantile(0.05)
    county_statistics[c]['p95'] = p95
    county_statistics[c]['p05'] = p05
    county_statistics[c]['p75'] = (train_raw[c].quantile(0.75) - mean)/ (p95 - mean)
    county_statistics[c]['p25'] = (mean - train_raw[c].quantile(0.25))/ (mean - p05)
new_knn_all_data = 

KeyError: 'Lassen County'

In [17]:
from sklearn.neighbors import KNeighborsRegressor
knn_train = pd.merge(left = train_raw, right = census_raw, how = 'left')
knn_test = pd.merge(left = test_raw, right = census_raw, how = 'left')
knn_all_data = pd.concat([knn_train, knn_test])
knn_all_data = knn_all_data.drop(['row_id'], axis = 1)
knn_all_data = knn_all_data.drop(['first_day_of_month'], axis = 1)
knn_all_data = fill_na(knn_all_data)
knn_all_data = one_hot(knn_all_data)
NUM_TRAIN = round(len(knn_all_data)*0.8)
knn_train = knn_all_data[:NUM_TRAIN]
knn_test = knn_all_data[NUM_TRAIN:]

X_knn_train = knn_train.drop(['microbusiness_density'], axis = 1)
y_knn_train = knn_train['microbusiness_density']
X_knn_test = knn_test.drop(['microbusiness_density'], axis = 1)
y_knn_test = knn_test['microbusiness_density']

  df = df.fillna(df.median())


In [None]:
knn_

In [18]:
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X_knn_train, y_knn_train)
knn.score(X_knn_test,y_knn_test)

-4.939347827427617