In this notebook, I combine spatial encoding with oversampling, trying both random oversampling and SMOTE. This is still a classification approach.

In [1]:
import pandas as pd

# Read the train and test splits from CSV in one pass; the generator is
# unpacked positionally, so the order of paths must match (train, test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train_with_rounded_score.csv',
        '../data/test_with_rounded_score.csv',
    )
)

In [2]:
import json

# Load the FIPS lookup table from JSON. The context manager closes the file
# handle as soon as parsing finishes instead of leaving an open descriptor
# around for the lifetime of the kernel.
with open('../data/fips_map.json') as fips_file:
    fips_map = json.load(fips_file)

In [3]:
from functools import lru_cache

@lru_cache(maxsize=10000)
def fips_to_coordinate(fips_code):
    """Return ``[lat, long]`` for a FIPS code using the ``fips_map`` lookup.

    The code is converted to ``str`` before the lookup, so integer and string
    inputs resolve to the same ``fips_map`` key. A code that is not a key of
    ``fips_map`` yields ``[None, None]``.

    Results are memoised (up to 10000 distinct arguments); a cache hit hands
    back the very same list object, so callers should not mutate the result.
    """
    key = str(fips_code)

    # Guard clause: unknown codes get a placeholder pair rather than an error.
    if key not in fips_map:
        return [None, None]

    entry = fips_map[key]
    return [entry['lat'], entry['long']]

In [4]:
# Geocode every FIPS code in train and test. Each result is a Series whose
# elements are the [lat, long] lists returned by fips_to_coordinate.
train_coords, test_coords = (
    split['fips'].apply(fips_to_coordinate)
    for split in (train, test)
)

In [5]:
# Expand each [lat, long] pair into its own column. Both frames share the
# same column labels, so those are spelled out once.
coord_columns = ['lat', 'long']
train_coords = pd.DataFrame(train_coords.tolist(), columns=coord_columns)
test_coords = pd.DataFrame(test_coords.tolist(), columns=coord_columns)

In [6]:
# Attach the lat/long columns to each split, then remove the raw 'fips'
# identifier and the 'date' column in the same chained expression.
# Column-wise concat aligns rows on index labels.
# NOTE(review): assumes train/test still carry the default positional index
# that the coordinate frames have — confirm if any filtering is added above.
# This cell is not re-runnable: a second run fails because 'fips' is gone.
train = pd.concat([train, train_coords], axis=1).drop(['fips', 'date'], axis=1)
test = pd.concat([test, test_coords], axis=1).drop(['fips', 'date'], axis=1)

In [7]:
# oversample
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Both samplers are seeded so the resampled sets are reproducible.
r = RandomOverSampler(random_state=42)
s = SMOTE(random_state=42)

# Split features from the 'score' target once and feed the same pair to
# both samplers.
train_features = train.drop(['score'], axis=1)
train_target = train['score']

trainX_randOS, trainY_randOS = r.fit_resample(train_features, train_target)
trainX_smote, trainY_smote = s.fit_resample(train_features, train_target)

# Release the intermediate copies; only the resampled outputs are used later.
del train_features, train_target

In [8]:
# Re-wrap the resampled outputs as pandas objects labelled with the original
# feature column names (every training column except 'score').
feature_columns = train.drop(['score'], axis=1).columns

trainX_randOS = pd.DataFrame(trainX_randOS, columns=feature_columns)
trainX_smote = pd.DataFrame(trainX_smote, columns=feature_columns)

trainY_randOS = pd.Series(trainY_randOS)
trainY_smote = pd.Series(trainY_smote)

In [9]:
# Sanity check: show the per-class counts of both resampled targets,
# random oversampling first, then SMOTE, one table after the other.
print(trainY_randOS.value_counts(), trainY_smote.value_counts(), sep='\n')

1    1652230
2    1652230
3    1652230
4    1652230
5    1652230
0    1652230
Name: score, dtype: int64
1    1652230
2    1652230
3    1652230
4    1652230
5    1652230
0    1652230
Name: score, dtype: int64


In [None]:
from catboost import CatBoostClassifier
import pickle

# Fit a CatBoost classifier (library defaults) on the randomly oversampled
# training set.
cb_rand = CatBoostClassifier()
cb_rand.fit(trainX_randOS, trainY_randOS)

# Persist the fitted model. pickle.dump writes to a file object; the previous
# pickle.dumps call took the file as its `protocol` argument, which raises
# TypeError and writes nothing. The context manager also guarantees the file
# is flushed and closed.
with open('catboost_spatial_randomoversampling.pkl', 'wb') as model_file:
    pickle.dump(cb_rand, model_file)

In [None]:
# Fit a second CatBoost classifier (library defaults) on the SMOTE-resampled
# training set.
cb_smote = CatBoostClassifier()
cb_smote.fit(trainX_smote, trainY_smote)

# Persist the SMOTE model. Two fixes versus the previous version:
#   * serialise cb_smote — the old code pickled cb_rand into the SMOTE file;
#   * use pickle.dump with a file object — pickle.dumps does not accept one.
with open('catboost_spatial_smote.pkl', 'wb') as model_file:
    pickle.dump(cb_smote, model_file)