In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBClassifier

## Constants

In [2]:
# Relative paths of the CSV inputs used by this notebook.
TRAIN_FILE = "data/train.csv"
REVEALED_TEST_FILE = "data/revealed_test.csv"  # loaded below as the test split
TEST_FILE = "data/test.csv"  # not read by any cell visible in this section
CENSUS_FILE = "data/census_starter.csv"  # left-merged onto both splits

## Utilities

In [3]:
def load_data(file_path):
    """Read the CSV at ``file_path`` with pandas' default options.

    Args:
        file_path: Anything ``pd.read_csv`` accepts (path string, file-like, ...).

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(file_path)

def one_hot(df):
    """One-hot encode every ``category``/``object`` column of ``df``.

    Each encoded column is replaced by indicator columns named
    ``<column>_<level>``; the first level of every column is dropped
    (``drop_first=True``) so the indicators are not perfectly collinear.
    Columns of other dtypes are kept unchanged and come first in the result,
    followed by the indicator columns in the order of the encoded columns.

    Args:
        df: Input ``pandas.DataFrame``. It is not modified.

    Returns:
        A ``pandas.DataFrame`` with the categorical columns expanded.
    """
    columns_to_encode = list(df.select_dtypes(include=['category', 'object']))
    # A single get_dummies call encodes all columns at once instead of
    # re-concatenating (and re-copying) the whole frame once per column.
    return pd.get_dummies(df, columns=columns_to_encode, drop_first=True)

def fill_na(df):
    """Impute missing values: mode for categorical columns, median for numeric.

    ``category``/``object`` columns are filled with their most frequent value
    (the first one when several are tied). Remaining missing values in numeric
    columns are filled with the column median. A categorical column that is
    entirely missing has no mode and is left untouched.

    Args:
        df: Input ``pandas.DataFrame``. It is not modified; a copy is filled.

    Returns:
        A new ``pandas.DataFrame`` with the imputed values.
    """
    # Work on a copy so the caller's frame is not half-mutated as a side effect.
    df = df.copy()
    categorical_columns = list(df.select_dtypes(include=['category', 'object']))
    for column in categorical_columns:
        modes = df[column].mode()
        # mode() is empty when every value is missing; .iloc[0] would raise.
        if not modes.empty:
            df[column] = df[column].fillna(modes.iloc[0])
    # numeric_only=True: since pandas 2.0 median() raises TypeError if any
    # non-numeric column is present instead of silently skipping it.
    return df.fillna(df.median(numeric_only=True))

## Load Data

In [4]:
# Read the three inputs in one pass; the revealed test file is used as the test split.
train_raw, test_raw, census_raw = (
    load_data(csv_path)
    for csv_path in (TRAIN_FILE, REVEALED_TEST_FILE, CENSUS_FILE)
)

In [5]:
# Attach the census columns to each split. No join key is given, so pandas
# joins on every column name the two frames have in common.
train = train_raw.merge(census_raw, how='left')
test = test_raw.merge(census_raw, how='left')
# Stack train on top of test so later cleaning/encoding sees one set of columns.
all_data = pd.concat([train, test])

## Data Cleaning

In [7]:
# Remove the row_id column from the combined frame.
all_data = all_data.drop(columns=['row_id'])

## Feature Engineering

In [11]:
# Row counts of the two splits; NUM_TRAIN is used later to cut all_data back apart.
NUM_TRAIN, NUM_TEST = train.shape[0], test.shape[0]

In [14]:
# Imputation, then one-hot encoding, on the combined train+test frame.
all_data = one_hot(fill_na(all_data))

# Flag rows that still contain NaN or +/-inf after imputation.
# `axis` must be passed by keyword: pandas >= 2.0 rejects the positional `.any(1)`.
row_is_clean = (~all_data.isin([np.nan, np.inf, -np.inf]).any(axis=1)).to_numpy()

# Decide train/test membership from the row positions BEFORE any row is dropped.
# Filtering first and then slicing `[:NUM_TRAIN]` would shift test rows into the
# training split whenever a training row is removed.
row_is_train = np.arange(len(all_data)) < NUM_TRAIN

train = all_data[row_is_clean & row_is_train]
test = all_data[row_is_clean & ~row_is_train]
all_data = all_data[row_is_clean]

# GBM

## Train

In [15]:
# Split each frame into the target column and the remaining feature columns.
y_train = train['microbusiness_density']
X_train = train.drop(columns=['microbusiness_density'])
y_test = test['microbusiness_density']
X_test = test.drop(columns=['microbusiness_density'])

In [16]:
# Hyper-parameters handed to GradientBoostingRegressor in the training cell.
N_ESTIMATORS = 200    # passed as n_estimators
LEARNING_RATE = 0.01  # passed as learning_rate
MAX_DEPTH = 5         # passed as max_depth
SUBSAMPLE = 0.8       # passed as subsample

In [17]:
# Fixed seed: with SUBSAMPLE < 1 every tree is fit on a random subset of the
# rows, so an unseeded model produces different results on each run.
RANDOM_STATE = 42

gradient_booster = GradientBoostingRegressor(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    subsample=SUBSAMPLE,
    learning_rate=LEARNING_RATE,
    random_state=RANDOM_STATE,
)
gradient_booster.fit(X_train, y_train)

KeyboardInterrupt: 

## Inference

In [None]:
# Predict the target for every row of the test features with the fitted model.
test_predictions = gradient_booster.predict(X=X_test)

In [None]:
# The target (microbusiness_density) is continuous and the model is a regressor,
# so ROC/AUC do not apply: roc_auc_score and roc_curve reject continuous labels
# and a regressor has no predict_proba. Report regression errors instead.
actual = y_test.to_numpy()
residuals = actual - test_predictions
test_mae = np.mean(np.abs(residuals))
test_rmse = np.sqrt(np.mean(residuals ** 2))
print('MAE is', test_mae)
print('RMSE is', test_rmse)

# Predicted vs. actual: points on the dashed diagonal are perfect predictions.
fig, ax = plt.subplots()
ax.scatter(actual, test_predictions, s=5, alpha=0.3)
low = float(min(actual.min(), test_predictions.min()))
high = float(max(actual.max(), test_predictions.max()))
ax.plot([low, high], [low, high], linestyle='--', color='grey')
ax.set_xlabel('Actual microbusiness density')
ax.set_ylabel('Predicted microbusiness density')
ax.set_title('GBM: predicted vs. actual on the test split')
plt.show()

# 5-NN

You will probably need to standardize (scale) the features for KNN, since it is distance-based and features with large ranges would otherwise dominate.