In [1]:
# dependencies
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import sklearn.metrics
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

In [2]:
# function to time how long the parameter opt goes
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [3]:
# read in county data with pandas
df2016 = pd.read_csv('../Data/Final Data/data2016.csv')
df2019 = pd.read_csv('../Data/Final Data/data2019.csv')
df2016.head()

Unnamed: 0,State,ST,FIPS,County,Democrats 2016,Republicans 2016,Green 2016,Libertarians 2016,winner,poor_health,...,unemployment,children_poverty,income_inequality,single_parent,violent_crime,air_polution,non_hispanic_african_american,non_hispanic_white,not_proficient_english,household_income
0,Alabama,AL,1001,"Autauga County, Alabama",0.239569,0.734358,0.004258,0.021816,republican,0.194,...,0.059,0.181,4.303,0.273,253.645,12.92,0.185,0.756,0.005,54366
1,Alabama,AL,1003,"Baldwin County, Alabama",0.195653,0.773515,0.004815,0.026018,republican,0.16,...,0.061,0.198,4.495,0.282,220.665,13.13,0.094,0.83,0.014,49626
2,Alabama,AL,1005,"Barbour County, Alabama",0.466603,0.522714,0.001732,0.008951,republican,0.257,...,0.108,0.381,5.286,0.545,146.889,12.62,0.472,0.466,0.022,34971
3,Alabama,AL,1007,"Bibb County, Alabama",0.21422,0.769662,0.001943,0.014175,republican,0.22,...,0.071,0.268,4.251,0.32,235.952,12.87,0.219,0.745,0.005,39546
4,Alabama,AL,1009,"Blount County, Alabama",0.084699,0.898519,0.003506,0.013276,republican,0.207,...,0.061,0.241,4.124,0.283,219.034,12.66,0.016,0.878,0.018,45567


In [4]:
# assign inputs and output values
X = df2016.iloc[:,9:]
y = df2016.iloc[:,8]

inputs = df2019.iloc[:,4:]

In [5]:
# scikitlearn dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, random_state=42)

from tensorflow.keras.utils import to_categorical
# One-hot encoding
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

In [6]:
# A parameter grid for XGBoost
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [7]:
# setting up model
xgb = XGBClassifier(learning_rate=0.05, n_estimators=300, objective='binary:logistic',
                    silent=True, nthread=1)

In [8]:
# simple paramter tuning to start off
folds = 5
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='accuracy', n_jobs=4, cv=skf.split(X_train,y_train), verbose=3, random_state=1001 )

# Here we go
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X_train, y_train)
timer(start_time) # timing ends here for "start_time" variable

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    7.7s finished



 Time taken: 0 hours 0 minutes and 8.8 seconds.


In [9]:
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)


 All results:
{'mean_fit_time': array([0.75678191, 0.99836626, 0.94766092, 0.8206161 , 1.01570425]), 'std_fit_time': array([0.03371489, 0.03314471, 0.01345629, 0.0155626 , 0.09484075]), 'mean_score_time': array([0.00258613, 0.0065659 , 0.00738783, 0.003795  , 0.00578036]), 'std_score_time': array([0.0004981 , 0.00047718, 0.0007854 , 0.00073884, 0.00097416]), 'param_subsample': masked_array(data=[1.0, 0.6, 0.8, 1.0, 0.8],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_min_child_weight': masked_array(data=[5, 1, 5, 5, 1],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_max_depth': masked_array(data=[3, 5, 5, 5, 4],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_gamma': masked_array(data=[5, 1.5, 1, 5, 1],
             mask=[False, False, False, False, False],
       fill_value='?',
            dt

In [10]:
# fitting grid search
grid = GridSearchCV(estimator=xgb, param_grid=params, scoring='accuracy', n_jobs=4, cv=skf.split(X_train,y_train), verbose=3 )
grid.fit(X_train, y_train)

print('\n Best parameters:')
print(grid.best_params_)
results = pd.DataFrame(grid.cv_results_)
results.to_csv('xgb-grid-search-results-01.csv', index=False)

y_test1 = grid.best_estimator_.predict_proba(inputs)
results_df = pd.DataFrame(data={'id':test_df['id'], 'target':y_test1[:,1]})
results_df.to_csv('submission-grid-search-xgb-porto-01.csv', index=False)

Fitting 5 folds for each of 405 candidates, totalling 2025 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    4.0s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:   24.6s
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done 504 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  3.1min
[Parallel(n_jobs=4)]: Done 1144 tasks      | elapsed:  4.7min
[Parallel(n_jobs=4)]: Done 1560 tasks      | elapsed:  6.8min
[Parallel(n_jobs=4)]: Done 2025 out of 2025 | elapsed:  9.4min finished



 Best parameters:
{'colsample_bytree': 0.6, 'gamma': 1.5, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.8}


NameError: name 'test_df' is not defined

In [11]:
# building model
import xgboost as xgb
model = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=1.5,
              learning_rate=0.05, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=1,
              nthread=1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=True, subsample=0.6, verbosity=1)
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=1.5,
              learning_rate=0.05, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=1,
              nthread=1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=True, subsample=0.6, verbosity=1)

In [12]:
eval_set = [(X_train, y_train), (X_test, y_test)]
model.fit(X_train, y_train, early_stopping_rounds=15, eval_metric=["error", "logloss"], eval_set=eval_set,verbose=False)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=1.5,
              learning_rate=0.05, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=1,
              nthread=1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=True, subsample=0.6, verbosity=1)

In [13]:
# best result
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 94.09%


In [14]:
# save model
model.save_model('CountyVotingPredictor.h5')

In [15]:
results = model.predict(inputs)

In [16]:
results

array([1, 1, 0, ..., 1, 1, 1])

In [17]:
len(results)

3109

In [18]:
sum(results)

2648