In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score
from collections import Counter

In [2]:
# Load the project dataset from the notebook's working directory.
train = pd.read_csv(filepath_or_buffer='ProjData.csv')

In [3]:
# Impute missing values with each column's median. numeric_only=True keeps
# this working on pandas >= 2.0 when the CSV contains non-numeric columns
# (there DataFrame.median() raises TypeError instead of skipping them);
# non-numeric columns are simply left un-imputed.
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split further down, and missing `net_mig` values are imputed
# too — confirm that is intended.
train = train.fillna(train.median(numeric_only=True))

In [4]:
# Turn the target into a boolean label: True where net migration is strictly
# positive, False where it is zero or negative.
train['net_mig'] = train['net_mig'].gt(0)

In [None]:
# Tally how many rows fall into each target class.
Counter(train['net_mig'].tolist())

Counter({True: 11485, False: 13638})

As the counter shows, our data is roughly balanced between positive net migration (11,485 rows, about 46%) and zero-or-negative net migration (13,638 rows, about 54%). Because neither class dominates, we should not have to overweight either category to make up for unbalanced data.

In [None]:
# Separate the boolean target from the feature matrix.
y = train['net_mig']
X = train.drop(columns='net_mig')

In [None]:
# First split: hold out 20% of all rows as the final test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=99
)
# Second split: carve 20% of the remaining rows off as a validation set,
# leaving 64% / 16% / 20% of the data for fitting / validation / test.
x_tr, x_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=99
)

In [None]:
# Single-point grid: GridSearchCV is used here only to get a 4-fold
# cross-validated F1 score for this one configuration.
param_dict = {"n_estimators": [50], "max_depth": [15]}
# Seed the booster so that re-running the notebook refits the same model;
# 99 matches the seed already used for the train/test splits.
gb = GradientBoostingClassifier(loss = 'exponential', random_state = 99)
gs = GridSearchCV(gb, param_dict, scoring = 'f1', cv = 4)
gs.fit(x_tr, y_tr)

In [None]:
# Predict on the fitting split and on the held-out validation split first,
# then score both so the two F1 values can be compared side by side.
train_predictions = gs.predict(x_tr)
val_predictions = gs.predict(x_val)

train_f1 = f1_score(y_true=y_tr, y_pred=train_predictions)
val_f1 = f1_score(y_true=y_val, y_pred=val_predictions)

# Training F1 on the first line, validation F1 on the second.
print(train_f1, val_f1, sep='\n')

In [None]:
# Display the estimator selected by the grid search. The grid above contains
# a single parameter combination, so this is that one configuration.
gs.best_estimator_

In [None]:
# Final check: F1 on the test split that was held out by the first
# train_test_split call.
test_predictions = gs.predict(X_test)
test_f1 = f1_score(y_true=y_test, y_pred=test_predictions)
print(test_f1)