## Model Building

In [2]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Basic operations
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt

# Machine learning
import lightgbm as lgb


# Sklearn packages
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score 

import lightgbm as lgb




In [3]:
train = pd.read_csv('/Volumes/Transcend/gwu_course/ml1/project/data_cleaned.csv')

In [38]:
train.head()

Unnamed: 0,AGE,AMONTH,ASOURCE,ATYPE,AWEEKEND,DIED,DISCWT,DISPUNIFORM,DQTR,FEMALE,...,PRDAY10,PRDAY9,PRDAY8,PRDAY7,PRDAY6,PRDAY5,PRDAY4,PRDAY3,PRDAY2,PRDAY1
0,48.0,11.0,2,2,0.0,0.0,4.671227,1.0,1,0.0,...,0.0,0.0,0.0,38.0,29.0,1.0,37.0,37.0,37.0,37.0
1,66.0,11.0,2,2,0.0,0.0,4.671227,6.0,1,0.0,...,0.0,0.0,7.0,7.0,3.0,3.0,2.0,0.0,4.0,4.0
2,53.0,11.0,2,2,0.0,0.0,4.671227,5.0,1,0.0,...,0.0,0.0,0.0,0.0,35.0,7.0,29.0,14.0,2.0,0.0
3,27.0,11.0,2,2,0.0,0.0,4.671227,1.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,39.0,19.0,6.0,18.0
4,48.0,11.0,2,2,0.0,1.0,4.671227,20.0,1,0.0,...,0.0,0.0,10.0,7.0,8.0,8.0,1.0,1.0,0.0,0.0


In [4]:
label = train[["RACE", "ASOURCE", "ATYPE", "TOTCHG", "ZIPINC_QRTL"]]
label.head()


Unnamed: 0,RACE,ASOURCE,ATYPE,TOTCHG,ZIPINC_QRTL
0,1,2,2,272123,3
1,1,2,2,209246,1
2,1,2,2,305474,1
3,1,2,2,202973,1
4,1,2,2,416072,1


In [4]:
label.describe()

Unnamed: 0,RACE,ASOURCE,ATYPE,TOTCHG,ZIPINC_QRTL
count,493420.0,493420.0,493420.0,493420.0,493420.0
mean,1.589334,3.824496,1.827269,22653.9,2.636987
std,1.140275,1.759979,1.003866,38141.53,1.191361
min,1.0,1.0,1.0,101.0,1.0
25%,1.0,1.0,1.0,6069.0,1.0
50%,1.0,5.0,1.0,12000.0,3.0
75%,2.0,5.0,3.0,25152.25,4.0
max,6.0,5.0,6.0,1461234.0,4.0


In [5]:
print(train.shape)
train = train.drop(["RACE", "ASOURCE", "ATYPE", "TOTCHG", "ZIPINC_QRTL"],1)
print(train.shape)

(493420, 111)
(493420, 106)


## RACE

In [6]:
X_train, X_test, y_train, y_test = train_test_split(train, label.RACE, test_size=0.3)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(345394, 106) (345394,)
(148026, 106) (148026,)


In [43]:
clf = RandomForestClassifier()
clf.fit(X_train,y_train)
print("Train score: {}".format(clf.score(X_train,y_train)))
print("Test score: {}".format(clf.score(X_test, y_test)))
print(confusion_matrix(y_test, clf.predict(X_test)))

Train score: 0.9772317990468856
Test score: 0.7443219434423682
[[95447  4725  1002    83     6   289]
 [13993  9634  2043    52     3   242]
 [ 5725  2758  3712    29     3   192]
 [ 1480   354   136    26     0    39]
 [  180    18    23     2     2     1]
 [ 2810  1164   465    28     2  1358]]


In [44]:
def rf_para_search(X_train, y_train):
    from sklearn.model_selection import GridSearchCV
    clf = RandomForestClassifier( max_depth=30)
    param_grid = {"max_depth": [10,20,30,40,50]
    }
    # set different n_estimators in the 
    CV_clf = GridSearchCV(estimator= clf, param_grid=param_grid, cv=5)
    CV_clf.fit(X_train, y_train)
    params = CV_clf.best_params_
    print('best para:', params)
    return params

params = rf_para_search(X_train, y_train)
clf = RandomForestClassifier(max_depth=params["max_depth"])
clf.fit(X_train,y_train)
print("Train score: {}".format(clf.score(X_train,y_train)))
print("Test score: {}".format(clf.score(X_test, y_test)))
print(confusion_matrix(y_test, clf.predict(X_test)))

best para: {'max_depth': 20}
Train score: 0.8605650358720766
Test score: 0.7530771621201681
[[96235  4505   672     9     0   131]
 [13826  9741  2279     1     0   120]
 [ 5765  2337  4184     3     0   130]
 [ 1484   411   111     2     0    27]
 [  187    16    19     0     2     2]
 [ 2752  1285   477     2     0  1311]]


## ASOURCE

In [47]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X_train, y_train, cv=5)
scores.mean()

0.75346125475915771

In [48]:
X_train, X_test, y_train, y_test = train_test_split(train, label.ASOURCE, test_size=0.3)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

clf2 = RandomForestClassifier(max_depth=30)
clf2.fit(X_train,y_train)
print("Train score: {}".format(clf.score(X_train,y_train)))
print("Test score: {}".format(clf.score(X_test, y_test)))
print(confusion_matrix(y_test, clf.predict(X_test)))

(345394, 106) (345394,)
(148026, 106) (148026,)
Train score: 0.20174062085618163
Test score: 0.20488968154243173
[[29550  4969  2636    78    20   521]
 [ 4342   661    70    12     1    46]
 [ 3779   425   101    16     1    20]
 [   38     3     1     0     0     0]
 [78374 14197  6062   196    17  1890]
 [    0     0     0     0     0     0]]


In [50]:
X_train, X_test, y_train, y_test = train_test_split(train, label.ASOURCE, test_size=0.3)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

clf2 = RandomForestClassifier(max_depth=30)
clf2.fit(X_train,y_train)
print("Train score: {}".format(clf.score(X_train,y_train)))
print("Test score: {}".format(clf.score(X_test, y_test)))
print(confusion_matrix(y_test, clf.predict(X_test)))

(345394, 106) (345394,)
(148026, 106) (148026,)
Train score: 0.20276264208411263
Test score: 0.20250496534392606
[[29238  4868  2601    78    19   522]
 [ 4364   608    64     8     2    51]
 [ 3814   404   108     8     1    28]
 [   40     2     1     0     0     1]
 [78945 14168  6011   194    22  1856]
 [    0     0     0     0     0     0]]


In [49]:
def rf_para_search(X_train, y_train):
    from sklearn.model_selection import GridSearchCV
    clf = RandomForestClassifier( max_depth=30)
    param_grid = {"max_depth": [10,20,30,40,50]
    }
    # set different n_estimators in the 
    CV_clf = GridSearchCV(estimator= clf, param_grid=param_grid, cv=5)
    CV_clf.fit(X_train, y_train)
    params = CV_clf.best_params_
    print('best para:', params)
    return params

params = rf_para_search(X_train, y_train)
clf2 = RandomForestClassifier(max_depth=params["max_depth"])
clf2.fit(X_train,y_train)
print("Train score: {}".format(clf2.score(X_train,y_train)))
print("Test score: {}".format(clf2.score(X_test, y_test)))
print(confusion_matrix(y_test, clf2.predict(X_test)))

best para: {'max_depth': 30}
Train score: 0.9924839458705131
Test score: 0.8872225149635875
[[33900   286   143     0  3445]
 [ 1240  1958    63     0  1871]
 [ 1356   100   804     0  2082]
 [   25     1     0     0    16]
 [ 5126   574   366     0 94670]]


In [51]:
scores = cross_val_score(clf2, X_train, y_train, cv=5)
scores.mean()

0.8877658367355471

## ATYPE

In [53]:
X_train, X_test, y_train, y_test = train_test_split(train, label.ATYPE, test_size=0.3)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

clf3 = RandomForestClassifier(max_depth=30)
clf3.fit(X_train,y_train)
print("Train score: {}".format(clf3.score(X_train,y_train)))
print("Test score: {}".format(clf3.score(X_test, y_test)))
print(confusion_matrix(y_test, clf3.predict(X_test)))

(345394, 106) (345394,)
(148026, 106) (148026,)
Train score: 0.9825243055756614
Test score: 0.8404199262291759
[[73210  2454  2493    39    12    23]
 [ 8920 16161  1771    54     0     3]
 [ 5575  1901 25785    23     1     4]
 [    4     9     2  9163     0     0]
 [  166     2     9     0    42     0]
 [  122    23    12     0     0    43]]


In [54]:
scores = cross_val_score(clf3, X_train, y_train, cv=5)
scores.mean()

0.83785761625101429

 ## ZIPINC_QRTL

In [10]:
X_train, X_test, y_train, y_test = train_test_split(train, label.ZIPINC_QRTL, test_size=0.3)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

clf5 = RandomForestClassifier(max_depth=30)
clf5.fit(X_train,y_train)
print("Train score: {}".format(clf5.score(X_train,y_train)))
print("Test score: {}".format(clf5.score(X_test, y_test)))
print(confusion_matrix(y_test, clf5.predict(X_test)))

(345394, 106) (345394,)
(148026, 106) (148026,)
Train score: 0.9520315929054934
Test score: 0.5656708956534663
[[25341  4975  4573  3478]
 [ 7619  9566  5047  4049]
 [ 5944  5024 12103 10991]
 [ 3396  2810  6386 36724]]


In [13]:
scores = cross_val_score(clf5, X_train, y_train, cv=5)
scores.mean()

0.56296578992901058

## Light GBM

In [6]:
X_train, X_test, y_train, y_test = train_test_split(train, label.TOTCHG, test_size=0.3)
print(X_train.shape, y_train.shape)
print(X_test.shape,
      y_test.shape)

(345394, 106) (345394,)
(148026, 106) (148026,)


In [7]:
rounds = 1000

param  = {
    'objective': 'regression',
    'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'learning_rate': 0.01,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'num_boost_round': rounds
}

In [8]:
data_lgb = lgb.Dataset(X_train, y_train)
lgbm = lgb.train(param, data_lgb)
pred = lgbm.predict(X_test)
pred

array([  19968.86621762,    5400.47920406,   38163.50593078, ...,
         10028.18628369,    5396.64880823,  103577.23705564])

In [14]:
r2_score(y_test, pred)


0.52484870183752719

In [29]:
label.ZIPINC_QRTL=label.ZIPINC_QRTL.astype('category')
label.ZIPINC_QRTL.value_counts()

4    164666
1    128451
3    113421
2     86882
Name: ZIPINC_QRTL, dtype: int64

In [25]:
X_train, X_test, y_train, y_test = train_test_split(train, label.ZIPINC_QRTL, test_size=0.3)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(345394, 106) (345394,)
(148026, 106) (148026,)


In [60]:
rounds = 1000

param  = {
    'objective': 'multiclass',
    'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'learning_rate': 0.01,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'num_boost_round': rounds,
    'num_class': 5
}

In [61]:
data_lgb = lgb.Dataset(X_train, y_train)
lgbm = lgb.train(param, data_lgb)
pred = lgbm.predict(X_test)
pred

array([[  1.95152189e-16,   1.61164432e-01,   8.31938048e-02,
          1.38919026e-01,   6.16722738e-01],
       [  2.19579500e-16,   4.50439749e-02,   6.51431907e-01,
          1.67207592e-01,   1.36316526e-01],
       [  2.01676699e-16,   3.81760652e-02,   8.84571163e-02,
          2.14952285e-01,   6.58414534e-01],
       ..., 
       [  2.37927232e-16,   1.20599972e-01,   1.56787922e-01,
          1.80023957e-01,   5.42588149e-01],
       [  2.35450683e-16,   4.26212322e-02,   2.26183140e-01,
          3.45716025e-01,   3.85479602e-01],
       [  2.60814874e-16,   3.91263730e-01,   4.96391634e-01,
          9.31818274e-02,   1.91628081e-02]])

In [63]:
pred1.head()

Unnamed: 0,0,1,2,3,4
0,1.951522e-16,0.161164,0.083194,0.138919,0.616723
1,2.195795e-16,0.045044,0.651432,0.167208,0.136317
2,2.016767e-16,0.038176,0.088457,0.214952,0.658415
3,2.113831e-16,0.451835,0.220458,0.280398,0.047309
4,1.362274e-16,0.012482,0.013904,0.120249,0.853365


In [62]:
pred1 = pd.DataFrame(pred)

In [58]:
max_ = pred1.max(1)

## Stochastic Gradient Descent - SGD

In [13]:
from sklearn.linear_model import SGDClassifier

In [14]:
clf = SGDClassifier(loss="hinge", penalty="l2")
clf.fit(X_train, y_train)


SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [22]:
y_pred =clf.predict(X_test)
print(y_pred)
np.unique(y_pred)

[3 3 3 ..., 3 3 3]


array([3])

In [21]:
acc_sgd = round(clf.score(X_train, y_train) * 100, 2)
acc_sgd

8.4100000000000001