In [52]:
#Initialization

import numpy as np
import pandas as pd
from IPython.display import display

%matplotlib inline

in_file = 'finaldata.csv'
full_data = pd.read_csv(in_file)

#Loading Test

display(full_data.head())

Unnamed: 0,Income (x100k),F. Income,Occ.,City,State,Country,Sex,Partner
0,50.0,25000,SALES,Rancho,CA,US,P,1
1,40.0,13000,TEACHER,Chattanooga,TN,US,M,0
2,40.0,10000,COACH,Gainesville,GA,US,P,1
3,30.0,50000,MGT,Ontario,CA,US,P,1
4,30.0,0,MGT,Suwanee,GA,US,P,1


In [53]:
# Store "Income" feature in new variable and remove from the dataset

income = full_data['Income (x100k)']
data = full_data.drop('Income (x100k)', axis = 1)

# Test data drop

display(data.head())

Unnamed: 0,F. Income,Occ.,City,State,Country,Sex,Partner
0,25000,SALES,Rancho,CA,US,P,1
1,13000,TEACHER,Chattanooga,TN,US,M,0
2,10000,COACH,Gainesville,GA,US,P,1
3,50000,MGT,Ontario,CA,US,P,1
4,0,MGT,Suwanee,GA,US,P,1


In [54]:
# Normalization of numerical features will occur since the features are highly skewed.

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
numerical = ['F. Income']
data[numerical] = scaler.fit_transform(data[numerical])

# Before benchmarking, the data points will be randomly shuffled to prevent inaccuracies

display(data.head())

Unnamed: 0,F. Income,Occ.,City,State,Country,Sex,Partner
0,0.05,SALES,Rancho,CA,US,P,1
1,0.026,TEACHER,Chattanooga,TN,US,M,0
2,0.02,COACH,Gainesville,GA,US,P,1
3,0.1,MGT,Ontario,CA,US,P,1
4,0.0,MGT,Suwanee,GA,US,P,1


In [55]:
# Some basic preprocessing will occure here, as categorical values will be one-hot encoded.

features = pd.get_dummies(data)
income_hot = pd.get_dummies(income)

encoded = list(features.columns)

print "{} total features after one-hot enconding.".format(len(encoded))

1145 total features after one-hot enconding.


In [56]:
# Before conducting our Benchmark model, the data set will be split into training and testing sets.
# Initially, 85% of the data will be used for training and 15% will be used for testing.

from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, income_hot, test_size = 0.15, random_state = 0)

print "Training set has {} samples.".format(X_train.shape[0])
print "Testing set has {} samples.".format(X_test.shape[0])

Training set has 2190 samples.
Testing set has 387 samples.


In [57]:
# It is now time to train our benchmark model!

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [58]:
# In this section, the f1 score is evaluated for the "out-of-the-box"
# Random Forest Classifier used as our benchmark.

from sklearn.metrics import f1_score

pred = clf.predict(X_test)
score = f1_score(y_test, pred, average = None)

print ("Random Forest F1-score is {}".format(round(score[0], 3)))

Random Forest F1-score is 0.88


In [59]:
# Now that our benchmark has been set, we will attempt to optimize our 
# classification method by editing all available parameters

import matplotlib.pyplot as plt

clf2 = RandomForestClassifier(n_estimators=30, max_depth = 30, 
                              bootstrap = False)
clf2.fit(X_train, y_train)

pred2 = clf2.predict(X_test)
score2 = f1_score(y_test, pred2, average = None)

print ("Optimized Random Forest F1-score is {}".format(round(score2[0], 3)))

importances = clf2.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf2.estimators_], axis = 0)
indices = np.argsort(importances)[::-1]

print("Feature Ranking:")

for f in range(data.shape[1]):
    print("%d. Feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# By analyzing one-hot encoded indices the following features can be analyzed.

Optimized Random Forest F1-score is 0.889
Feature Ranking:
1. Feature 33 (0.101728)
2. Feature 0 (0.078445)
3. Feature 27 (0.041865)
4. Feature 34 (0.032209)
5. Feature 16 (0.016278)
6. Feature 21 (0.012122)
7. Feature 647 (0.011899)


In [60]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score

y_train_re = np.ravel(y_train)

sup = SVC()
sup.fit(X_train, y_train_re[:2190])

pred_SVC = sup.predict(X_test)
score_SVC = f1_score(y_test, pred, average = None)

print ("SVC F1-score is {}".format(round(score[0], 3)))


SVC F1-score is 0.88


In [63]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

kf = KFold(n_splits = 2)
kf.get_n_splits(X_train)

for train_index, test_index in kf.split(X_train):
    X_train_kf, X_test_kf = features.iloc[train_index], features.iloc[test_index]
    y_train_kf, y_test_kf = income_hot.iloc[train_index], income_hot.iloc[test_index]

clf = RandomForestClassifier()
clf.fit(X_train, y_train) 

pred = clf_kf.predict(X_test)
score_kf = f1_score(y_test, pred, average = None)

print ("Random Forest F1-score is {}".format(round(score[0], 3)))

Random Forest F1-score is 0.88
