# Classification - RandomForest - Education Attainment

In [9]:
# Import feature subset with Education_Attainment Column and one hot encoded values

#Importing Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
print('Libraries Imported')

originalDF = pd.read_csv('educationFeatureSubset.csv')
dfOHE = pd.read_csv('oheTransformedData.csv')
dfOHE.fillna(0, inplace=True)

Libraries Imported


In [10]:
#Creating the dependent variable class
factor = pd.factorize(originalDF['Education_Attainment'])
originalDF['Education_Attainment'] = factor[0]
definitions = factor[1]
print(originalDF['Education_Attainment'].head())
print(definitions)

0    0
1    1
2    2
3    2
4    3
Name: Education_Attainment, dtype: int64
Index(['Masters', 'Some_College', 'College', 'High_School', 'Professional',
       'Grammar', 'Special', 'Doctoral', 'Other'],
      dtype='object')


In [11]:
#Splitting the data into independent and dependent variables
X = dfOHE
#separate target values
y = originalDF['Education_Attainment']

In [12]:
# Creating the Training and Test set from data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21)

In [13]:
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [15]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
#Reverse factorize (converting y_pred from 0s,1s and 2s to Iris-setosa, Iris-versicolor and Iris-virginica
reversefactor = dict(zip(range(3),definitions))
y_test = np.vectorize(reversefactor.get)(y_test)
y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix
print(pd.crosstab(y_test, y_pred, rownames=['Actual Education_Attainment'], colnames=['Predicted Education_Attainment']))

Predicted Education_Attainment  College  Masters  None  Some_College
Actual Education_Attainment                                         
College                             330       84    32           264
Masters                             177       57    13           101
Some_College                        264       42    76           433


In [16]:
print(list(zip(dfOHE, classifier.feature_importances_)))
joblib.dump(classifier, 'randomforestmodelEA.pkl') 

[('Years_on_Internet-0', 0.014567759034696922), ('Years_on_Internet-1', 0.025131556807920195), ('Years_on_Internet-2', 0.027618413797265873), ('Years_on_Internet-3', 0.0247338192019051), ('Years_on_Internet-4', 0.027087008892898622), ('Web_Ordering-0', 0.004726127384921387), ('Web_Ordering-1', 0.014156521116047021), ('Web_Ordering-2', 0.011821728769800647), ('Not_Purchasing_Privacy', 0.0), ('Not_Purchasing_Prefer_people', 0.0), ('Not_Purchasing_Too_complicated', 0.0), ('Not_Purchasing_Easier_locally', 0.0), ('Not_Purchasing_Security', 0.0), ('Age', 0.8501570649945442)]


['randomforestmodelEA.pkl']