# Classification - RandomForest - Major Occupation

In [25]:
# Import feature subset with Major_Occupation Column and one hot encoded values

#Importing Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
print('Libraries Imported')

originalDF = pd.read_csv('educationFeatureSubset.csv')
dfOHE = pd.read_csv('oheTransformedData.csv')
dfOHE['Education_Attainment'] = pd.Series(originalDF['Education_Attainment'], index=dfOHE.index)
dfOHE.fillna(0, inplace=True)

# Next we check the Education_Attainment options in the dataset for use with classification
print(dfOHE.Education_Attainment.unique())

# Now we replace those values with integers for use with the classification algorithm
education_values = {"Education_Attainment": {"Masters": 1, "Some_College": 2, "College": 3, "High_School": 4, "Professional": 5, "Grammar": 6, "Special": 7, "Doctoral": 8, "Other": 9}}
dfOHE.replace(education_values, inplace=True)

Libraries Imported
['Masters' 'Some_College' 'College' 'High_School' 'Professional' 'Grammar'
 'Special' 'Doctoral' 'Other']


With knn, you can determine membership probabilities for each of the 3 labels. As you can see, the predict() function just picks the most likely label.

In [26]:
#Creating the dependent variable class
factor = pd.factorize(dfOHE['Education_Attainment'])
dfOHE['Education_Attainment'] = factor[0]
definitions = factor[1]
print(dfOHE['Education_Attainment'].head())
print(definitions)

0    0
1    1
2    2
3    2
4    3
Name: Education_Attainment, dtype: int64
Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')


In [27]:
#Splitting the data into independent and dependent variables
X, y = dfOHE.loc[:, dfOHE.columns != 'Education_Attainment'], dfOHE['Education_Attainment']

In [28]:
# Creating the Training and Test set from data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21)

In [29]:
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [30]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [31]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
#Reverse factorize (converting y_pred from 0s,1s and 2s to 1,2,3,4,5,6,7,8,9
reversefactor = dict(zip(range(9),definitions))
y_test = np.vectorize(reversefactor.get)(y_test)
y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix
print(pd.crosstab(y_test, y_pred, rownames=['Actual Education'], colnames=['Predicted Education']))

Predicted Education   1    2    3   4  5   6  7  8  9
Actual Education                                     
1                    57  101  177   7  1   0  2  2  1
2                    42  433  264  63  4   1  4  2  2
3                    84  264  330  22  2   0  4  2  2
4                    18  145   75  59  0   9  4  0  0
5                    10   26   33   3  0   0  1  0  0
6                     0    1    2  22  0  13  0  0  1
7                     6   53   42   3  0   0  1  0  0
8                    26   16   27   3  0   0  1  2  0
9                     3   15   15   8  0   7  3  1  0


In [32]:
print(list(zip(dfOHE, classifier.feature_importances_)))
joblib.dump(classifier, 'randomForest/randomforestmodelEA.pkl') 

[('Years_on_Internet-0', 0.014567759034696922), ('Years_on_Internet-1', 0.025131556807920195), ('Years_on_Internet-2', 0.027618413797265873), ('Years_on_Internet-3', 0.0247338192019051), ('Years_on_Internet-4', 0.027087008892898622), ('Web_Ordering-0', 0.004726127384921387), ('Web_Ordering-1', 0.014156521116047021), ('Web_Ordering-2', 0.011821728769800647), ('Not_Purchasing_Privacy', 0.0), ('Not_Purchasing_Prefer_people', 0.0), ('Not_Purchasing_Too_complicated', 0.0), ('Not_Purchasing_Easier_locally', 0.0), ('Not_Purchasing_Security', 0.0), ('Age', 0.8501570649945442)]


['randomForest/randomforestmodelEA.pkl']

## PCA Transformation

In [33]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
X = scaler.fit_transform(X)

# Now we should split our data into a training set and a test set in order to properly assess our model using PCA
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [34]:
from sklearn.decomposition import PCA

pca_model = PCA(n_components=2)
pca_model.fit(X_train)
X_train = pca_model.transform(X_train)
X_test = pca_model.transform(X_test)

# 2-Dimensions
X_train[:5]

array([[-0.35529674, -0.5327504 ],
       [-0.1912675 , -0.47005044],
       [-0.52157889, -0.55681204],
       [-0.54437412,  0.61908668],
       [ 0.80611043,  0.82767697]])

In [35]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [36]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
#Reverse factorize (converting y_pred from 0s,1s and 2s to 1,2,3,4,5,6,7,8,9
reversefactor = dict(zip(range(9),definitions))
y_test = np.vectorize(reversefactor.get)(y_test)
y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix
print(pd.crosstab(y_test, y_pred, rownames=['Actual Education'], colnames=['Predicted Education']))

Predicted Education   1    2    3   4  5   6  7  8  9
Actual Education                                     
1                    52   87  198   5  3   0  1  4  0
2                    29  471  276  43  2   1  5  2  0
3                    61  234  398  16  3   0  1  8  2
4                     8  151   73  48  0  16  0  0  1
5                     6   31   32   0  1   0  0  1  0
6                     1    7    5  14  0  21  0  0  0
7                     2   72   35   6  1   0  1  0  0
8                    11   17   28   2  1   1  1  3  0
9                     2    8   10   3  0   2  2  1  0


In [37]:
print(list(zip(dfOHE, classifier.feature_importances_)))
joblib.dump(classifier, 'randomForest/randomforestmodelEAPCA.pkl')

[('Years_on_Internet-0', 0.5138969388324922), ('Years_on_Internet-1', 0.4861030611675078)]


['randomForest/randomforestmodelEAPCA.pkl']