# Classification - RandomForest - Major Occupation

In [39]:
# Import feature subset with Major_Occupation Column and one hot encoded values

#Importing Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
print('Libraries Imported')

originalDF = pd.read_csv('occupationFeatureSubset.csv')
dfOHE = pd.read_csv('oheTransformedData.csv')
dfOHE.fillna(0, inplace=True)

Libraries Imported


With knn, you can determine membership probabilities for each of the 3 labels. As you can see, the predict() function just picks the most likely label.

In [40]:
#Creating the dependent variable class
factor = pd.factorize(originalDF['Major_Occupation'])
originalDF['Major_Occupation'] = factor[0]
definitions = factor[1]
print(originalDF['Major_Occupation'].head())
print(definitions)

0    0
1    1
2    2
3    0
4    1
Name: Major_Occupation, dtype: int64
Index(['Professional', 'Education', 'Computer', 'Other', 'Management'], dtype='object')


In [41]:
#Splitting the data into independent and dependent variables
X = dfOHE
#separate target values
y = originalDF['Major_Occupation']

In [42]:
# Creating the Training and Test set from data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21)

In [43]:
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [44]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [45]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
#Reverse factorize (converting y_pred from 0s,1s and 2s to Iris-setosa, Iris-versicolor and Iris-virginica
reversefactor = dict(zip(range(3),definitions))
y_test = np.vectorize(reversefactor.get)(y_test)
y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix
print(pd.crosstab(y_test, y_pred, rownames=['Actual Occupations'], colnames=['Predicted Occupations']))

Predicted Occupations  Computer  Education  Professional
Actual Occupations                                      
Computer                    285         69            94
Education                   113        365            55
Professional                177         48           126


In [46]:
print(list(zip(dfOHE, classifier.feature_importances_)))
joblib.dump(classifier, 'randomforestmodelMO.pkl') 

[('Years_on_Internet-0', 0.030097659444170293), ('Years_on_Internet-1', 0.04264092075623896), ('Years_on_Internet-2', 0.042671159651809246), ('Years_on_Internet-3', 0.035517588046424534), ('Years_on_Internet-4', 0.04044935306777249), ('Web_Ordering-0', 0.0030203361140386112), ('Web_Ordering-1', 0.02076921914077274), ('Web_Ordering-2', 0.01392956492728964), ('Not_Purchasing_Privacy', 0.0), ('Not_Purchasing_Prefer_people', 0.0), ('Not_Purchasing_Too_complicated', 0.0), ('Not_Purchasing_Easier_locally', 0.0), ('Not_Purchasing_Security', 0.0), ('Age', 0.7709041988514834)]


['randomforestmodelMO.pkl']