# Classification - RandomForest - Major Occupation

In [17]:
# Import feature subset with Major_Occupation Column and one hot encoded values

#Importing Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
print('Libraries Imported')

# Read the raw occupation subset and the one-hot-encoded feature table.
originalDF = pd.read_csv('occupationFeatureSubset.csv')
dfOHE = pd.read_csv('oheTransformedData.csv')

# Attach the target column (aligned on dfOHE's index) and zero-fill every
# missing value in the combined frame.
dfOHE = dfOHE.assign(
    Major_Occupation=pd.Series(originalDF['Major_Occupation'], index=dfOHE.index)
).fillna(0)

# Show which Major_Occupation categories are present before encoding them.
print(dfOHE['Major_Occupation'].unique())

# Swap each occupation name for an integer code so the classifier can use it.
occupation_values = {
    "Major_Occupation": {
        "Professional": 1,
        "Education": 2,
        "Computer": 3,
        "Other": 4,
        "Management": 5,
    }
}
dfOHE = dfOHE.replace(occupation_values)

Libraries Imported
['Professional' 'Education' 'Computer' 'Other' 'Management']


With a random forest, you can determine membership probabilities for each of the 5 occupation labels (via predict_proba()). The predict() function just picks the most likely label.

In [18]:
# Build the dependent-variable class codes.
# pd.factorize returns (codes, uniques): codes are 0-based integers in order of
# first appearance, uniques are the original values those codes stand for.
factor = pd.factorize(dfOHE['Major_Occupation'])
occupation_codes, definitions = factor
dfOHE['Major_Occupation'] = occupation_codes
print(dfOHE['Major_Occupation'].head())
print(definitions)

0    0
1    1
2    2
3    0
4    1
Name: Major_Occupation, dtype: int64
Int64Index([1, 2, 3, 4, 5], dtype='int64')


In [19]:
# Separate the target (y) from the predictors (X): X is every column of dfOHE
# except 'Major_Occupation'.
y = dfOHE['Major_Occupation']
X = dfOHE.drop(columns='Major_Occupation')

In [20]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
split_settings = {'test_size': 0.25, 'random_state': 21}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_settings)

In [21]:
# Feature scaling: learn the scaling statistics from the training rows only,
# then apply that same fitted scaler to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
# Train a random forest (10 trees, entropy split criterion, fixed seed) on the
# scaled training data. The bare fit() expression is kept last so the notebook
# still displays the fitted estimator.
forest_settings = {'n_estimators': 10, 'criterion': 'entropy', 'random_state': 42}
classifier = RandomForestClassifier(**forest_settings)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [23]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Reverse factorize: map the 0-based codes produced by pd.factorize back to the
# values stored in `definitions`. The mapping is built with enumerate so it
# covers every class in `definitions` instead of assuming exactly five.
reversefactor = dict(enumerate(definitions))
# NOTE: y_test is overwritten with the decoded labels here, so re-run the
# train/test split cell before re-running this cell.
y_test = np.vectorize(reversefactor.get)(y_test)
y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix (rows = actual labels, columns = predicted labels)
print(pd.crosstab(y_test, y_pred, rownames=['Actual Occupations'], colnames=['Predicted Occupations']))

Predicted Occupations    1    2    3    4   5
Actual Occupations                           
1                      126   48  177  175  26
2                       55  365  113   61  14
3                       94   69  285   50  13
4                      131   46   74  283  37
5                       69   16   95   90  15


In [24]:
# Pair each importance with the feature it belongs to. X holds exactly the
# columns the model was trained on; iterating dfOHE would also yield the
# 'Major_Occupation' target and only lines up when that column happens to be last.
print(list(zip(X.columns, classifier.feature_importances_)))
# Persist the fitted model to disk; joblib.dump returns the list of written files.
joblib.dump(classifier, 'randomforestmodelMO.pkl')

[('Years_on_Internet-0', 0.030097659444170293), ('Years_on_Internet-1', 0.04264092075623896), ('Years_on_Internet-2', 0.042671159651809246), ('Years_on_Internet-3', 0.035517588046424534), ('Years_on_Internet-4', 0.04044935306777249), ('Web_Ordering-0', 0.0030203361140386112), ('Web_Ordering-1', 0.02076921914077274), ('Web_Ordering-2', 0.01392956492728964), ('Not_Purchasing_Privacy', 0.0), ('Not_Purchasing_Prefer_people', 0.0), ('Not_Purchasing_Too_complicated', 0.0), ('Not_Purchasing_Easier_locally', 0.0), ('Not_Purchasing_Security', 0.0), ('Age', 0.7709041988514834)]


['randomforestmodelMO.pkl']