# Classification - RandomForest - Major Occupation

In [1]:
# Import feature subset with Major_Occupation Column and one hot encoded values

#Importing Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
print('Libraries Imported')

originalDF = pd.read_csv('occupationFeatureSubset.csv')
dfOHE = pd.read_csv('oheTransformedData.csv')
dfOHE['Major_Occupation'] = pd.Series(originalDF['Major_Occupation'], index=dfOHE.index)
dfOHE.fillna(0, inplace=True)

# Next we check the Major_Occupation options in the dataset for use with classification
print(dfOHE.Major_Occupation.unique())

# Now we replace those values with integers for use with the classification algorithm
occupation_values = {"Major_Occupation": {"Professional": 1, "Education": 2, "Computer": 3, "Other": 4, "Management": 5}}
dfOHE.replace(occupation_values, inplace=True)

Libraries Imported
['Professional' 'Education' 'Computer' 'Other' 'Management']


In [2]:
#Creating the dependent variable class
factor = pd.factorize(dfOHE['Major_Occupation'])
dfOHE['Major_Occupation'] = factor[0]
definitions = factor[1]
print(dfOHE['Major_Occupation'].head())
print(definitions)

0    0
1    1
2    2
3    0
4    1
Name: Major_Occupation, dtype: int64
Int64Index([1, 2, 3, 4, 5], dtype='int64')


In [3]:
#Splitting the data into independent and dependent variables
X, y = dfOHE.loc[:, dfOHE.columns != 'Major_Occupation'], dfOHE['Major_Occupation']

In [4]:
# Creating the Training and Test set from data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21)

In [5]:
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [6]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [7]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
#Reverse factorize (converting y_pred from 0s,1s and 2s to 1,2,3,4,5 
reversefactor = dict(zip(range(5),definitions))
y_test = np.vectorize(reversefactor.get)(y_test)
y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix
print(pd.crosstab(y_test, y_pred, rownames=['Actual Occupations'], colnames=['Predicted Occupations']))

Predicted Occupations    1    2    3    4   5
Actual Occupations                           
1                      134   62  139  140  77
2                       92  339   83   65  29
3                      118   89  187   62  55
4                      141   60   72  238  60
5                       75   36   65   71  38


In [8]:
print(list(zip(dfOHE, classifier.feature_importances_)))
joblib.dump(classifier, 'randomForest/randomforestmodelMO.pkl') 

[('Web_Ordering-0', 0.0017708664758203211), ('Web_Ordering-1', 0.014839329020990643), ('Web_Ordering-2', 0.014860594761882376), ('Years_on_Internet-0', 0.01357686226110848), ('Years_on_Internet-1', 0.017251934149581406), ('Years_on_Internet-2', 0.01959310753255228), ('Years_on_Internet-3', 0.0126640954663845), ('Years_on_Internet-4', 0.023177654198224363), ('Race-0', 0.0059902339435350714), ('Race-1', 0.0061413963710338), ('Race-2', 0.004829092635623404), ('Race-3', 0.0017739407327096522), ('Race-4', 0.0029885356216903048), ('Race-5', 0.0053559028694415405), ('Race-6', 0.007652743643874823), ('Race-7', 0.018139215576776967), ('Marital_Status-0', 0.011652871882658572), ('Marital_Status-1', 0.021898035090357298), ('Marital_Status-2', 0.004361058670094184), ('Marital_Status-3', 0.011854281492766187), ('Marital_Status-4', 0.0046197868328403955), ('Marital_Status-5', 0.03029796765260575), ('Marital_Status-6', 0.004908147990376041), ('Sexual_Preference-0', 0.007851921909903494), ('Sexual_Pre

['randomForest/randomforestmodelMO.pkl']

## PCA Transformation

In [9]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
X = scaler.fit_transform(X)

# Now we should split our data into a training set and a test set in order to properly assess our model using PCA
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

  return self.partial_fit(X, y)


In [10]:
from sklearn.decomposition import PCA

pca_model = PCA(n_components=2)
pca_model.fit(X_train)
X_train = pca_model.transform(X_train)
X_test = pca_model.transform(X_test)

# 2-Dimensions
X_train[:5]

array([[-0.7699573 , -0.52624244],
       [-0.48446553,  0.00212533],
       [-0.88943286, -0.10226339],
       [-0.80950549, -0.45917265],
       [ 1.05641303, -0.6577288 ]])

In [11]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [12]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
#Reverse factorize (converting y_pred from 0s,1s and 2s to 1,2,3,4,5,6,7,8,9
reversefactor = dict(zip(range(9),definitions))
y_test = np.vectorize(reversefactor.get)(y_test)
y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix
print(pd.crosstab(y_test, y_pred, rownames=['Actual Occupations'], colnames=['Predicted Occupations']))

Predicted Occupations    1    2    3    4   5
Actual Occupations                           
1                      161   84  135  121  64
2                       82  299   83   86  22
3                      120  112  184   62  47
4                      145   69   73  217  70
5                       81   39   68   70  33


In [13]:
print(list(zip(dfOHE, classifier.feature_importances_)))
joblib.dump(classifier, 'randomForest/randomforestmodelMOPCA.pkl')

[('Web_Ordering-0', 0.5071124486335268), ('Web_Ordering-1', 0.4928875513664733)]


['randomForest/randomforestmodelMOPCA.pkl']