# Classification - RandomForest - Major Occupation

In [14]:
# Import feature subset with Major_Occupation Column and one hot encoded values

#Importing Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
print('Libraries Imported')

originalDF = pd.read_csv('occupationFeatureSubset.csv')

# Next we check the Major_Occupation options in the dataset for use with classification
print(originalDF.Major_Occupation.unique())

# Now we replace those values with integers for use with the classification algorithm
occupation_values = {"Major_Occupation": {"Professional": 1, "Education": 2, "Computer": 3, "Other": 4, "Management": 5}}
originalDF.replace(occupation_values, inplace=True)

featureDF = originalDF[originalDF.columns.difference(['Education_Attainment'])]

Libraries Imported
['Professional' 'Education' 'Computer' 'Other' 'Management']


In [15]:
#Creating the dependent variable class
factor = pd.factorize(originalDF['Major_Occupation'])
originalDF['Major_Occupation'] = factor[0]
definitions = factor[1]
print(originalDF['Major_Occupation'].head())
print(definitions)

0    0
1    1
2    2
3    0
4    1
Name: Major_Occupation, dtype: int64
Int64Index([1, 2, 3, 4, 5], dtype='int64')


In [16]:
#Splitting the data into independent and dependent variables
X, y = featureDF, originalDF['Major_Occupation']

In [17]:
# Creating the Training and Test set from data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21)

In [18]:
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [19]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [20]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
#Reverse factorize (converting y_pred from 0s,1s and 2s to 1,2,3,4,5 
reversefactor = dict(zip(range(5),definitions))
y_test = np.vectorize(reversefactor.get)(y_test)
y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix
print(pd.crosstab(y_test, y_pred, rownames=['Actual Occupations'], colnames=['Predicted Occupations']))

Predicted Occupations    1    2    3    4    5
Actual Occupations                            
1                      551    1    0    0    0
2                        2  606    0    0    0
3                        0    2  508    1    0
4                        0    0    1  570    0
5                        0    0    0    1  284


In [21]:
print(list(zip(featureDF, classifier.feature_importances_)))
joblib.dump(classifier, 'randomForest/randomforestmodelMO.pkl') 

[('Age', 0.08948121928054076), ('Gender', 0.006496663244159874), ('Major_Geographical_Location', 0.008822070335960993), ('Major_Occupation', 0.7637849073833881), ('Marital_Status', 0.030925637248429207), ('Opinions_on_Censorship', 0.010020164124644263), ('Race', 0.0059264698315498136), ('Registered_to_Vote', 0.006791925422551544), ('Sexual_Preference', 0.006751874504437737), ('Unnamed: 0', 0.03273216996222705), ('Web_Ordering', 0.007572573741081656), ('Years_on_Internet', 0.030694324921028925)]


['randomForest/randomforestmodelMO.pkl']

## PCA Transformation

In [22]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
X = scaler.fit_transform(X)

# Now we should split our data into a training set and a test set in order to properly assess our model using PCA
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

  return self.partial_fit(X, y)


In [23]:
from sklearn.decomposition import PCA

pca_model = PCA(n_components=2)
pca_model.fit(X_train)
X_train = pca_model.transform(X_train)
X_test = pca_model.transform(X_test)

# 2-Dimensions
X_train[:5]

array([[-0.45800606, -0.3818205 ],
       [-0.27939691, -0.30053969],
       [-0.4158031 ,  0.10002422],
       [-0.39586984,  0.26129094],
       [ 0.54280963, -0.16812243]])

In [24]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [25]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
#Reverse factorize (converting y_pred from 0s,1s and 2s to 1,2,3,4,5,6,7,8,9
reversefactor = dict(zip(range(9),definitions))
y_test = np.vectorize(reversefactor.get)(y_test)
y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix
print(pd.crosstab(y_test, y_pred, rownames=['Actual Occupations'], colnames=['Predicted Occupations']))

Predicted Occupations    1    2    3    4    5
Actual Occupations                            
1                      287  108   81   60   29
2                      111  322   71   54   14
3                      108   86  239   65   27
4                       63   64   86  301   60
5                       29   29   41   81  111


In [26]:
print(list(zip(featureDF, classifier.feature_importances_)))
joblib.dump(classifier, 'randomForest/randomforestmodelMOPCA.pkl')

[('Age', 0.4982645549775929), ('Gender', 0.5017354450224071)]


['randomForest/randomforestmodelMOPCA.pkl']