# Classification - RandomForest - Major Occupation

In [14]:
# Import feature subset with Major_Occupation Column and one hot encoded values

#Importing Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
print('Libraries Imported')

originalDF = pd.read_csv('educationFeatureSubset.csv')

# Next we check the Education_Attainment options in the dataset for use with classification
print(originalDF.Education_Attainment.unique())

# Now we replace those values with integers for use with the classification algorithm
education_values = {"Education_Attainment": {"Masters": 1, "Some_College": 2, "College": 3, "High_School": 4, "Professional": 5, "Grammar": 6, "Special": 7, "Doctoral": 8, "Other": 9}}
originalDF.replace(education_values, inplace=True)

featureDF = originalDF[originalDF.columns.difference(['Education_Attainment'])]

Libraries Imported
['Masters' 'Some_College' 'College' 'High_School' 'Professional' 'Grammar'
 'Special' 'Doctoral' 'Other']


With knn, you can determine membership probabilities for each of the 3 labels. As you can see, the predict() function just picks the most likely label.

In [15]:
#Creating the dependent variable class
factor = pd.factorize(originalDF['Education_Attainment'])
originalDF['Education_Attainment'] = factor[0]
definitions = factor[1]
print(originalDF['Education_Attainment'].head())
print(definitions)

0    0
1    1
2    2
3    2
4    3
Name: Education_Attainment, dtype: int64
Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')


In [16]:
#Splitting the data into independent and dependent variables
X, y = featureDF, originalDF['Education_Attainment']

In [17]:
# Creating the Training and Test set from data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21)

In [18]:
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [19]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [20]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
#Reverse factorize (converting y_pred from 0s,1s and 2s to 1,2,3,4,5,6,7,8,9
reversefactor = dict(zip(range(9),definitions))
y_test = np.vectorize(reversefactor.get)(y_test)
y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix
print(pd.crosstab(y_test, y_pred, rownames=['Actual Education'], colnames=['Predicted Education']))

Predicted Education    1    2    3   4  5   6   7  8  9
Actual Education                                       
1                     96   91  133  14  7   0   4  2  1
2                     91  409  209  72  5   2  20  5  2
3                    143  268  248  28  5   1   7  9  1
4                     22  141   60  62  4  12   7  0  2
5                     24   27   17   2  0   0   2  1  0
6                      1    3    0  18  0  15   1  0  1
7                     10   54   28  10  2   0   0  1  0
8                     23   22   18   0  1   1   3  6  1
9                      8   18   11  11  0   3   0  0  1


In [21]:
print(list(zip(featureDF, classifier.feature_importances_)))
joblib.dump(classifier, 'randomForest/randomforestmodelEA.pkl') 

[('Age', 0.2822004856671727), ('Gender', 0.037285420765616084), ('Major_Geographical_Location', 0.03274798761222546), ('Marital_Status', 0.06610209116297065), ('Opinions_on_Censorship', 0.07781155773034668), ('Race', 0.03481496588414419), ('Registered_to_Vote', 0.0375239408098407), ('Sexual_Preference', 0.03891591424235831), ('Unnamed: 0', 0.2861679403503963), ('Web_Ordering', 0.03473139029766987), ('Years_on_Internet', 0.07169830547725906)]


['randomForest/randomforestmodelEA.pkl']

## PCA Transformation

In [22]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
X = scaler.fit_transform(X)

# Now we should split our data into a training set and a test set in order to properly assess our model using PCA
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

  return self.partial_fit(X, y)


In [23]:
from sklearn.decomposition import PCA

pca_model = PCA(n_components=2)
pca_model.fit(X_train)
X_train = pca_model.transform(X_train)
X_test = pca_model.transform(X_test)

# 2-Dimensions
X_train[:5]

array([[-0.45453297, -0.33385824],
       [-0.2606989 ,  0.10065705],
       [-0.41473897,  0.1155537 ],
       [-0.40843431,  0.12486152],
       [ 0.53108298, -0.38371677]])

In [24]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [25]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
#Reverse factorize (converting y_pred from 0s,1s and 2s to 1,2,3,4,5,6,7,8,9
reversefactor = dict(zip(range(9),definitions))
y_test = np.vectorize(reversefactor.get)(y_test)
y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix
print(pd.crosstab(y_test, y_pred, rownames=['Actual Education'], colnames=['Predicted Education']))

Predicted Education    1    2    3   4  5  6   7   8  9
Actual Education                                       
1                     67  121  120  18  9  0   8   6  1
2                    114  355  226  91  5  5  18   8  7
3                    124  278  224  51  7  2  16  13  8
4                     41  138   54  47  2  4   5   2  4
5                     13   28   18   8  1  0   2   1  0
6                      4   23    8   9  1  3   0   0  0
7                     13   57   31   8  0  0   2   5  1
8                     14   20   25   1  0  0   1   3  0
9                      4   15    4   3  0  0   1   1  0


In [26]:
print(list(zip(featureDF, classifier.feature_importances_)))
joblib.dump(classifier, 'randomForest/randomforestmodelEAPCA.pkl')

[('Age', 0.4988679768829341), ('Gender', 0.5011320231170657)]


['randomForest/randomforestmodelEAPCA.pkl']