Load in data and check class ratio

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize, Imputer
from sklearn import svm
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.ensemble import RandomForestRegressor

In [111]:
salaries = pd.read_csv("C:/Users/Nermin/Downloads/output/Salaries.csv")
salaries.head()

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
0,1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411,0.0,400184.0,,567595.43,567595.43,2011,,San Francisco,
1,2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966,245132.0,137811.0,,538909.28,538909.28,2011,,San Francisco,
2,3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739,106088.0,16452.6,,335279.91,335279.91,2011,,San Francisco,
3,4,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916,56120.7,198307.0,,332343.61,332343.61,2011,,San Francisco,
4,5,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134402,9737.0,182235.0,,326373.19,326373.19,2011,,San Francisco,


In [112]:
#salaries.Status[salaries.Status == '']
#salaries.ix[:,'Notes']
# all Notes are nans, so we delete the column
print salaries.Notes.unique()
# There are PT,FT, and Nan
salaries.Status.unique()

[ nan]


array([nan, 'PT', 'FT'], dtype=object)

In [113]:
print salaries.shape
salaries.info()
any(salaries.isnull())
# all the columns that have less than 148654 non-null rows, have NaNs

(148654, 13)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 148654 entries, 0 to 148653
Data columns (total 13 columns):
Id                  148654 non-null int64
EmployeeName        148654 non-null object
JobTitle            148654 non-null object
BasePay             148049 non-null object
OvertimePay         148654 non-null object
OtherPay            148654 non-null object
Benefits            112495 non-null object
TotalPay            148654 non-null float64
TotalPayBenefits    148654 non-null float64
Year                148654 non-null int64
Notes               0 non-null float64
Agency              148654 non-null object
Status              38119 non-null object
dtypes: float64(3), int64(2), object(8)
memory usage: 15.9+ MB


True

In [114]:
# Status and Notes columns have too many NaNs to be useful so we take them out
salaries = salaries.drop(['Status', 'Notes'], axis=1)
salaries.head()

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Agency
0,1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411,0.0,400184.0,,567595.43,567595.43,2011,San Francisco
1,2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966,245132.0,137811.0,,538909.28,538909.28,2011,San Francisco
2,3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739,106088.0,16452.6,,335279.91,335279.91,2011,San Francisco
3,4,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916,56120.7,198307.0,,332343.61,332343.61,2011,San Francisco
4,5,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134402,9737.0,182235.0,,326373.19,326373.19,2011,San Francisco


In [115]:
# column has all only one value so its useless and we take it out
print salaries.Agency.unique()
salaries = salaries.drop(['Agency'], axis=1)
salaries.head()

['San Francisco']


Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year
0,1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411,0.0,400184.0,,567595.43,567595.43,2011
1,2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966,245132.0,137811.0,,538909.28,538909.28,2011
2,3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739,106088.0,16452.6,,335279.91,335279.91,2011
3,4,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916,56120.7,198307.0,,332343.61,332343.61,2011
4,5,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134402,9737.0,182235.0,,326373.19,326373.19,2011


In [116]:
# using only the rows where BasePay is not null
salaries = salaries[salaries.BasePay.isnull() == False]

In [117]:
# getting class column into numpy array
y = salaries.BasePay.values
# taking class column out of pandas dataframe
salaries = salaries.drop('BasePay', axis=1)

Obviously the data is not normalized, as there are values that are outside of the 0-1 range. 

In [118]:
# Normalize data
# taking out ID column
salaries = salaries.drop(['Id', 'EmployeeName'], axis=1)
salaries.head()

Unnamed: 0,JobTitle,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year
0,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,0.0,400184.0,,567595.43,567595.43,2011
1,CAPTAIN III (POLICE DEPARTMENT),245132.0,137811.0,,538909.28,538909.28,2011
2,CAPTAIN III (POLICE DEPARTMENT),106088.0,16452.6,,335279.91,335279.91,2011
3,WIRE ROPE CABLE MAINTENANCE MECHANIC,56120.7,198307.0,,332343.61,332343.61,2011
4,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",9737.0,182235.0,,326373.19,326373.19,2011


In [124]:
salaries.dtypes

JobTitle             object
OvertimePay         float64
OtherPay            float64
Benefits            float64
TotalPay            float64
TotalPayBenefits    float64
Year                  int64
dtype: object

In [120]:
print salaries.OvertimePay.unique()
print salaries.Benefits.unique()
print salaries.OtherPay.unique()

[0.0 245131.88 106088.18 ..., '102.70' '56.77' 'Not Provided']
[nan 44430.12 69810.19 ..., '0.13' '1.24' 'Not Provided']
[400184.25 137811.38 16452.6 ..., '13.35' 'Not Provided' '-618.13']


In [121]:
# converting the Not Provided to NaNs
salaries.ix[salaries.OvertimePay == 'Not Provided', 'OvertimePay'] = np.nan
salaries.ix[salaries.Benefits == 'Not Provided', 'Benefits'] = np.nan
salaries.ix[salaries.OtherPay == 'Not Provided', 'OtherPay'] = np.nan

In [122]:
# changing all continuous variables to floats
salaries.OvertimePay = salaries.OvertimePay.astype(float)
salaries.Benefits = salaries.Benefits.astype(float)
salaries.OtherPay = salaries.OtherPay.astype(float)

In [None]:
##### performing normalization on the numeric features
### Need to turn into a numpy array first
dataset_array = dataset.as_matrix()
# Getting numeric features
dataset_numerics = dataset_array[:,2:]
# Delete those numeric features from the original dataset
dataset_array = np.delete(dataset_array,[2,3,4,5,6,7], axis=1)
# Cast dataset from a mix of ints and floats to all floats
dataset_numerics = dataset_numerics.astype(float)
# Normalize the numeric features
dataset_numerics_norm = preprocessing.normalize(dataset_numerics)
# Put the normalized numeric features back into the dataset array
dataset_array = np.append(dataset_array, dataset_numerics_norm, axis=1)
dataset_array[:10,:] # look at first ten rows

In [None]:
# Creating Dummy (Indicator) variables for year and job title features


In [123]:
salaries.head()

Unnamed: 0,JobTitle,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year
0,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,0.0,400184.25,,567595.43,567595.43,2011
1,CAPTAIN III (POLICE DEPARTMENT),245131.88,137811.38,,538909.28,538909.28,2011
2,CAPTAIN III (POLICE DEPARTMENT),106088.18,16452.6,,335279.91,335279.91,2011
3,WIRE ROPE CABLE MAINTENANCE MECHANIC,56120.71,198306.9,,332343.61,332343.61,2011
4,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",9737.0,182234.59,,326373.19,326373.19,2011


In [156]:
#over_sample.Bare_Nuclei.astype(long)
h = over_sample.Bare_Nuclei[over_sample.Bare_Nuclei == '?']

In [157]:
# converting the '?'s to NaNs
over_sample.ix[over_sample.Bare_Nuclei == '?', 'Bare_Nuclei'] = np.nan

In [158]:
# converting to Numpy array
X = over_sample.values

In [159]:
# performing imputation
# creating imputor object
imp = Imputer()
# imputing the data using mean imputation
X = imp.fit_transform(X)

In [160]:
# normalizing data
X = normalize(X)

In [161]:
X[:2,:]

array([[ 0.21594473,  0.21594473,  0.43188945,  0.2879263 ,  0.35990788,
         0.5758526 ,  0.2879263 ,  0.2879263 ,  0.07198158,  0.07198158],
       [ 0.36066785,  0.36066785,  0.40575134,  0.27050089,  0.27050089,
         0.13525045,  0.45083482,  0.45083482,  0.04508348,  0.04508348]])

In [164]:
# Splitting into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

Using SVM algorithm

In [195]:
# getting the SVM algorithm object
model = svm.SVC(kernel='rbf')
# performing 10-fold cross-validation (training 10 models) and getting their accuracy (score)
scores = cross_val_score(model, X_train, y_train, cv=10)
# the average accuracy score
scores.mean()

0.88737352370879619

In [None]:


clf = RandomForestRegressor()

In [196]:
# fitting on the training set
model.fit(X_train,y_train)
# predicting the test set
predicted = model.predict(X_test)

In [197]:
# getting the confusion matrix
print confusion_matrix(y_test, predicted)
print 'The accuracy is ' + str(accuracy_score(y_test, predicted))

[[128  15]
 [ 12 148]]
The accuracy is 0.910891089109


In [180]:
# classification report
print classification_report(y_test, predicted)

             precision    recall  f1-score   support

          0       0.91      0.90      0.90       143
          1       0.91      0.93      0.92       160

avg / total       0.91      0.91      0.91       303



In [182]:
# AUC (area under the curve)
roc_auc_score(y_test, predicted)

0.91005244755244763

Decision Tree algorithm

In [198]:
# getting the Decision tree algorithm object
model = DecisionTreeClassifier()
# performing 10-fold cross-validation (training 10 models) and getting their accuracy (score)
scores = cross_val_score(model, X_train, y_train, cv=10)
# the average accuracy score
scores.mean()

1.0

In [199]:
# fitting on the training set
model.fit(X_train,y_train)
# predicting the test set
predicted = model.predict(X_test)

In [200]:
# getting the confusion matrix
print confusion_matrix(y_test, predicted)
print 'The accuracy is ' + str(accuracy_score(y_test, predicted))

[[143   0]
 [  0 160]]
The accuracy is 1.0


In [201]:
# classification report
print classification_report(y_test, predicted)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00       143
          1       1.00      1.00      1.00       160

avg / total       1.00      1.00      1.00       303



In [202]:
# AUC (area under the curve)
roc_auc_score(y_test, predicted)

1.0

Since the Decision Tree had 100% accuracy, the results are better than the SVM with a linear kernel which had a 91% accuracy.

Random Forest Algorithm

In [203]:
# getting the Random Forest algorithm object
model = RandomForestClassifier()
# performing 10-fold cross-validation (training 10 models) and getting their accuracy (score)
scores = cross_val_score(model, X_train, y_train, cv=10)
# the average accuracy score
scores.mean()

1.0

In [204]:
# fitting on the training set
model.fit(X_train,y_train)
# predicting the test set
predicted = model.predict(X_test)

In [205]:
# getting the confusion matrix
print confusion_matrix(y_test, predicted)
print 'The accuracy is ' + str(accuracy_score(y_test, predicted))

[[143   0]
 [  0 160]]
The accuracy is 1.0


In [206]:
# classification report
print classification_report(y_test, predicted)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00       143
          1       1.00      1.00      1.00       160

avg / total       1.00      1.00      1.00       303



In [207]:
# AUC (area under the curve)
roc_auc_score(y_test, predicted)

1.0

The Random Forest also had an accuracy of 100%, which is again better than the SVM with a linear kernel.