# determine whether a person makes over 50K a year.

This is a regression problem as target variable Salary is '>50K' or '=<50K'

In [76]:
import numpy as np # linear algebra
import pandas as pd # data processing, 

In [77]:
# Reading the daaset, considering the '?' as NA value 
df = pd.read_csv("adult.data",na_values="?", skipinitialspace=True)
# Checking the null counts
print(df.isnull().sum())

39                  0
State-gov        1836
77516               0
Bachelors           0
13                  0
Never-married       0
Adm-clerical     1843
Not-in-family       0
White               0
Male                0
2174                0
0                   0
40                  0
United-States     583
<=50K               0
dtype: int64


In [78]:
# Printing all the columns
print(df.columns)
# From this it looks like column names are not correct, renaming them
df.columns = ['Age', 'Workclass', 'fnlwgt', 'Education', 'Education-num', 'Marital-status', \
              'Occupation', 'Relationship', 'Race', 'Sex', 'Capital-gain', 'Capital-loss', \
              'Hours-per-week', 'Native-country', 'Earning']

Index(['39', 'State-gov', '77516', 'Bachelors', '13', 'Never-married',
       'Adm-clerical', 'Not-in-family', 'White', 'Male', '2174', '0', '40',
       'United-States', '<=50K'],
      dtype='object')


In [79]:
rows_before_droppinig = df.shape[0]
# Printing all the columns
print(df.columns)

# Checking the NA values
df.isnull().sum()

Index(['Age', 'Workclass', 'fnlwgt', 'Education', 'Education-num',
       'Marital-status', 'Occupation', 'Relationship', 'Race', 'Sex',
       'Capital-gain', 'Capital-loss', 'Hours-per-week', 'Native-country',
       'Earning'],
      dtype='object')


Age                  0
Workclass         1836
fnlwgt               0
Education            0
Education-num        0
Marital-status       0
Occupation        1843
Relationship         0
Race                 0
Sex                  0
Capital-gain         0
Capital-loss         0
Hours-per-week       0
Native-country     583
Earning              0
dtype: int64

In [80]:
# Dropping the NA values which is actually 7.368% of total Data
# It is observed that in most of the missing data set, 
# the ‘workclass’ variable and ‘occupation’ variable are missing data together. 
# And the remaining have ‘nativecountry’ variable missing. We could handle the 
# missing values by imputing the data. However, since ‘workclass’, ‘occupation’ 
# and ‘nativecountry’ could potentially be very good predictors of income, 
# imputing may simply skew the model.

In [112]:
df_opt = df.dropna()
rows_after_droppinig = df_opt.shape[0]

# Checking the NA values
print(df_opt.isnull().sum())

# number of rows dropped
diff = rows_before_droppinig - rows_after_droppinig
print("Total number of rows dropped is {} wich is {:.4}% of total rows."   \
              .format(diff,(diff/rows_before_droppinig)*100))
print('-'*50)


Age               0
Workclass         0
fnlwgt            0
Education         0
Education-num     0
Marital-status    0
Occupation        0
Relationship      0
Race              0
Sex               0
Capital-gain      0
Capital-loss      0
Hours-per-week    0
Native-country    0
Earning           0
dtype: int64
Total number of rows dropped is 2399 wich is 7.368% of total rows.
--------------------------------------------------


['Age', 'Workclass', 'fnlwgt', 'Education', 'Education-num',
       'Marital-status', 'Occupation', 'Relationship', 'Race', 'Sex',
       'Capital-gain', 'Capital-loss', 'Hours-per-week', 'Native-country',
       'Earning'],
      dtype='object'
      
Out of these 'Workclass', 'Education', 'Marital-status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Native-country' 
And  'Earning' are the catagorical values.

In [113]:
# applying one hot and lebel encoding on these columns.

In [114]:
# Fist print the unique values of each features.
cat_features = ['Workclass', 'Education', 'Marital-status', 'Occupation', 'Relationship',\
                'Race', 'Sex', 'Native-country', 'Earning' ]
print("Total numbers of unique values in catagorical features are: ")

for feature in cat_features:
    print(feature + "has total : " + str(len(df[feature].unique())))

Total numbers of unique values in catagorical features are: 
Workclasshas total : 9
Educationhas total : 16
Marital-statushas total : 7
Occupationhas total : 15
Relationshiphas total : 6
Racehas total : 5
Sexhas total : 2
Native-countryhas total : 42
Earninghas total : 2


In [115]:
# Applying One hot encoding on only catagorical columns, i.e. preferred_foot
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

for feature in cat_features:
    #feature = 'Workclass'
    df_opt[feature] = label_encoder.fit_transform(df_opt[feature])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [116]:
# Data after one hot encodng of all catagorical data
df_opt.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Earning
0,50,4,83311,9,13,2,3,0,4,1,0,0,13,38,0
1,38,2,215646,11,9,0,5,1,4,1,0,0,40,38,0
2,53,2,234721,1,7,2,5,0,2,1,0,0,40,38,0
3,28,2,338409,9,13,2,9,5,2,0,0,0,40,4,0
4,37,2,284582,12,14,2,3,5,4,0,0,0,40,38,0


# Splitting the train and test data

In [117]:
# Defining X and y from the Dataset
X = df_opt.iloc[:,:-1]
y = df_opt.Earning


In [118]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.10,random_state=0)

In [119]:
backwardEle(X, y)

###########################Backward Elemination###########################
                            OLS Regression Results                            
Dep. Variable:                Earning   R-squared:                       0.262
Model:                            OLS   Adj. R-squared:                  0.262
Method:                 Least Squares   F-statistic:                     766.4
Date:                Fri, 14 Sep 2018   Prob (F-statistic):               0.00
Time:                        21:05:17   Log-Likelihood:                -12917.
No. Observations:               30161   AIC:                         2.586e+04
Df Residuals:                   30146   BIC:                         2.599e+04
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------

In [120]:
# Here We can see X14 i.e. "Native-country" is having P value > 0.05
# o better to eleminate this column
X_train.drop(['Native-country'], axis=1, inplace=True)

# Same time drop this column from test set as well
X_test.drop(['Native-country'], axis=1, inplace=True)

X_train.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week
20590,35,2,361888,15,10,0,6,4,4,1,0,0,40
30652,45,2,83064,9,13,2,11,0,4,1,0,0,50
1200,30,1,235271,9,13,2,9,0,4,1,0,0,50
25333,18,2,236069,0,6,4,7,3,2,1,0,0,10
6242,47,2,436770,8,11,4,9,1,4,0,0,0,40


In [121]:
backwardEle(X, y)

###########################Backward Elemination###########################
                            OLS Regression Results                            
Dep. Variable:                Earning   R-squared:                       0.262
Model:                            OLS   Adj. R-squared:                  0.262
Method:                 Least Squares   F-statistic:                     766.4
Date:                Fri, 14 Sep 2018   Prob (F-statistic):               0.00
Time:                        21:05:18   Log-Likelihood:                -12917.
No. Observations:               30161   AIC:                         2.586e+04
Df Residuals:                   30146   BIC:                         2.599e+04
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------

In [122]:
# After eleminating 'Native-country', there is no more columns which are having P value > 0.05
# i.e. No more eleminations are required.

# Applying models

In [123]:
# Applying Logistic regression
# Logistic Regression 

from sklearn.linear_model import LogisticRegression
classifier=LogisticRegression()
classifier.fit(X_train,y_train)
scor=classifier.score(X_train,y_train)
scor

0.7912982611258473

In [124]:
# Cross validation from SKlearn

from sklearn.cross_validation import cross_val_score
cv = cross_val_score(estimator = classifier, X=X,y=y,scoring='accuracy',cv=50)
print(cv.mean())

0.7880036220844221


In [126]:
# DecisionTreeClassifier

from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0)
dtree.fit(X_train, y_train)
tscore=dtree.score(X_test, y_test)
print(tscore)

0.8409015578389128


In [134]:
# RandomForestRegressor

# Try different numbers of n_estimators - this will take a while or so

regr_rf = RandomForestClassifier(max_depth=30, random_state=2)
from sklearn.ensemble import RandomForestClassifier

estimators = np.arange(100, 200, 10)
scores = []
for n in estimators:
    regr_rf.set_params(n_estimators=n)
    regr_rf.fit(X_train, y_train)
    scores.append(regr_rf.score(X_test, y_test))
    #print(scores)
#max_sc_idx = scores.index(max(scores))

print(max(scores))

0.8528339410009944


In [137]:
# Bagging
from sklearn.ensemble import BaggingClassifier
bg=BaggingClassifier(DecisionTreeClassifier(),n_estimators=20, max_samples = 0.5, max_features=1.0)
bg.fit(X_train,y_train)
bgscore=bg.score(X_test,y_test)
print(bgscore)

0.848193569771296


In [138]:
# Boosting
from sklearn.ensemble import AdaBoostClassifier
bo=AdaBoostClassifier(n_estimators=50, learning_rate=1.)
bo.fit(X_train,y_train)
boscore=bo.score(X_test,y_test)
print(bgscore)

0.848193569771296


In [140]:
#fitting XGBoost to the Training set
import xgboost
classifier = xgboost.XGBClassifier()
classifier.fit(X_train,y_train)
cscore=classifier.score(X_train,y_train)
print(cscore)

0.8623636899498969


  if diff:
