In [32]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error,  mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from math import sqrt

In [2]:
# Read data from ICS and save as DF
path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df = pd.read_csv(path,
                names = ["age", "workclass","fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country","> or < 50K"],
                index_col=False,
                na_values=[" ?"])

In [3]:
df.shape

(32561, 15)

In [4]:
df.sample(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,> or < 50K
5940,53,Private,152373,Some-college,10,Divorced,Craft-repair,Not-in-family,White,Male,0,0,48,United-States,>50K
22002,32,Private,228357,Assoc-voc,11,Divorced,Other-service,Unmarried,White,Female,0,0,40,,<=50K
21207,26,Private,202091,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,40,United-States,<=50K
25365,36,Private,185394,10th,6,Never-married,Handlers-cleaners,Not-in-family,White,Female,0,0,34,United-States,<=50K
23520,22,Federal-gov,262819,Some-college,10,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K
16480,35,Private,172186,Some-college,10,Never-married,Craft-repair,Own-child,White,Male,0,0,40,United-States,<=50K
30704,37,Private,29145,Assoc-voc,11,Never-married,Other-service,Not-in-family,White,Male,0,0,40,United-States,<=50K
27387,30,Private,182714,Some-college,10,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,40,England,<=50K
24117,28,Private,282389,HS-grad,9,Never-married,Transport-moving,Not-in-family,White,Male,0,0,60,United-States,<=50K
6253,42,Private,113324,HS-grad,9,Widowed,Sales,Unmarried,White,Male,0,0,40,United-States,<=50K


In [5]:
df['race'].value_counts()

 White                 27816
 Black                  3124
 Asian-Pac-Islander     1039
 Amer-Indian-Eskimo      311
 Other                   271
Name: race, dtype: int64

In [6]:
df['marital-status'].value_counts()

 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: marital-status, dtype: int64

In [7]:
# Create dummy variables for sex
df=pd.get_dummies(df, columns = ['sex'], prefix='', prefix_sep='')
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country', '> or < 50K',
       ' Female', ' Male'],
      dtype='object')

In [8]:
# Create dummy variables for sex
df=pd.get_dummies(df, columns = ['race'], prefix='', prefix_sep='')
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country', '> or < 50K',
       ' Female', ' Male', ' Amer-Indian-Eskimo', ' Asian-Pac-Islander',
       ' Black', ' Other', ' White'],
      dtype='object')

In [9]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,capital-gain,capital-loss,hours-per-week,native-country,> or < 50K,Female,Male,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,2174,0,40,United-States,<=50K,0,1,0,0,0,0,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,0,0,13,United-States,<=50K,0,1,0,0,0,0,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,0,0,40,United-States,<=50K,0,1,0,0,0,0,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,0,0,40,United-States,<=50K,0,1,0,0,1,0,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,0,0,40,Cuba,<=50K,1,0,0,0,1,0,0


In [10]:
df['workclass'].value_counts()

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64

In [11]:
df['relationship'].value_counts()

 Husband           13193
 Not-in-family      8305
 Own-child          5068
 Unmarried          3446
 Wife               1568
 Other-relative      981
Name: relationship, dtype: int64

In [12]:
# Create dummy variables for relationship
df=pd.get_dummies(df, columns = ['relationship'], prefix='', prefix_sep='')
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'capital-gain', 'capital-loss',
       'hours-per-week', 'native-country', '> or < 50K', ' Female', ' Male',
       ' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other',
       ' White', ' Husband', ' Not-in-family', ' Other-relative', ' Own-child',
       ' Unmarried', ' Wife'],
      dtype='object')

In [13]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,capital-gain,capital-loss,hours-per-week,...,Asian-Pac-Islander,Black,Other,White,Husband,Not-in-family,Other-relative,Own-child,Unmarried,Wife
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,2174,0,40,...,0,0,0,1,0,1,0,0,0,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,0,0,13,...,0,0,0,1,1,0,0,0,0,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,0,0,40,...,0,0,0,1,0,1,0,0,0,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,0,0,40,...,0,1,0,0,1,0,0,0,0,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,0,0,40,...,0,1,0,0,0,0,0,0,0,1


In [14]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'capital-gain', 'capital-loss',
       'hours-per-week', 'native-country', '> or < 50K', ' Female', ' Male',
       ' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other',
       ' White', ' Husband', ' Not-in-family', ' Other-relative', ' Own-child',
       ' Unmarried', ' Wife'],
      dtype='object')

In [15]:
features = ['age', 'education-num', 'capital-gain', 'capital-gain', 'capital-loss', 'hours-per-week', ' Female', ' Male',' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other', ' White', ' Husband', ' Not-in-family', ' Other-relative', ' Own-child', ' Unmarried', ' Wife']

In [16]:
# Initiate the scaler (create an instance of the sklearn class
std_scaler = StandardScaler()
std_scaler

In [17]:
# Fit the scaler to our data
scaled_results = std_scaler.fit_transform(df[features])
scaled_results[:2]

array([[ 0.03067056,  1.13473876,  0.1484529 ,  0.1484529 , -0.21665953,
        -0.03542945, -0.70307135,  0.70307135, -0.09820087, -0.18155194,
        -0.32576824, -0.09161163,  0.4130197 , -0.82533335,  1.70899099,
        -0.17624972, -0.42934582, -0.34403232, -0.22492681],
       [ 0.83710898,  1.13473876, -0.14592048, -0.14592048, -0.21665953,
        -2.22215312, -0.70307135,  0.70307135, -0.09820087, -0.18155194,
        -0.32576824, -0.09161163,  0.4130197 ,  1.21163164, -0.58514059,
        -0.17624972, -0.42934582, -0.34403232, -0.22492681]])

In [18]:
# convert those results into a dataframe
scaled_df = pd.DataFrame(scaled_results, columns=features)
scaled_df.head()

Unnamed: 0,age,education-num,capital-gain,capital-gain.1,capital-loss,hours-per-week,Female,Male,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White,Husband,Not-in-family,Other-relative,Own-child,Unmarried,Wife
0,0.030671,1.134739,0.148453,0.148453,-0.21666,-0.035429,-0.703071,0.703071,-0.098201,-0.181552,-0.325768,-0.091612,0.41302,-0.825333,1.708991,-0.17625,-0.429346,-0.344032,-0.224927
1,0.837109,1.134739,-0.14592,-0.14592,-0.21666,-2.222153,-0.703071,0.703071,-0.098201,-0.181552,-0.325768,-0.091612,0.41302,1.211632,-0.585141,-0.17625,-0.429346,-0.344032,-0.224927
2,-0.042642,-0.42006,-0.14592,-0.14592,-0.21666,-0.035429,-0.703071,0.703071,-0.098201,-0.181552,-0.325768,-0.091612,0.41302,-0.825333,1.708991,-0.17625,-0.429346,-0.344032,-0.224927
3,1.057047,-1.197459,-0.14592,-0.14592,-0.21666,-0.035429,-0.703071,0.703071,-0.098201,-0.181552,3.069667,-0.091612,-2.421192,1.211632,-0.585141,-0.17625,-0.429346,-0.344032,-0.224927
4,-0.775768,1.134739,-0.14592,-0.14592,-0.21666,-0.035429,1.422331,-1.422331,-0.098201,-0.181552,3.069667,-0.091612,-2.421192,-0.825333,-0.585141,-0.17625,-0.429346,-0.344032,4.445891


In [19]:
#Drop original columns and merge scaled df
df = df.drop(features, axis=1)
df = pd.concat([df, scaled_df], axis=1)
df.sample(10)

Unnamed: 0,workclass,fnlwgt,education,marital-status,occupation,native-country,> or < 50K,age,education-num,capital-gain,...,Asian-Pac-Islander,Black,Other,White,Husband,Not-in-family,Other-relative,Own-child,Unmarried,Wife
12134,State-gov,77651,HS-grad,Married-civ-spouse,Craft-repair,United-States,<=50K,1.057047,-0.42006,-0.14592,...,-0.181552,-0.325768,-0.091612,0.41302,1.211632,-0.585141,-0.17625,-0.429346,-0.344032,-0.224927
31278,Private,122206,Assoc-voc,Married-civ-spouse,Sales,United-States,<=50K,0.763796,0.35734,-0.14592,...,-0.181552,-0.325768,-0.091612,0.41302,1.211632,-0.585141,-0.17625,-0.429346,-0.344032,-0.224927
4506,Private,181020,HS-grad,Never-married,Handlers-cleaners,United-States,<=50K,-0.26258,-0.42006,-0.14592,...,-0.181552,-0.325768,-0.091612,0.41302,-0.825333,-0.585141,-0.17625,2.329125,-0.344032,-0.224927
29311,Private,198751,Bachelors,Never-married,Craft-repair,Vietnam,<=50K,-0.55583,1.134739,-0.14592,...,5.508066,-0.325768,-0.091612,-2.421192,-0.825333,1.708991,-0.17625,-0.429346,-0.344032,-0.224927
24465,Private,187720,Masters,Married-civ-spouse,Sales,,>50K,0.250608,1.523438,1.888424,...,-0.181552,-0.325768,-0.091612,0.41302,1.211632,-0.585141,-0.17625,-0.429346,-0.344032,-0.224927
30611,Private,316769,11th,Never-married,Other-service,Jamaica,<=50K,-0.482518,-1.197459,-0.14592,...,-0.181552,3.069667,-0.091612,-2.421192,-0.825333,-0.585141,-0.17625,-0.429346,2.906704,-0.224927
12392,Private,304602,HS-grad,Never-married,Handlers-cleaners,United-States,<=50K,-1.069018,-0.42006,-0.14592,...,-0.181552,-0.325768,-0.091612,0.41302,-0.825333,-0.585141,-0.17625,2.329125,-0.344032,-0.224927
16046,Private,118303,HS-grad,Widowed,Sales,United-States,>50K,1.42361,-0.42006,-0.14592,...,-0.181552,-0.325768,-0.091612,0.41302,-0.825333,1.708991,-0.17625,-0.429346,-0.344032,-0.224927
28246,Self-emp-not-inc,180446,Some-college,Married-civ-spouse,Tech-support,United-States,>50K,0.617171,-0.03136,-0.14592,...,-0.181552,3.069667,-0.091612,-2.421192,1.211632,-0.585141,-0.17625,-0.429346,-0.344032,-0.224927
15623,Private,233571,Some-college,Divorced,Exec-managerial,United-States,<=50K,-0.189267,-0.03136,-0.14592,...,-0.181552,-0.325768,-0.091612,0.41302,-0.825333,1.708991,-0.17625,-0.429346,-0.344032,-0.224927


In [20]:
print(df.isnull().sum())

workclass              1836
fnlwgt                    0
education                 0
marital-status            0
occupation             1843
native-country          583
> or < 50K                0
age                       0
education-num             0
capital-gain              0
capital-gain              0
capital-loss              0
hours-per-week            0
 Female                   0
 Male                     0
 Amer-Indian-Eskimo       0
 Asian-Pac-Islander       0
 Black                    0
 Other                    0
 White                    0
 Husband                  0
 Not-in-family            0
 Other-relative           0
 Own-child                0
 Unmarried                0
 Wife                     0
dtype: int64


In [21]:
#drop NA values
df.dropna(axis=0,how='any',inplace=True)
print(df.isnull().sum())

workclass              0
fnlwgt                 0
education              0
marital-status         0
occupation             0
native-country         0
> or < 50K             0
age                    0
education-num          0
capital-gain           0
capital-gain           0
capital-loss           0
hours-per-week         0
 Female                0
 Male                  0
 Amer-Indian-Eskimo    0
 Asian-Pac-Islander    0
 Black                 0
 Other                 0
 White                 0
 Husband               0
 Not-in-family         0
 Other-relative        0
 Own-child             0
 Unmarried             0
 Wife                  0
dtype: int64


In [22]:
print(df.isnull().sum())

workclass              0
fnlwgt                 0
education              0
marital-status         0
occupation             0
native-country         0
> or < 50K             0
age                    0
education-num          0
capital-gain           0
capital-gain           0
capital-loss           0
hours-per-week         0
 Female                0
 Male                  0
 Amer-Indian-Eskimo    0
 Asian-Pac-Islander    0
 Black                 0
 Other                 0
 White                 0
 Husband               0
 Not-in-family         0
 Other-relative        0
 Own-child             0
 Unmarried             0
 Wife                  0
dtype: int64


In [23]:
df.columns

Index(['workclass', 'fnlwgt', 'education', 'marital-status', 'occupation',
       'native-country', '> or < 50K', 'age', 'education-num', 'capital-gain',
       'capital-gain', 'capital-loss', 'hours-per-week', ' Female', ' Male',
       ' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other',
       ' White', ' Husband', ' Not-in-family', ' Other-relative', ' Own-child',
       ' Unmarried', ' Wife'],
      dtype='object')

In [24]:
#Establish the target variable
y = df['> or < 50K']

In [25]:
#Establish the matrix of features
X = df[['age', 'education-num', 'capital-gain',
       'capital-gain', 'capital-loss', 'hours-per-week', ' Female', ' Male',
       ' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other',
       ' White', ' Husband', ' Not-in-family', ' Other-relative', ' Own-child',
       ' Unmarried', ' Wife']]

In [26]:
# instantiate logistic regression
logreg = LogisticRegression()

In [27]:
# Test / Train split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24)

In [28]:
# fit your model on the training dataset
logreg.fit(X_train, y_train)

In [29]:
# predict on the testing dataset
y_preds=logreg.predict(X_test)
print('preds', list(y_preds[:10]))
print('truth', list(y_test[:10]))

preds [' <=50K', ' <=50K', ' <=50K', ' >50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' >50K', ' <=50K']
truth [' <=50K', ' <=50K', ' <=50K', ' >50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K']


In [30]:
# does it match y-test?
print(len(y_preds))
print(len(y_test))

7541
7541


In [33]:
print(accuracy_score(y_preds, y_test))

0.8390133934491447


In [34]:
print(y_train.value_counts())

 <=50K    16994
 >50K      5627
Name: > or < 50K, dtype: int64
