In [3]:
import pandas as pd
import pylab
import numpy as np
import sklearn
from sklearn import linear_model
import sklearn.preprocessing as preprocessing
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from statsmodels.stats import proportion

# Load the UCI adult dataset 

 Download the data for this notebook from Source: https://archive.ics.uci.edu/ml/datasets/Adult

In [4]:
# Source: https://www.valentinmihov.com/2015/04/17/adult-income-data-set/

features = ["Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Martial Status",
        "Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
        "Hours per week", "Country", "Target"] 


train_url = '~/Documents/Teaching/490/data/adult.data'
test_url = '~/Documents/Teaching/490/data/adult.test'

original_train = pd.read_csv(train_url, names=features, sep=r'\s*,\s*', 
                             engine='python', na_values="?")
original_test = pd.read_csv(test_url, names=features, sep=r'\s*,\s*', 
                            engine='python', na_values="?", skiprows=1)

num_train = len(original_train)
print('Number of training points')
print(num_train)


print('Number of testing points')
print(len(original_test))


original = pd.concat([original_train, original_test])

original.head()



Number of training points
32561
Number of testing points
16281


Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Martial Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country,Target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# Descriptive Analysis

    Let's take a deeper dive to our data. We want to precit income so let's see how it is distributed.

In [5]:
original_less50K = len(original[(original.Target == '<=50K') | (original.Target == '<=50K.')])/len(original)
original_less50K

#do this for yourself on >50k

0.7607182343065395

In [6]:
#HERE
black = original[original['Race']=='Black'] #subgrouping
white = original[original['Race']=='White']
total_black = len(original[original['Race']=='Black'])
total_white = len(original[original['Race']=='White'])

print(total_black)
print(total_white)


black_less50K = len(black[(black.Target == '<=50K') | (black.Target == '<=50K.')])/total_black
black_greater50K = len(black[(black.Target == '>50K') | (black.Target == '>50K.')])/total_black
white_less50K = len(white[(white.Target == '<=50K') | (white.Target == '<=50K.')])/total_white
white_greater50K = len(white[(white.Target == '>50K') | (white.Target == '>50K.')])/total_white

target_percent = pd.DataFrame({'<=50K': [black_less50K, white_less50K], '>50K': [black_greater50K, white_greater50K]}, index=['Black', 'White'])
target_percent

4685
41762


Unnamed: 0,<=50K,>50K
Black,0.879189,0.120811
White,0.746013,0.253987


In [7]:
female = original[original['Sex']=='Female'] #subgrouping
male = original[original['Sex']=='Male']
total_female=len(female)
total_male=len(male)
print(total_female)
female_less50K = len(female[(female.Target == '<=50K') | (female.Target == '<=50K.')])/total_female
female_greater50K = len(female[(female.Target == '>50K') | (female.Target == '>50K.')])/total_female
male_less50K = len(male[(male.Target == '<=50K') | (male.Target == '<=50K.')])/total_male
male_greater50K = len(male[(male.Target == '>50K') | (male.Target == '>50K.')])/total_male

target_percent = pd.DataFrame({'<=50K': [female_less50K, male_less50K], '>50K': [female_greater50K, male_greater50K]}, index=['Female', 'Male'])
target_percent


16192


Unnamed: 0,<=50K,>50K
Female,0.890749,0.109251
Male,0.696233,0.303767


# Check-in point  Build a similar table for gender- Submit it on canvas by end of today. 

# Logistic Regression 

We are going to process our data to make it ready for logistic regression. Unlike linear regression logistic regression uses categorical data. Some important assumptions  of the logistic regression is: 

Logistic Regression Assumptions

    -Binary logistic regression requires the dependent variable to be binary.


    -For a binary regression, the factor level 1 of the dependent variable should represent the desired outcome.


    -Only the meaningful variables should be included.



    -The independent variables should be independent of each other.
    That is, the model should have little or no multicollinearity.



    -The independent variables are linearly related to the log odds.



    -Logistic regression requires quite large sample sizes.


![alt text](a.png)

## Cleaning the data 

Let's first prepare our data such that our target variable is 0 or 1.
By normalizing it and then putting it back into the test and train sets again.
We do this by defining a data_transform function

In [8]:
original.head()
labels = original['Target']
labels = labels.replace('<=50K', 0).replace('>50K', 1)
labels = labels.replace('<=50K.', 0).replace('>50K.', 1)
print(labels)

0        0
1        0
2        0
3        0
4        0
        ..
16276    0
16277    0
16278    0
16279    0
16280    1
Name: Target, Length: 48842, dtype: int64


In [9]:
# Redundant column
del original["Education"]

# Remove target variable
del original["Target"]

def data_transform(df):
    """Normalize features."""
    binary_data = pd.get_dummies(df)
    feature_cols = binary_data[binary_data.columns[:-2]]
    scaler = preprocessing.StandardScaler()
    data = pd.DataFrame(scaler.fit_transform(feature_cols), columns=feature_cols.columns)
    return data

data = data_transform(original)


train_data = data[:num_train]
train_labels = labels[:num_train]



test_data = data[num_train:]
test_labels = labels[num_train:]

data.head()

Unnamed: 0,Age,fnlwgt,Education-Num,Capital Gain,Capital Loss,Hours per week,Workclass_Federal-gov,Workclass_Local-gov,Workclass_Never-worked,Workclass_Private,...,Country_Philippines,Country_Poland,Country_Portugal,Country_Puerto-Rico,Country_Scotland,Country_South,Country_Taiwan,Country_Thailand,Country_Trinadad&Tobago,Country_United-States
0,0.025996,-1.061979,1.136512,0.146932,-0.217127,-0.034087,-0.173795,-0.26194,-0.01431,-1.50668,...,-0.077952,-0.042243,-0.037063,-0.061494,-0.02074,-0.048581,-0.036505,-0.024791,-0.023518,0.338083
1,0.828308,-1.007104,1.136512,-0.144804,-0.217127,-2.213032,-0.173795,-0.26194,-0.01431,-1.50668,...,-0.077952,-0.042243,-0.037063,-0.061494,-0.02074,-0.048581,-0.036505,-0.024791,-0.023518,0.338083
2,-0.046942,0.246034,-0.419335,-0.144804,-0.217127,-0.034087,-0.173795,-0.26194,-0.01431,0.663711,...,-0.077952,-0.042243,-0.037063,-0.061494,-0.02074,-0.048581,-0.036505,-0.024791,-0.023518,0.338083
3,1.047121,0.426663,-1.197259,-0.144804,-0.217127,-0.034087,-0.173795,-0.26194,-0.01431,0.663711,...,-0.077952,-0.042243,-0.037063,-0.061494,-0.02074,-0.048581,-0.036505,-0.024791,-0.023518,0.338083
4,-0.776316,1.40853,1.136512,-0.144804,-0.217127,-0.034087,-0.173795,-0.26194,-0.01431,0.663711,...,-0.077952,-0.042243,-0.037063,-0.061494,-0.02074,-0.048581,-0.036505,-0.024791,-0.023518,-2.957854


In [22]:
cls = linear_model.LogisticRegression()
cls.fit(train_data, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
predictions = cls.predict(test_data)

overall_scores = cls.score(test_data,test_labels)

overall_scores

0.8527117498925127

# Fairness examination 

![alt text](b.png)

# Independece 

Fairness through independence: 

Dempgraphic Parity (group parity): P{C=1,A=a}= P{C=1,A=b}
In this case for example: P{prediction=1, Male} = P{prediction=1,Female}

We can apply some relaxation and say 
P{prediction=1, Male}/ P{prediction=1,Female} >= 1- epsilon

epsilon is usually put to 0.2 

Note what we are calculting is independent of the truth. We just wanna make sure our classifer is equaly accepting 
female and male 

In [12]:
scores = cls.predict_proba(test_data)[:, 1]
d = {'target' : test_labels.values,
     'score' : scores,
     'prediction' : predictions,
     'race' : original_test['Race'],
     'gender' : original_test['Sex']}

marginals = pd.DataFrame(data=d, columns=['target', 'score', 'prediction', 'race', 'gender'])
marginals.head()

Unnamed: 0,target,score,prediction,race,gender
0,0,0.002752,0,Black,Male
1,0,0.1177,0,White,Male
2,1,0.456652,0,White,Male
3,1,0.754372,1,Black,Male
4,0,0.00122,0,White,Female


In [27]:
postive_class= marginals[(marginals['prediction'] == 1) ]

postive_female= postive_class[(postive_class['gender'] == 'Female')]
postive_male= postive_class[(postive_class['gender'] == 'Male')]


#print(len(postive_female)/len(postive_male))


male = marginals[marginals['gender'] == 'Male']
female = marginals[marginals['gender'] == 'Female']

#print(len(postive_female)/len(female))

#print(len(postive_male)/len(male))

print((len(postive_female)/len(female))/(len(postive_male)/len(male)))





0.15041681768756796
0.30133308247315777


# Check-in point

 Calculate the demographic parity for race and do the checkin quiz in canvas. 