In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB

import warnings
warnings.filterwarnings("ignore") # Shhhh

In [2]:
def data_cleaning(df):
    # remove duplicates
    df = df[~df.duplicated()]
    
    # strip blank space in front of  values
    lis = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'sex', 'native-country', 'wage']
    for col in lis:
        df[col] = df[col].str.strip()
        
    # reformat wage
    df['wage'] = df['wage'].map({'<=50K':0, '>50K':1})
    
    # reformat sex 
    df['sex'] = df['sex'].map({'Male':1, "Female":0})
    
    # reformating education
    df['education'] = df['education'].map({'Preschool':0, '1st-4th':1, '5th-6th':2, '7th-8th':3, '9th': 4, '10th': 5,
                        '11th':6, '12th':7, 'HS-grad': 8, 'Some-college':9, 'Assoc-acdm': 10, 'Assoc-voc': 11,
                        'Bachelors': 12, 'Prof-school': 14, 'Masters': 13, 'Doctorate': 15})
    
    # dumying all non-numeric features 
    df = pd.get_dummies(df)
    return df

In [3]:
def test_data_cleaning(df):
    # strip blank space in front of  values
    lis = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'sex', 'native-country']
    for col in lis:
        df[col] = df[col].str.strip()
    
    # reformat sex 
    df['sex'] = df['sex'].map({'Male':1, "Female":0})
    
    # reformating education
    df['education'] = df['education'].map({'Preschool':0, '1st-4th':1, '5th-6th':2, '7th-8th':3, '9th': 4, '10th': 5,
                        '11th':6, '12th':7, 'HS-grad': 8, 'Some-college':9, 'Assoc-acdm': 10, 'Assoc-voc': 11,
                        'Bachelors': 12, 'Prof-school': 14, 'Masters': 13, 'Doctorate': 15})
    
    # dumying all non-numeric features 
    df = pd.get_dummies(df)
    return df


In [4]:
# Reading data 
df = pd.read_csv('data/large_train_sample.csv')
test_df = pd.read_csv('data/test_data.csv')

In [5]:
# Cleaning df 
df = data_cleaning(df)

In [6]:
df.head()

Unnamed: 0,age,fnlwgt,education,education-num,sex,capital-gain,capital-loss,hours-per-week,wage,workclass_?,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,77516,12,13,1,2174,0,40,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,12,13,1,0,0,13,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,8,9,1,0,0,40,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,6,7,1,0,0,40,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,12,13,0,0,0,40,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
for col in df.columns:
    if col not in test_df.columns:
        print(col)

wage
workclass_?
workclass_Federal-gov
workclass_Local-gov
workclass_Never-worked
workclass_Private
workclass_Self-emp-inc
workclass_Self-emp-not-inc
workclass_State-gov
workclass_Without-pay
marital-status_Divorced
marital-status_Married-AF-spouse
marital-status_Married-civ-spouse
marital-status_Married-spouse-absent
marital-status_Never-married
marital-status_Separated
marital-status_Widowed
occupation_?
occupation_Adm-clerical
occupation_Armed-Forces
occupation_Craft-repair
occupation_Exec-managerial
occupation_Farming-fishing
occupation_Handlers-cleaners
occupation_Machine-op-inspct
occupation_Other-service
occupation_Priv-house-serv
occupation_Prof-specialty
occupation_Protective-serv
occupation_Sales
occupation_Tech-support
occupation_Transport-moving
relationship_Husband
relationship_Not-in-family
relationship_Other-relative
relationship_Own-child
relationship_Unmarried
relationship_Wife
native-country_?
native-country_Cambodia
native-country_Canada
native-country_China
native-c

Model Prep

In [24]:
X = df[['capital-gain','marital-status_Married-civ-spouse',
        'education-num','hours-per-week', 'sex',
        'age', 'relationship_Own-child',
        'occupation_Exec-managerial', 'relationship_Wife',
        'capital-loss', 'marital-status_Never-married',
        'occupation_Prof-specialty','occupation_Other-service',
        'occupation_Priv-house-serv', 'occupation_Farming-fishing',
        'education', 'occupation_Sales', 'occupation_Tech-support',
        'occupation_?' ,'relationship_Other-relative']]

ss = StandardScaler()
Xsc = ss.fit_transform(X)

y = df['wage']
X_train, X_test, y_train, y_test = train_test_split(Xsc, y, random_state=42, stratify=y)

In [25]:
X_train.shape

(24402, 20)

Modeling

Baseline

In [10]:
y.value_counts(normalize=True)

0    0.759074
1    0.240926
Name: wage, dtype: float64

Logistic Regression

In [11]:
lr = LogisticRegression(random_state=42, solver = 'liblinear')

params = {
    'penalty' : ['l1', 'l2'],
    'C'       : [.5, 1.0],
}

gs = GridSearchCV(lr,
                 param_grid=params,
                 cv=5)

gs.fit(X_train, y_train)

print('Cross Val :', cross_val_score(gs, Xsc, y, cv=5).mean())
print('Training  :', gs.score(X_train, y_train))
print('Testing   :', gs.score(X_test,y_test))

pred_proba = [i[1] for i in gs.predict_proba(X_test)]
pred_df = pd.DataFrame({'true_values': y_test,
                        'pred_probs':pred_proba})
print('ROC AUC Score:', roc_auc_score(pred_df['true_values'], pred_df['pred_probs']))


Cross Val : 0.8495868616361235
Training  : 0.8480452421932628
Testing   : 0.8540872771972956
ROC AUC Score: 0.9085959679418327


In [12]:
gs.best_params_

{'C': 0.5, 'penalty': 'l2'}

Submission

In [29]:
test_df.columns


Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')

In [30]:
X_val = test_df[['capital-gain','marital-status_Married-civ-spouse',
        'education-num','hours-per-week', 'sex',
        'age', 'relationship_Own-child',
        'occupation_Exec-managerial', 'relationship_Wife',
        'capital-loss', 'marital-status_Never-married',
        'occupation_Prof-specialty','occupation_Other-service',
        'occupation_Priv-house-serv', 'occupation_Farming-fishing',
        'education', 'occupation_Sales', 'occupation_Tech-support',
        'occupation_?' ,'relationship_Other-relative']]

In [33]:
# Scaling data 
test_scaled = ss.transform(X_val)

In [34]:
# Getting predictions
preds_val = [i[1] for i in gs.predict_proba(test_scaled)]

In [38]:
preds_col = pd.DataFrame({'wage': preds_val})

In [39]:
preds_col.to_csv('./submission.csv', index = False)

In [40]:
#test_df.shape

In [41]:
#preds_col.shape