In [165]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time
import pandas as pd
import numpy as np
import warnings
from itertools import combinations
import requests
import seaborn as sns
from matplotlib import pyplot as plt
import pickle
from pandas.api.types import CategoricalDtype
import warnings

# Notebook-wide settings: silence every Python warning and lift the pandas
# display limits so frames are rendered with all columns and all rows.
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [166]:
# Column labels supplied to read_csv via `names=`, so both files are read
# without taking a header row from the file.
columns_name = ["age", "workClass", "fnlwgt", "education", "education.num",
                "marital.status", "occupation", "relationship", "race", "sex",
                "capital_gain", "capital_loss", "hours_perweek", "native_country", "income"]

# Fields are separated by a comma with optional spaces on either side. A regex
# separator is only supported by the Python parsing engine, so request it
# explicitly instead of relying on pandas' implicit fallback (which warns).
train_df = pd.read_csv('../data/adult.data', names=columns_name, sep=' *, *',
                       engine='python')
# skiprows=1 drops the first line of the test file before parsing.
test_df = pd.read_csv('../data/adult.test', names=columns_name, sep=' *, *',
                      skiprows=1, engine='python')

# ----------------------------preprocessing--------------------------------

1. Concatenate train_df and test_df (the test rows without their 'income' label)

In [167]:
# Test features only: the label column is removed before stacking with train.
test_X = test_df.drop(columns=['income'])

In [168]:
# Stack the training rows on top of the label-less test rows with a fresh
# 0..n-1 index; test_X has no 'income' column, so those rows get NaN there.
df = pd.concat(objs=[train_df, test_X], axis=0, ignore_index=True)

2. Replace the "?" placeholder used for missing values with NaN

In [169]:
# Every cell equal to the string '?' is overwritten with NaN.
df[df.eq('?')] = np.nan

3. The dependent column ‘income’, which is to be predicted, is encoded as 0 and 1, which turns the task into a dichotomous (binary) classification problem.

In [170]:
# Encode the target as 0/1. The result is assigned back to the column rather
# than calling replace(inplace=True) on `df['income']`: that form mutates an
# intermediate Series and leaves `df` unchanged under pandas Copy-on-Write.
df['income'] = df['income'].replace({'<=50K': 0, '>50K': 1})

In [171]:
# Test labels, encoded as 0/1 (the test-file labels are matched with a
# trailing '.'). Using the Series returned by replace() keeps `test_df` itself
# untouched instead of mutating a column pulled out of it in place.
Test_y = test_df['income'].replace({'<=50K.': 0, '>50K.': 1})

4. Delete useless features

In [172]:
# Remove the two columns that are not used as features.
df = df.drop(columns=['fnlwgt', 'education.num'])

# ----------------------------------Feature engineering--------------------------------------

1. Write a custom transformer that selects the columns of a given dtype

In [173]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_score

In [174]:
class featureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns of a single dtype.

    Parameters
    ----------
    type : str or dtype
        Passed to ``DataFrame.select_dtypes`` as the only ``include`` entry.
    """

    def __init__(self, type):
        self.type = type

    def fit(self, X, y=None):
        """Nothing is learned; the transformer is returned unchanged."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` whose dtype matches ``self.type``."""
        wanted_dtypes = [self.type]
        selected = X.select_dtypes(include=wanted_dtypes)
        return selected

2. Developed the numerical Data Pipeline

In [175]:
class array_df(BaseEstimator, TransformerMixin):
    """Wrap array-like data in a DataFrame with named columns.

    Used as the last step of the numerical pipeline to turn the scaler's
    array output back into a DataFrame.

    Parameters
    ----------
    name : sequence of str, optional
        Column labels for the resulting frame. When omitted, the labels
        ``['age', 'capital-gain', 'capital-loss', 'hours.per.week']`` are used.
    """

    # Labels used when no `name` is supplied.
    _default_names = ('age', 'capital-gain', 'capital-loss', 'hours.per.week')

    def __init__(self, name=None):
        # The constructor was previously spelled `_init_`, so it never ran.
        # The argument is stored under its own name so that
        # BaseEstimator.get_params() / clone() can read it back.
        self.name = name

    def fit(self, X, y=None):
        """Nothing is learned; the transformer is returned unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` as a DataFrame labelled with the configured columns."""
        labels = self._default_names if self.name is None else self.name
        return pd.DataFrame(X, columns=list(labels))

In [176]:
# Numerical branch: keep the 'int' columns, standardise them, then convert the
# scaler output back into a DataFrame.
num_pipeline = Pipeline([
    ("num_selector", featureSelector(type='int')),
    ("scaler", StandardScaler()),
    ("arrayToDf", array_df()),
])


Complete Numerical Pipeline

In [177]:
# Complete numerical pipeline: dtype selection -> standard scaling -> DataFrame.
num_pipeline = Pipeline(
    steps=[
        ("num_selector", featureSelector("int")),
        ("scaler", StandardScaler()),
        ("arrayToDf", array_df()),
    ]
)

3. Developed the Categorical Data Pipeline

Replacing the missing values with the most frequently occurring value in each column in the categorical columns.

In [178]:
class CategoricalImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in categorical columns.

    Parameters
    ----------
    columns : sequence of str, optional
        Columns to impute. When None, every column of the frame passed to
        ``fit`` is imputed.
    strategy : str, default 'most_frequent'
        'most_frequent' fills each column with its most frequent value seen
        during ``fit``; any other value fills with the constant string '0'.

    Attributes
    ----------
    columns_ : list
        Columns resolved during ``fit``.
    fill : dict
        Mapping from column name to the value used to fill it.
    """

    def __init__(self, columns=None, strategy='most_frequent'):
        self.strategy = strategy
        self.columns = columns

    def fit(self, X, y=None):
        """Learn the fill value for each target column from ``X``."""
        # Resolve the columns first so that `columns=None` works, and keep the
        # constructor parameter itself untouched.
        if self.columns is None:
            target_columns = list(X.columns)
        else:
            target_columns = list(self.columns)

        # Compare with `==`: `is` tests object identity, not string equality.
        if self.strategy == 'most_frequent':
            fill = {}
            for col in target_columns:
                # value_counts() ignores NaN and is sorted by frequency.
                counts = X[col].value_counts()
                # A column with no observed value has no mode; fall back to
                # the same constant used by the non-'most_frequent' strategy.
                fill[col] = counts.index[0] if len(counts) else '0'
        else:
            fill = {col: '0' for col in target_columns}

        self.columns_ = target_columns
        self.fill = fill
        return self

    def transform(self, X):
        """Return a copy of ``X`` with NaNs in the target columns filled."""
        X_2 = X.copy()
        for col in self.columns_:
            X_2[col] = X_2[col].fillna(self.fill[col])
        return X_2

Used pd.get_dummies to convert the categorical values to numerical values.

In [179]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode object columns with ``pd.get_dummies`` over fixed categories.

    Parameters
    ----------
    dropFirst : bool, default True
        Forwarded to ``pd.get_dummies`` as ``drop_first``.

    NOTE(review): ``fit`` does not read its ``X`` argument; the category lists
    are taken from the module-level ``df``, which must exist before fitting.
    """

    def __init__(self, dropFirst=True):
        self.categories = dict()
        self.dropFirst = dropFirst

    def fit(self, X, y=None):
        """Record, per object column of the global ``df``, its values ordered by frequency."""
        object_frame = df.select_dtypes(include=['object'])
        self.categories.update({
            name: object_frame[name].value_counts().index.tolist()
            for name in object_frame.columns
        })
        return self

    def transform(self, X):
        """Return dummy columns for the object columns of ``X``."""
        encoded = X.copy().select_dtypes(include=['object'])
        for name in encoded.columns:
            # Pinning the dtype to the recorded categories fixes which dummy
            # columns get_dummies will emit.
            fixed_dtype = CategoricalDtype(self.categories[name])
            encoded[name] = encoded[name].astype({name: fixed_dtype})

        dummies = pd.get_dummies(encoded, drop_first=self.dropFirst)
        return dummies

Complete Categorical Pipeline

In [180]:
# Categorical branch: keep the object columns, impute the three columns listed
# below, then one-hot encode.
cat_pipeline = Pipeline([
    ("cat_selector", featureSelector(type='object')),
    (
        "cat_imputer",
        CategoricalImputer(columns=['occupation', 'workClass', 'native_country']),
    ),
    ("cat_encoder", CategoricalEncoder()),
])

4. Split df back into train data and test data

In [181]:
# Rows that carry a label are the training rows; the test rows were stacked
# without 'income', so their label is NaN. The mask is computed once.
is_labelled = df['income'].notnull()

# Reset the index on both splits: the later column-wise concat aligns on index,
# and the numerical pipeline output gets a fresh 0..n-1 index.
Train_x = df.loc[is_labelled].drop(columns=['income']).reset_index(drop=True)
Train_y = df.loc[is_labelled, 'income'].reset_index(drop=True)
Test_x = df.loc[~is_labelled].drop(columns=['income']).reset_index(drop=True)
# Test_y was already built from test_df['income'] in the preprocessing section;
# the former `Test_y = Test_y` self-assignment was a no-op and is removed.

5. Run the completed pipelines on the train and test data

In [182]:
# The two branches are fitted and applied separately on the training features;
# the FeatureUnion variant below is left disabled.
# full_pipeline = FeatureUnion([("num_pipeline", num_pipeline),
#                 ("cat_pipeline", cat_pipeline)])

Train_x_num = num_pipeline.fit_transform(X=Train_x)
Train_x_cat = cat_pipeline.fit_transform(X=Train_x)

In [183]:
# Put the scaled numerical features, the dummy columns and the label side by side.
Train = pd.concat(objs=[Train_x_num, Train_x_cat, Train_y], axis='columns')

In [184]:
# Apply the pipelines that were fitted on the training features. Calling
# fit_transform here would re-fit the scaler and the imputer on the test split,
# so the test features would be scaled and filled with statistics that differ
# from the ones used for training.
Test_x_num = num_pipeline.transform(Test_x)
Test_x_cat = cat_pipeline.transform(Test_x)

In [185]:
# Put the scaled numerical features, the dummy columns and the label side by side.
Test = pd.concat(objs=[Test_x_num, Test_x_cat, Test_y], axis='columns')

In [186]:
# Persist both frames without the index column.
# NOTE(review): the inputs were read from '../data/' but these are written to
# './data/' — confirm the target directory is intended.
Train.to_csv(path_or_buf='./data/Train_LR.csv', index=False)
Test.to_csv(path_or_buf='./data/Test_LR.csv', index=False)