In [1]:
import pandas as pd

from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing

### logistic regression using one-hot-encoded features

In [2]:
def run(fold):
    # load the full training data with folds
    df = pd.read_csv("../input/adult_folds.csv")
    
    # list of numerical columns
    num_cols = [
                "fnlwgt",
                "age",
                "capital.gain",
                "capital.loss",
                "hours.per.week"
                ]
    
    # drop numerical columns
    df = df.drop(num_cols, axis=1)
    
    # map targets to 0s and 1s
    target_mapping = {
                        "<=50K": 0,
                        ">50K": 1
                        }
    
    df.loc[:, "income"] = df.income.map(target_mapping)
    
    # all columns are features except income and kfold columns
    features = [
                    f for f in df.columns if f not in ("kfold", "income")
                ]
    # fill all NaN values with NONE
    # note that I am converting all columns to "strings"
    # it doesnt matter because all are categories
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # initialize OneHotEncoder from scikit-learn
    ohe = preprocessing.OneHotEncoder()
    
    # fit ohe on training + validation features
    full_data = pd.concat(
                    [df_train[features], df_valid[features]],
                    axis=0
                )
    
    ohe.fit(full_data[features])
    
    # transform training data
    x_train = ohe.transform(df_train[features])
    
    # transform validation data
    x_valid = ohe.transform(df_valid[features])
    
    # initialize Logistic Regression model
    model = linear_model.LogisticRegression()
    
    # fit model on training data (ohe)
    model.fit(x_train, df_train.income.values)
    
    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

In [3]:
for fold_ in range(5):
    run(fold_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fold = 0, AUC = 0.8794834201695059


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fold = 1, AUC = 0.8876246873142462


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fold = 2, AUC = 0.8852609687685753


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fold = 3, AUC = 0.8681225903589591
Fold = 4, AUC = 0.8728581541840037


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### xgboost with label-encoded features

In [4]:
import pandas as pd
import xgboost as xgb
from sklearn import metrics
from sklearn import preprocessing

In [7]:
def run(fold):
    # load the full training data with folds
    df = pd.read_csv("../input/adult_folds.csv")
    
    # list of numerical columns
    num_cols = [
                "fnlwgt",
                "age",
                "capital.gain",
                "capital.loss",
                "hours.per.week"
                ]
    
    # drop numerical columns
    df = df.drop(num_cols, axis=1)
    
    # map targets to 0s and 1s
    target_mapping = {
                        "<=50K": 0,
                        ">50K": 1
                        }
    
    df.loc[:, "income"] = df.income.map(target_mapping)
    
    # all columns are features except kfold & income columns
    features = [
                    f for f in df.columns if f not in ("kfold", "income")
                ]
    
    # fill all NaN values with NONE
    # note that I am converting all columns to "strings"
    # it doesnt matter because all are categories
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # now its time to label encode the features
    for col in features:
        # initialize LabelEncoder for each feature column
        lbl = preprocessing.LabelEncoder()
        
        # fit label encoder on all data
        lbl.fit(df[col])
        
        # transform all the data
        df.loc[:, col] = lbl.transform(df[col])
        
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # get training data
    x_train = df_train[features].values
    
    # get validation data
    x_valid = df_valid[features].values
    
    # initialize xgboost model
    model = xgb.XGBClassifier(
                                n_jobs=-1,
                                max_depth=7,
                                n_estimators=200
                            )
    
    # fit model on training data (ohe)
    model.fit(x_train, df_train.income.values)
    
    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

In [8]:
for fold_ in range(5):
    run(fold_)

Fold = 0, AUC = 0.8764108944332032
Fold = 1, AUC = 0.8848888159632786
Fold = 2, AUC = 0.8816601162613102
Fold = 3, AUC = 0.8662335762581732
Fold = 4, AUC = 0.8698983461709927


### xgboost model with numerical features

In [9]:
import pandas as pd
import xgboost as xgb
from sklearn import metrics
from sklearn import preprocessing

In [10]:
def run(fold):
    # load the full training data with folds
    df = pd.read_csv("../input/adult_folds.csv")
    
    # list of numerical columns
    num_cols = [
                "fnlwgt",
                "age",
                "capital.gain",
                "capital.loss",
                "hours.per.week"
                ]
    
    # map targets to 0s and 1s
    target_mapping = {
                "<=50K": 0,
                ">50K": 1
                }
    
    df.loc[:, "income"] = df.income.map(target_mapping)
    
    # all columns are features except kfold & income columns
    features = [
                    f for f in df.columns if f not in ("kfold", "income")
                ]
    
    # fill all NaN values with NONE
    # note that I am converting all columns to "strings"
    # it doesnt matter because all are categories
    for col in features:
        # do not encode the numerical columns
        if col not in num_cols:
            df.loc[:, col] = df[col].astype(str).fillna("NONE")
            
    # now its time to label encode the features
    for col in features:
        if col not in num_cols:
            # initialize LabelEncoder for each feature column
            lbl = preprocessing.LabelEncoder()
            
            # fit label encoder on all data
            lbl.fit(df[col])
            
            # transform all the data
            df.loc[:, col] = lbl.transform(df[col])
            
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # get training data
    x_train = df_train[features].values
    
    # get validation data
    x_valid = df_valid[features].values
    
    # initialize xgboost model
    model = xgb.XGBClassifier(
                            n_jobs=-1
                            )
    
    # fit model on training data (ohe)
    model.fit(x_train, df_train.income.values)
    
    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

In [11]:
for fold_ in range(5):
    run(fold_)

Fold = 0, AUC = 0.9209790185449889
Fold = 1, AUC = 0.9247157449144706
Fold = 2, AUC = 0.9269329887598243
Fold = 3, AUC = 0.9119349082169275
Fold = 4, AUC = 0.9166408030141667


### take all the categorical columns and create all combinations of degree two

In [12]:
import itertools
import pandas as pd

import xgboost as xgb
from sklearn import metrics
from sklearn import preprocessing

In [13]:
def feature_engineering(df, cat_cols):
    """
    This function is used for feature engineering
    :param df: the pandas dataframe with train/test data
    :param cat_cols: list of categorical columns
    :return: dataframe with new features
    """
    
    # this will create all 2-combinations of values
    # in this list
    # for example:
    # list(itertools.combinations([1,2,3], 2)) will return
    # [(1, 2), (1, 3), (2, 3)]
    combi = list(itertools.combinations(cat_cols, 2))
    
    for c1, c2 in combi:
        df.loc[
        :,
        c1 + "_" + c2
        ] = df[c1].astype(str) + "_" + df[c2].astype(str)
        
    return df

In [16]:
def run(fold):
    # load the full training data with folds
    df = pd.read_csv("../input/adult_folds.csv")
    
    # list of numerical columns
    num_cols = [
                "fnlwgt",
                "age",
                "capital.gain",
                "capital.loss",
                "hours.per.week"
                ]
    
    # map targets to 0s and 1s
    target_mapping = {
                "<=50K": 0,
                ">50K": 1
                }
    
    df.loc[:, "income"] = df.income.map(target_mapping)
    
    # list of categorical columns for feature engineering
    cat_cols = [
                c for c in df.columns if c not in num_cols
                and c not in ("kfold", "income")
                ]
    
    # add new features
    df = feature_engineering(df, cat_cols)
    
    # all columns are features except kfold & income columns
    features = [
                f for f in df.columns if f not in ("kfold", "income")
                ]
    
    # fill all NaN values with NONE
    # note that I am converting all columns to "strings"
    # it doesnt matter because all are categories
    for col in features:
        # do not encode the numerical columns
        if col not in num_cols:
            df.loc[:, col] = df[col].astype(str).fillna("NONE")

    # now its time to label encode the features
    for col in features:
        if col not in num_cols:
            # initialize LabelEncoder for each feature column
            lbl = preprocessing.LabelEncoder()
            
            # fit label encoder on all data
            lbl.fit(df[col])
            
            # transform all the data
            df.loc[:, col] = lbl.transform(df[col])
            
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # get training data
    x_train = df_train[features].values
    
    # get validation data
    x_valid = df_valid[features].values
    
    # initialize xgboost model
    model = xgb.XGBClassifier(
                n_jobs=-1,
                max_depth=7
                )
    
    # fit model on training data (ohe)
    model.fit(x_train, df_train.income.values)
    
    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

In [17]:
for fold_ in range(5):
    run(fold_)

Fold = 0, AUC = 0.9286668430204137
Fold = 1, AUC = 0.9329340656165378
Fold = 2, AUC = 0.9319817543218744
Fold = 3, AUC = 0.919046187194538
Fold = 4, AUC = 0.9245692057162671
