# General Imports

In [1]:
import pandas as pd
import numpy as np

# Data Loading

In [2]:
df = pd.read_csv('adult_salary.data', header=None, usecols=[3,4,5,6,8,9,14], 
                names=['EDUCATION', 'EDUCATION_PERIOD', 'STATUS', 'OCCUPY', 'RACE', 'GENDER','RICH'],
                dtype=str)
label_col = 'RICH'
features_cols = [c for c in df.columns if c != label_col]
df['EDUCATION_PERIOD'] = df['EDUCATION_PERIOD'].astype(int)
df[label_col] = df[label_col].apply(lambda x: 1 if x.strip() == '<=50K' else 0).astype(int)
categorial_features = [c for c in df.columns if df.dtypes[c] != np.int32]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   EDUCATION         32561 non-null  object
 1   EDUCATION_PERIOD  32561 non-null  int32 
 2   STATUS            32561 non-null  object
 3   OCCUPY            32561 non-null  object
 4   RACE              32561 non-null  object
 5   GENDER            32561 non-null  object
 6   RICH              32561 non-null  int32 
dtypes: int32(2), object(5)
memory usage: 1.5+ MB


# Data Preparation

In [4]:
from sklearn.preprocessing import LabelBinarizer

feature_encoder_dict = {}
final_df = df.copy()
for feature in categorial_features:
    feature_encoder_dict[feature] = LabelBinarizer()
    final_df[feature] = pd.Series(list(feature_encoder_dict[feature].fit_transform(df[feature])))
final_df

Unnamed: 0,EDUCATION,EDUCATION_PERIOD,STATUS,OCCUPY,RACE,GENDER,RICH
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",13,"[0, 0, 0, 0, 1, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1]",[1],1
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",13,"[0, 0, 1, 0, 0, 0, 0]","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1]",[1],1
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",9,"[1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1]",[1],1
3,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",7,"[0, 0, 1, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 1, 0, 0]",[1],1
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",13,"[0, 0, 1, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 0, 1, 0, 0]",[0],1
...,...,...,...,...,...,...,...
32556,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",12,"[0, 0, 1, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[0, 0, 0, 0, 1]",[0],1
32557,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",9,"[0, 0, 1, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1]",[1],0
32558,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",9,"[0, 0, 0, 0, 0, 0, 1]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1]",[0],1
32559,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",9,"[0, 0, 0, 0, 1, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1]",[1],1


In [5]:
from sklearn.model_selection import train_test_split

# split to train and test
train_df, test_df = train_test_split(final_df, test_size=0.1, shuffle=True)

# Modeling

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer
from eli5.sklearn import PermutationImportance
from automl_infrastructure.classifiers.adapters import SklearnClassifierAdapter
from eli5.permutation_importance import get_score_importances


lr_model = SklearnClassifierAdapter(name='lr1', sklearn_model=LogisticRegression())
lr_model.fit(train_df[features_cols], train_df[label_col])
predictions = lr_model.predict(test_df[features_cols])
print(accuracy_score(test_df[label_col], predictions))

rf_model = SklearnClassifierAdapter(name='rf1', sklearn_model=RandomForestClassifier())
rf_model.fit(train_df[features_cols], train_df[label_col])
predictions = rf_model.predict(test_df[features_cols])
print(accuracy_score(test_df[label_col], predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8262204482652747
0.8234571691740866


# Permutation Importance Calculation

In [7]:
from automl_infrastructure.interpretation import PermutationImportance


pi = PermutationImportance(lr_model, scoring='accuracy')
pi.fit(test_df[features_cols], test_df[label_col])
pi.show_weights()
print()
pi = PermutationImportance(rf_model, scoring='accuracy')
pi.fit(test_df[features_cols], test_df[label_col])
pi.show_weights()

            Feature    Weight       Std
0            STATUS  0.078191  0.002102
1            OCCUPY  0.026200  0.005696
2  EDUCATION_PERIOD  0.024051  0.003022
3         EDUCATION  0.004503  0.001425
4            GENDER  0.000921  0.000752
5              RACE  0.000307  0.000752

            Feature    Weight       Std
0            STATUS  0.096817  0.002369
1            OCCUPY  0.030908  0.001532
2  EDUCATION_PERIOD  0.008290  0.002469
3            GENDER  0.005322  0.000766
4         EDUCATION  0.004401  0.001044
5              RACE -0.000409  0.001631
