# Table of Contents
 <p><div class="lev4 toc-item"><a href="#Transform-our-data" data-toc-modified-id="Transform-our-data-0001"><span class="toc-item-num">0.0.0.1&nbsp;&nbsp;</span>Transform our data</a></div><div class="lev4 toc-item"><a href="#Split-and-test-and-train-data" data-toc-modified-id="Split-and-test-and-train-data-0002"><span class="toc-item-num">0.0.0.2&nbsp;&nbsp;</span>Split and test and train data</a></div><div class="lev4 toc-item"><a href="#Continue-our-example" data-toc-modified-id="Continue-our-example-0003"><span class="toc-item-num">0.0.0.3&nbsp;&nbsp;</span>Continue our example</a></div><div class="lev4 toc-item"><a href="#Make-pipeline" data-toc-modified-id="Make-pipeline-0004"><span class="toc-item-num">0.0.0.4&nbsp;&nbsp;</span>Make pipeline</a></div>

In [1]:
import pandas as pd
## Define path data
COLUMNS = ['age','workclass', 'fnlwgt', 'education',
'education_num', 'marital',
           'occupation', 'relationship', 'race', 'sex',
'capital_gain', 'capital_loss',
           'hours_week', 'native_country', 'label']
### Define continuous list
CONTI_FEATURES  = ['age', 'fnlwgt','capital_gain', 'education_num',
'capital_loss', 'hours_week']
### Define categorical list
CATE_FEATURES = ['workclass', 'education', 'marital', 'occupation',
'relationship', 'race', 'sex', 'native_country']
## Prepare the data
features = ['age','workclass', 'fnlwgt', 'education',
'education_num', 'marital',
           'occupation', 'relationship', 'race', 'sex',
'capital_gain', 'capital_loss',
           'hours_week', 'native_country']

In [2]:
#df_train = pd.read_csv("data_inputs/wodetian.csv")

In [3]:
PATH = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

df_train = pd.read_csv(PATH, skipinitialspace=True, names = COLUMNS, index_col=False)
# # convert interested columns to float type

print(CONTI_FEATURES)
for col in CONTI_FEATURES:
    df_train[col] = df_train[col].astype('float64')
df_train.to_csv('wodetian.csv')

['age', 'fnlwgt', 'capital_gain', 'education_num', 'capital_loss', 'hours_week']


In [4]:
df_train.head(1)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hours_week,native_country,label
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K


In [5]:
def remove_small_sample(df, col_name, minimum_size=1):
    '''
    Given a dataframe, we look at the col_name column, and get their nuniques for each unique values, if that is small than
    or equal to the minimum_size, then we drop that row. The deletion keeps the index, e.g., reset_index(index=True)
    '''
    count_col = df[col_name].value_counts() 
    temp_col = col_name + "_num"
    df_temp = pd.merge(df, pd.DataFrame(count_col).rename(columns={col_name:temp_col}), left_on = col_name, right_index=True)
    print("Original size:", df_temp.shape)
    df_temp = df_temp.loc[df_temp[temp_col] > minimum_size]
    print("New size:", df_temp.shape)
    return df_temp.drop([temp_col], axis=1)



In [6]:
df_train = remove_small_sample(df_train, "native_country")

Original size: (32561, 16)
New size: (32560, 16)


In [7]:
df_train.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hours_week,native_country,label
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K


In [8]:
## Get the column index of the continuous features
conti_features = []
for i in CONTI_FEATURES:
    position = df_train.columns.get_loc(i) 
    conti_features.append(position)
print(conti_features)

[0, 2, 10, 4, 11, 12]


In [9]:
## Get the column index of the categorical features
categorical_features = []
for i in CATE_FEATURES:
    position = df_train.columns.get_loc(i) 
    categorical_features.append(position)
print(categorical_features)

[1, 3, 5, 6, 7, 8, 9, 13]


In [10]:
# Note that, all the categoric columns have different values, in total, we will need 101 groups(columns)
# This will be done with LabelEncoder + OneHotEncoder
# The continous columns are 6, will be kept, no transformation needed
print(df_train[CATE_FEATURES].nunique(),
      'There are',sum(df_train[CATE_FEATURES].nunique()), 
      'groups in the whole dataset')

workclass          9
education         16
marital            7
occupation        15
relationship       6
race               5
sex                2
native_country    41
dtype: int64 There are 101 groups in the whole dataset


#### Transform our data

In [11]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

##### Example 1 to show how to label

In [12]:
fruits = ["apple", "banana", "apple", "water melon"]
le = LabelEncoder()
le.fit(fruits)
# must fit it first, then transform
label_data = le.transform(fruits)
print(label_data)
# we can also transform the label data back
print(le.inverse_transform(label_data))

[0 1 0 2]
['apple' 'banana' 'apple' 'water melon']


##### Example 2, train + test

In [13]:
le = LabelEncoder()

train = ["paris", "paris", "tokyo", "amsterdam"]
test = ["tokyo", "tokyo", "paris"]
le.fit(train).transform(test)

array([2, 2, 1])

##### Example 3

Note that the LabelEncoder must be used prior to one-hot encoding, as the OneHotEncoder cannot handle categorical data. Therefore, it is frequently used as pre-cursor to one-hot encoding.

Alternatively, it can encode your target into a usable array. If, for instance, train were your target for classification, you would need a LabelEncoder to use it as your y variable.

In [14]:
df_train.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hours_week,native_country,label
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K


In [22]:
# This is not needed, but shows how to convert to labelled data
if False:
    le = LabelEncoder()
    for col in CATE_FEATURES:
        df_train[col] = le.fit(df_train[col]).transform(df_train[col]) 

In [16]:
df_train.head(1)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hours_week,native_country,label
0,39.0,7,77516.0,9,13.0,4,1,1,4,1,2174.0,0.0,40.0,38,<=50K


#### Split and test and train data

In [12]:
features

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education_num',
 'marital',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital_gain',
 'capital_loss',
 'hours_week',
 'native_country']

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(df_train[features], 
                                                    df_train.label,
                                                    test_size = 0.2,
                                                    random_state = 0)

In [14]:
X_train.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hours_week,native_country
13936,31.0,Private,369825.0,7th-8th,4.0,Never-married,Handlers-cleaners,Other-relative,White,Male,0.0,0.0,25.0,United-States
17070,34.0,State-gov,98995.0,Assoc-voc,11.0,Never-married,Prof-specialty,Not-in-family,White,Female,0.0,0.0,40.0,United-States
27752,52.0,Private,113094.0,Bachelors,13.0,Separated,Adm-clerical,Unmarried,White,Female,0.0,1092.0,40.0,United-States


#### Continue our example

In [16]:
# note, the author made a mistake here
preprocess = make_column_transformer(
    (StandardScaler(), conti_features),
    ### Need to be numeric not string to specify columns name.  -- why add this ? new version seems to work. by Jun
    (OneHotEncoder(sparse=False), categorical_features)
)



#### Make pipeline

In [18]:
model = make_pipeline(
    preprocess,
    LogisticRegression())

In [20]:
model.fit(X_train, Y_train)
print("logistic regression score: %f" % model.score(X_test,
Y_test))

logistic regression score: 0.860872


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
model.predict_proba(X_test)

array([[0.394678  , 0.605322  ],
       [0.9945504 , 0.0054496 ],
       [0.98847822, 0.01152178],
       ...,
       [0.7110207 , 0.2889793 ],
       [0.09100602, 0.90899398],
       [0.99842869, 0.00157131]])