In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_blobs, make_moons
from sklearn.preprocessing import StandardScaler
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn as mg

In [2]:
import os

print(mg.datasets.DATA_PATH)

/home/arti/anaconda3/lib/python3.8/site-packages/mglearn/data


In [3]:
data = pd.read_csv(os.path.join(mg.datasets.DATA_PATH, 'adult.data'),
                  header=None, index_col=False,
                  names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                          'martial-status', 'occupation', 'relationship', 'race', 'gender',
                          'capital-gain', 'capital-loss', 'hours-per-week', 'native-coutry', 'income'])

data = data[['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income']]
display(data.head())

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K


In [4]:
data.gender.value_counts()

 Male      21790
 Female    10771
Name: gender, dtype: int64

In [5]:
list(data.columns)

['age',
 'workclass',
 'education',
 'gender',
 'hours-per-week',
 'occupation',
 'income']

In [7]:
data_dummies = pd.get_dummies(data)
list(data_dummies.columns)

['age',
 'hours-per-week',
 'workclass_ ?',
 'workclass_ Federal-gov',
 'workclass_ Local-gov',
 'workclass_ Never-worked',
 'workclass_ Private',
 'workclass_ Self-emp-inc',
 'workclass_ Self-emp-not-inc',
 'workclass_ State-gov',
 'workclass_ Without-pay',
 'education_ 10th',
 'education_ 11th',
 'education_ 12th',
 'education_ 1st-4th',
 'education_ 5th-6th',
 'education_ 7th-8th',
 'education_ 9th',
 'education_ Assoc-acdm',
 'education_ Assoc-voc',
 'education_ Bachelors',
 'education_ Doctorate',
 'education_ HS-grad',
 'education_ Masters',
 'education_ Preschool',
 'education_ Prof-school',
 'education_ Some-college',
 'gender_ Female',
 'gender_ Male',
 'occupation_ ?',
 'occupation_ Adm-clerical',
 'occupation_ Armed-Forces',
 'occupation_ Craft-repair',
 'occupation_ Exec-managerial',
 'occupation_ Farming-fishing',
 'occupation_ Handlers-cleaners',
 'occupation_ Machine-op-inspct',
 'occupation_ Other-service',
 'occupation_ Priv-house-serv',
 'occupation_ Prof-specialty',


In [8]:
display(data_dummies.head())

Unnamed: 0,age,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,income_ <=50K,income_ >50K
0,39,40,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,50,13,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,38,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,53,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,28,40,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [9]:
features = data_dummies.loc[:, 'age':'occupation_ Transport-moving']

X = features.values
y = data_dummies['income_ >50K'].values

print(X.shape)

(32561, 44)


In [10]:
print(y.shape)

(32561,)


In [11]:
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

print(logreg.score(X_test, y_test))

0.8067804937968308


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
demo_df = pd.DataFrame({'Num Feature': [0, 1, 2, 1],
                       'Cate Feature': ['Socks', 'Fox', 'Scoks', 'Box']})
display(demo_df)

Unnamed: 0,Num Feature,Cate Feature
0,0,Socks
1,1,Fox
2,2,Scoks
3,1,Box


In [14]:
display(pd.get_dummies(demo_df))

Unnamed: 0,Num Feature,Cate Feature_Box,Cate Feature_Fox,Cate Feature_Scoks,Cate Feature_Socks
0,0,0,0,0,1
1,1,0,1,0,0
2,2,0,0,1,0
3,1,1,0,0,0


In [15]:
demo_df['Num Feature'] = demo_df['Num Feature'].astype(str)
display(pd.get_dummies(demo_df, columns=['Num Feature', 'Cate Feature']))

Unnamed: 0,Num Feature_0,Num Feature_1,Num Feature_2,Cate Feature_Box,Cate Feature_Fox,Cate Feature_Scoks,Cate Feature_Socks
0,1,0,0,0,0,0,1
1,0,1,0,0,1,0,0
2,0,0,1,0,0,1,0
3,0,1,0,1,0,0,0
