In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import numpy as np
import numpy.random as random
import scipy as sp
from pandas import Series, DataFrame
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

import sklearn
import requests, zipfile, io

In [11]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
res = requests.get(url).content

adult = pd.read_csv(io.StringIO(res.decode('utf-8')), header=None)
adult.columns=['age','workclass','fnlwgt','education','education-num','martial-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','flg-50K']

print('데이터 형태 : {}'.format(adult.shape))
print('결측값 수 : {}'.format(adult.isnull().sum().sum()))
adult.head()

데이터 형태 : (32561, 15)
결측값 수 : 0


Unnamed: 0,age,workclass,fnlwgt,education,education-num,martial-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,flg-50K
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [12]:
adult.groupby('flg-50K').size()

flg-50K
 <=50K    24720
 >50K      7841
dtype: int64

In [13]:
adult['fin_flg'] = adult['flg-50K'].map(lambda x: 1 if x ==' >50K' else 0)
adult.groupby('fin_flg').size()

fin_flg
0    24720
1     7841
dtype: int64

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X = adult[['age','fnlwgt','education-num','capital-gain','capital-loss']]
Y = adult['fin_flg']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=12)

model = LogisticRegression()
model.fit(X_train, Y_train)

print('정확도(train) : {:.3f}'.format(model.score(X_train,Y_train)))
print('정확도(test) : {:.3f}'.format(model.score(X_test, Y_test)))
print(model.coef_)
print('Odds Ratio : {}'.format(np.exp(model.coef_)))

정확도(train) : 0.799
정확도(test) : 0.795
[[-1.12791049e-02 -4.52547480e-06 -2.61853112e-03  3.52343820e-04
   8.18657932e-04]]
Odds Ratio : [[0.98878427 0.99999547 0.99738489 1.00035241 1.00081899]]


In [19]:
from sklearn.preprocessing import StandardScaler

X = adult[['age','fnlwgt','education-num','capital-gain','capital-loss']]
Y = adult['fin_flg']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=12)

#Standardization(Scaling)
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

model = LogisticRegression()
model.fit(X_train_std, Y_train)

print('정확도(train) : {:.3f}'.format(model.score(X_train_std,Y_train)))
print('정확도(test) : {:.3f}'.format(model.score(X_test_std, Y_test)))
print(model.coef_)
print('Odds Ratio : {}'.format(np.exp(model.coef_)))

정확도(train) : 0.810
정확도(test) : 0.811
[[0.54514739 0.01617127 0.8702732  2.48149005 0.3055757 ]]
Odds Ratio : [[ 1.7248626   1.01630273  2.38756305 11.95907075  1.35740623]]


In [23]:
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
X = cancer.data
Y = cancer.target

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.5,random_state=12)

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

model_std = LogisticRegression()
model_std.fit(X_train_std,Y_train)
model = LogisticRegression()
model.fit(X_train,Y_train)

print('정확도(train) : {:.3f}'.format(model.score(X_train,Y_train)))
print('정확도(test) : {:.3f}'.format(model.score(X_test,Y_test)))
print('std 정확도(train) : {:.3f}'.format(model_std.score(X_train_std,Y_train)))
print('std 정확도(test) : {:.3f}'.format(model_std.score(X_test_std, Y_test)))

정확도(train) : 0.954
정확도(test) : 0.947
std 정확도(train) : 1.000
std 정확도(test) : 0.968


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
