In [20]:
from math import sqrt

import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [21]:
# Location of the insurance CSV, relative to the notebook's working directory.
INSURANCE_FILE_PATH = './datasets/insurance.csv'

# Read the CSV into a pandas DataFrame (insurance_df.head() gives a quick preview).
insurance_df = pd.read_csv(filepath_or_buffer=INSURANCE_FILE_PATH)

# Bare last expression: the notebook renders the frame as the cell output.
insurance_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [22]:
# One-hot encode the categorical columns (sex, smoker, region).
# The result is assigned back to `insurance_df`, so after the first run the
# original categorical columns no longer exist. Encoding only the columns that
# are still present lets this cell be re-run without raising a KeyError.
CATEGORICAL_COLUMNS = ['sex', 'smoker', 'region']
columns_to_encode = [name for name in CATEGORICAL_COLUMNS if name in insurance_df.columns]
if columns_to_encode:
    insurance_df = pd.get_dummies(data=insurance_df, columns=columns_to_encode)
insurance_df

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.900,0,16884.92400,1,0,0,1,0,0,0,1
1,18,33.770,1,1725.55230,0,1,1,0,0,0,1,0
2,28,33.000,3,4449.46200,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.880,0,3866.85520,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,0,1,1,0,0,1,0,0
1334,18,31.920,0,2205.98080,1,0,1,0,1,0,0,0
1335,18,36.850,0,1629.83350,1,0,1,0,0,0,1,0
1336,21,25.800,0,2007.94500,1,0,1,0,0,0,0,1


In [23]:
# Input features: every column except the regression target `charges`.
X = insurance_df.drop(columns=['charges'])

In [24]:
# Define a degree-4 polynomial feature transformer.
polynomial_transformer = PolynomialFeatures(degree=4)
# Fit the transformer on the raw feature matrix and expand it into
# degree-4 polynomial features in a single step.
polynomial_features = polynomial_transformer.fit_transform(X.to_numpy())
polynomial_features

array([[ 1.  , 19.  , 27.9 , ...,  0.  ,  0.  ,  1.  ],
       [ 1.  , 18.  , 33.77, ...,  0.  ,  0.  ,  0.  ],
       [ 1.  , 28.  , 33.  , ...,  0.  ,  0.  ,  0.  ],
       ...,
       [ 1.  , 18.  , 36.85, ...,  0.  ,  0.  ,  0.  ],
       [ 1.  , 21.  , 25.8 , ...,  0.  ,  0.  ,  1.  ],
       [ 1.  , 61.  , 29.07, ...,  0.  ,  0.  ,  0.  ]])

In [25]:
 # Build the names of the expanded polynomial features from the input column names.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`. Prefer the new method and fall back to the
# old one so this cell runs on either side of that change.
if hasattr(polynomial_transformer, 'get_feature_names_out'):
    # The new method returns an ndarray; convert it to a plain list so
    # `features` keeps the same type the old method produced.
    features = list(polynomial_transformer.get_feature_names_out(X.columns))
else:
    features = polynomial_transformer.get_feature_names(X.columns)
features

['1',
 'age',
 'bmi',
 'children',
 'sex_female',
 'sex_male',
 'smoker_no',
 'smoker_yes',
 'region_northeast',
 'region_northwest',
 'region_southeast',
 'region_southwest',
 'age^2',
 'age bmi',
 'age children',
 'age sex_female',
 'age sex_male',
 'age smoker_no',
 'age smoker_yes',
 'age region_northeast',
 'age region_northwest',
 'age region_southeast',
 'age region_southwest',
 'bmi^2',
 'bmi children',
 'bmi sex_female',
 'bmi sex_male',
 'bmi smoker_no',
 'bmi smoker_yes',
 'bmi region_northeast',
 'bmi region_northwest',
 'bmi region_southeast',
 'bmi region_southwest',
 'children^2',
 'children sex_female',
 'children sex_male',
 'children smoker_no',
 'children smoker_yes',
 'children region_northeast',
 'children region_northwest',
 'children region_southeast',
 'children region_southwest',
 'sex_female^2',
 'sex_female sex_male',
 'sex_female smoker_no',
 'sex_female smoker_yes',
 'sex_female region_northeast',
 'sex_female region_northwest',
 'sex_female region_southeas

In [26]:
# Wrap the expanded feature matrix in a DataFrame labelled with the generated feature names.
X = pd.DataFrame(data=polynomial_features, columns=features)
X

Unnamed: 0,1,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,...,region_northwest^2 region_southwest^2,region_northwest region_southeast^3,region_northwest region_southeast^2 region_southwest,region_northwest region_southeast region_southwest^2,region_northwest region_southwest^3,region_southeast^4,region_southeast^3 region_southwest,region_southeast^2 region_southwest^2,region_southeast region_southwest^3,region_southwest^4
0,1.0,19.0,27.900,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,18.0,33.770,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.0,28.0,33.000,3.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,33.0,22.705,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,32.0,28.880,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,1.0,50.0,30.970,3.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1334,1.0,18.0,31.920,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1335,1.0,18.0,36.850,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1336,1.0,21.0,25.800,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [27]:
# Target variable: the `charges` column, kept as a single-column DataFrame.
y = insurance_df.loc[:, ['charges']]
y

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [28]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

# `Lasso(normalize=True)` was deprecated in scikit-learn 1.0 and removed in 1.2.
# This follows the replacement recipe from scikit-learn's deprecation notice:
# scale the features with StandardScaler(with_mean=False) inside a pipeline and
# multiply alpha by sqrt(n_samples), so the penalty strength matches what
# normalize=True applied with the original alpha.
ORIGINAL_ALPHA = 1
model = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(alpha=ORIGINAL_ALPHA * sqrt(len(x_train)), max_iter=2000),
)
model.fit(x_train, y_train)

# `model` is now a Pipeline that scales and then predicts, so it still accepts
# the unscaled feature frames. The fitted Lasso step is
# model.named_steps['lasso']; its coef_ refers to the scaled features.
y_train_predict = model.predict(x_train)
y_test_predict = model.predict(x_test)

In [29]:
# Report the root-mean-squared error for each split, training first and then
# testing. Each entry pairs the printed heading with the true and predicted targets.
evaluation_splits = [
    ("training set에서 성능", y_train, y_train_predict),
    ("testing set에서 성능", y_test, y_test_predict),
]

for heading, y_true, y_predicted in evaluation_splits:
    # `mse` is reassigned on each pass, so it ends up holding the testing-set value.
    mse = mean_squared_error(y_true, y_predicted)

    print(heading)
    print("-----------------------")
    print(f'오차: {sqrt(mse)}')


training set에서 성능
-----------------------
오차: 4726.636439607449
testing set에서 성능
-----------------------
오차: 4692.232442526968
