# 회귀 실습

현대 중공업과 계약을 맺어 일부 선박에 대한 예측 모델을 구축하게됐습니다. 현대 중공업은 세계 최대의 선박 제조업체 중 하나로 유람선을 제작하고 있습니다.


당신은 선박에 필요한 선원 수를 정확하게 예측할 수 있도록 울산에있는 본사에 도착했습니다.


그들은 현재 새로운 선박을 건조하고 있으며 예측 모델을 만들고, 

이를 사용하여 **선박에 필요한 승무원 수를 예측** 하기를 원합니다.
    
데이터는 "cruise_ship_info.csv"라는 csv 파일에 저장됩니다. 


귀하의 임무는 향후 선박에 필요한 선원 수를 예측하는 데 도움이되는 회귀 모델을 만드는 것입니다. 

고객은 또한 특정 **크루즈 라인** 이 허용되는 승무원 수에 차이가 있음을 발견 했으므로
분석에 포함하는 것이 가장 중요한 기능이라고 언급했습니다!

# 선사라는 독립변수를 꼭...사용해라~~~ Cruise_line

# 목표설정 : 선박에 필요한 승무원수 

In [67]:
# 독립변수를 늘렸을때 어떻게 되는지 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder 
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error # 제곱해서 평균낸 잔차


from statsmodels.stats.outliers_influence import variance_inflation_factor # 다중공선성체크 (분산팽창요인이용)
import statsmodels.api as sm # statsmodels = 통계모델 의 여러가지 통계치를 보고 독립변수를 체크하려고씀

In [68]:
df = pd.read_csv("./data/cruise_ship_info.csv")

In [69]:
df.shape

(158, 9)

In [70]:
df.head()

Unnamed: 0,Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew
0,Journey,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55
1,Quest,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55
2,Celebration,Carnival,26,47.262,14.86,7.22,7.43,31.8,6.7
3,Conquest,Carnival,11,110.0,29.74,9.53,14.88,36.99,19.1
4,Destiny,Carnival,17,101.353,26.42,8.92,13.21,38.36,10.0


In [71]:
df.dtypes

Ship_name             object
Cruise_line           object
Age                    int64
Tonnage              float64
passengers           float64
length               float64
cabins               float64
passenger_density    float64
crew                 float64
dtype: object

In [72]:
# 일단 종속변수들을 뭐로할지 생각해보자고
# Age	Tonnage	passengers	length	cabins	passenger_density

In [73]:
# df["Cruise_line"] => 이거 범주화를 숫자로 변경하는거 필요함
ohe = OneHotEncoder()
arr = np.array(df["Cruise_line"])
arr = np.reshape(arr, (-1,1))

cruise_line_name = ohe.fit_transform(arr)
ohe.get_feature_names_out() # 확인해보기

array(['x0_Azamara', 'x0_Carnival', 'x0_Celebrity', 'x0_Costa',
       'x0_Crystal', 'x0_Cunard', 'x0_Disney', 'x0_Holland_American',
       'x0_MSC', 'x0_Norwegian', 'x0_Oceania', 'x0_Orient', 'x0_P&O',
       'x0_Princess', 'x0_Regent_Seven_Seas', 'x0_Royal_Caribbean',
       'x0_Seabourn', 'x0_Silversea', 'x0_Star', 'x0_Windstar'],
      dtype=object)

In [74]:
ohe_df = pd.DataFrame(
    cruise_line_name.toarray(),
    columns = ohe.get_feature_names_out()
)

In [75]:
ohe_df

Unnamed: 0,x0_Azamara,x0_Carnival,x0_Celebrity,x0_Costa,x0_Crystal,x0_Cunard,x0_Disney,x0_Holland_American,x0_MSC,x0_Norwegian,x0_Oceania,x0_Orient,x0_P&O,x0_Princess,x0_Regent_Seven_Seas,x0_Royal_Caribbean,x0_Seabourn,x0_Silversea,x0_Star,x0_Windstar
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [76]:
# concat 해서 합치셈

ohe_df = pd.concat([
    df,
    ohe_df
], axis = 1)

In [77]:
ohe_df

Unnamed: 0,Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew,x0_Azamara,...,x0_Oceania,x0_Orient,x0_P&O,x0_Princess,x0_Regent_Seven_Seas,x0_Royal_Caribbean,x0_Seabourn,x0_Silversea,x0_Star,x0_Windstar
0,Journey,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Quest,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Celebration,Carnival,26,47.262,14.86,7.22,7.43,31.80,6.70,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Conquest,Carnival,11,110.000,29.74,9.53,14.88,36.99,19.10,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Destiny,Carnival,17,101.353,26.42,8.92,13.21,38.36,10.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,Taurus,Star,22,3.341,0.66,2.79,0.33,50.62,0.59,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
154,Virgo,Star,14,76.800,19.60,8.79,9.67,39.18,12.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
155,Spirit,Windstar,25,5.350,1.58,4.40,0.74,33.86,0.88,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
156,Star,Windstar,27,5.350,1.67,4.40,0.74,32.04,0.88,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [78]:
# 필요없는 열 삭제

ohe_df = ohe_df.drop(["Ship_name","Cruise_line"], axis = 1)
ohe_df.head()

Unnamed: 0,Age,Tonnage,passengers,length,cabins,passenger_density,crew,x0_Azamara,x0_Carnival,x0_Celebrity,...,x0_Oceania,x0_Orient,x0_P&O,x0_Princess,x0_Regent_Seven_Seas,x0_Royal_Caribbean,x0_Seabourn,x0_Silversea,x0_Star,x0_Windstar
0,6,30.277,6.94,5.94,3.55,42.64,3.55,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6,30.277,6.94,5.94,3.55,42.64,3.55,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,26,47.262,14.86,7.22,7.43,31.8,6.7,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11,110.0,29.74,9.53,14.88,36.99,19.1,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,17,101.353,26.42,8.92,13.21,38.36,10.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
ohe_df.dtypes

Age                       int64
Tonnage                 float64
passengers              float64
length                  float64
cabins                  float64
passenger_density       float64
crew                    float64
x0_Azamara              float64
x0_Carnival             float64
x0_Celebrity            float64
x0_Costa                float64
x0_Crystal              float64
x0_Cunard               float64
x0_Disney               float64
x0_Holland_American     float64
x0_MSC                  float64
x0_Norwegian            float64
x0_Oceania              float64
x0_Orient               float64
x0_P&O                  float64
x0_Princess             float64
x0_Regent_Seven_Seas    float64
x0_Royal_Caribbean      float64
x0_Seabourn             float64
x0_Silversea            float64
x0_Star                 float64
x0_Windstar             float64
dtype: object

In [80]:
ohe_df.isna().sum()

Age                     0
Tonnage                 0
passengers              0
length                  0
cabins                  0
passenger_density       0
crew                    0
x0_Azamara              0
x0_Carnival             0
x0_Celebrity            0
x0_Costa                0
x0_Crystal              0
x0_Cunard               0
x0_Disney               0
x0_Holland_American     0
x0_MSC                  0
x0_Norwegian            0
x0_Oceania              0
x0_Orient               0
x0_P&O                  0
x0_Princess             0
x0_Regent_Seven_Seas    0
x0_Royal_Caribbean      0
x0_Seabourn             0
x0_Silversea            0
x0_Star                 0
x0_Windstar             0
dtype: int64

In [81]:
# Cruise_line 범주화 해야함

x = ohe_df.drop("crew", axis = 1) # 일단 다 넣어
y = df["crew"]

In [82]:
# 데이터 분할해 
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.25,
    random_state = 666
)

In [83]:
# 다항회귀 이용해서 예측해야하니까 일단하셈

In [84]:
pf = PolynomialFeatures(include_bias = False)
poly_train = pf.fit_transform(x_train)
poly_test = pf.fit_transform(x_test)

In [85]:
lr_org = LinearRegression()
lr_org.fit(x_train, y_train)
print(lr_org.score(x_train,y_train))
print(lr_org.score(x_test,y_test))

0.9684915885826058
0.8864981618243484


In [86]:
y_pred = lr_org.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
mse

1.735439807058541

# 단위 평준화를 시도

In [None]:
# 단위 평준화를 했을때는 어떻게 해야지?

In [105]:
df.columns

Index(['Ship_name', 'Cruise_line', 'Age', 'Tonnage', 'passengers', 'length',
       'cabins', 'passenger_density', 'crew'],
      dtype='object')

In [106]:
col_list = df.columns[2:].to_list()

In [108]:
ss = StandardScaler()
scaled_df = ss.fit_transform(df[col_list])

In [110]:
# 데이터 프레임으로 만들어야지~~
scaled_df = pd.DataFrame(scaled_df, columns = col_list)

In [111]:
scaled_df # 평준화 완료~~~

Unnamed: 0,Age,Tonnage,passengers,length,cabins,passenger_density,crew
0,-1.276402,-1.104984,-1.193956,-1.225331,-1.184588,0.318057,-1.215267
1,-1.276402,-1.104984,-1.193956,-1.225331,-1.184588,0.318057,-1.215267
2,1.358105,-0.647310,-0.372926,-0.509363,-0.314095,-0.940676,-0.313304
3,-0.617775,1.043215,1.169614,0.782736,1.357341,-0.338017,3.237281
4,0.172577,0.810215,0.825445,0.441533,0.982670,-0.178934,0.631610
...,...,...,...,...,...,...,...
153,0.831204,-1.830796,-1.844975,-2.987284,-1.907008,1.244688,-2.062826
154,-0.222599,0.148615,0.118448,0.368817,0.188457,-0.083716,1.204285
155,1.226380,-1.776662,-1.749603,-2.086730,-1.815023,-0.701471,-1.979788
156,1.489831,-1.776662,-1.740273,-2.086730,-1.815023,-0.912808,-1.979788


# 최적화

In [87]:
# degree 로 최적화 해보자
pf = PolynomialFeatures(degree = 1, include_bias = False)
poly_train5 = pf.fit_transform(x_train)
poly_test5 = pf.transform(x_test)
poly_train5.shape


lr_poly5 = LinearRegression()
lr_poly5.fit(poly_train5, y_train)

print(lr_poly5.score(poly_train5, y_train))
print(lr_poly5.score(poly_test5, y_test))

0.9684915885826058
0.8864981618243484


In [101]:
y_pred = lr_org.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
mse

# 잔차가 1.7

1.735439807058541

# 리포트 찍어봐야징~!~~~

In [89]:
# 독립변수를 좀 변환할까?

In [90]:
x = sm.add_constant(x) # 절편 추가
model = sm.OLS(y,x)
model = model.fit()
model.summary()

0,1,2,3
Dep. Variable:,crew,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.937
Method:,Least Squares,F-statistic:,95.15
Date:,"Wed, 19 Nov 2025",Prob (F-statistic):,3.47e-72
Time:,16:45:41,Log-Likelihood:,-189.09
No. Observations:,158,AIC:,430.2
Df Residuals:,132,BIC:,509.8
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.6608,1.131,-1.469,0.144,-3.898,0.576
Age,0.0073,0.016,0.462,0.645,-0.024,0.038
Tonnage,0.0146,0.012,1.177,0.241,-0.010,0.039
passengers,-0.0979,0.050,-1.951,0.053,-0.197,0.001
length,0.4851,0.122,3.969,0.000,0.243,0.727
cabins,0.7249,0.092,7.842,0.000,0.542,0.908
passenger_density,0.0007,0.021,0.035,0.972,-0.041,0.042
x0_Azamara,-0.0808,0.641,-0.126,0.900,-1.350,1.188
x0_Carnival,0.0270,0.274,0.098,0.922,-0.515,0.569

0,1,2,3
Omnibus:,174.008,Durbin-Watson:,2.267
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6004.955
Skew:,4.012,Prob(JB):,0.0
Kurtosis:,32.117,Cond. No.,8240000000000000.0


In [91]:
# Age 라는 독립변수가 유의확률이 너무 높은디? 삭제해야겠다
# 그전에 다중공선성도 체크

In [92]:
vif = pd.DataFrame()
vif["VIF_Factor"] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
vif["feature"] = x.columns
vif.round(1)

  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


Unnamed: 0,VIF_Factor,feature
0,0.0,const
1,2.9,Age
2,43.5,Tonnage
3,48.2,passengers
4,9.8,length
5,34.9,cabins
6,6.8,passenger_density
7,inf,x0_Azamara
8,inf,x0_Carnival
9,inf,x0_Celebrity


In [93]:
# "Tonnage" , "passengers", "cabins" 이 변수들이 다중공선성이 높네

# AGE 변수 제거

In [94]:
new_x = x.drop("Age", axis = 1)
new_x

Unnamed: 0,const,Tonnage,passengers,length,cabins,passenger_density,x0_Azamara,x0_Carnival,x0_Celebrity,x0_Costa,...,x0_Oceania,x0_Orient,x0_P&O,x0_Princess,x0_Regent_Seven_Seas,x0_Royal_Caribbean,x0_Seabourn,x0_Silversea,x0_Star,x0_Windstar
0,1.0,30.277,6.94,5.94,3.55,42.64,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,30.277,6.94,5.94,3.55,42.64,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,47.262,14.86,7.22,7.43,31.80,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,110.000,29.74,9.53,14.88,36.99,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,101.353,26.42,8.92,13.21,38.36,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,1.0,3.341,0.66,2.79,0.33,50.62,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
154,1.0,76.800,19.60,8.79,9.67,39.18,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
155,1.0,5.350,1.58,4.40,0.74,33.86,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
156,1.0,5.350,1.67,4.40,0.74,32.04,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [95]:
model = sm.OLS(y, new_x)
model = model.fit()
model.summary()

0,1,2,3
Dep. Variable:,crew,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.938
Method:,Least Squares,F-statistic:,99.7
Date:,"Wed, 19 Nov 2025",Prob (F-statistic):,3.81e-73
Time:,16:45:43,Log-Likelihood:,-189.21
No. Observations:,158,AIC:,428.4
Df Residuals:,133,BIC:,505.0
Df Model:,24,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.3650,0.930,-1.468,0.144,-3.204,0.474
Tonnage,0.0139,0.012,1.132,0.260,-0.010,0.038
passengers,-0.1019,0.049,-2.066,0.041,-0.199,-0.004
length,0.4823,0.122,3.963,0.000,0.242,0.723
cabins,0.7307,0.091,8.004,0.000,0.550,0.911
passenger_density,-0.0018,0.020,-0.088,0.930,-0.042,0.038
x0_Azamara,-0.1811,0.602,-0.301,0.764,-1.372,1.009
x0_Carnival,0.0419,0.271,0.154,0.877,-0.495,0.579
x0_Celebrity,0.3298,0.306,1.078,0.283,-0.275,0.935

0,1,2,3
Omnibus:,173.48,Durbin-Watson:,2.248
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5949.978
Skew:,3.994,Prob(JB):,0.0
Kurtosis:,31.983,Cond. No.,1.07e+16


In [96]:
# 피처마다 VIF 계수출력

vif = pd.DataFrame()
vif["VIF_Factor"] = [variance_inflation_factor(new_x.values, i) for i in range(new_x.shape[1])]
vif["feature"] = new_x.columns
vif.round(1)

  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


Unnamed: 0,VIF_Factor,feature
0,0.0,const
1,42.9,Tonnage
2,46.8,passengers
3,9.8,length
4,34.3,cabins
5,6.3,passenger_density
6,inf,x0_Azamara
7,inf,x0_Carnival
8,inf,x0_Celebrity
9,inf,x0_Costa


In [98]:
y_pred = lr_org.predict(new_x)

mse = mean_squared_error(y_test, y_pred)
mse

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- const
Feature names seen at fit time, yet now missing:
- Age


In [64]:
new_x = x.drop(["Tonnage","passengers","cabins"], axis = 1)

In [65]:
model = sm.OLS(y, new_x)
model = model.fit()
model.summary()

0,1,2,3
Dep. Variable:,crew,R-squared:,0.86
Model:,OLS,Adj. R-squared:,0.838
Method:,Least Squares,F-statistic:,37.84
Date:,"Wed, 19 Nov 2025",Prob (F-statistic):,4.96e-47
Time:,16:44:08,Log-Likelihood:,-266.2
No. Observations:,158,AIC:,578.4
Df Residuals:,135,BIC:,648.8
Df Model:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.8975,1.480,-2.633,0.009,-6.825,-0.970
Age,-0.0511,0.023,-2.209,0.029,-0.097,-0.005
length,1.6750,0.127,13.200,0.000,1.424,1.926
passenger_density,-0.0260,0.022,-1.164,0.246,-0.070,0.018
x0_Azamara,-1.0887,1.022,-1.066,0.288,-3.109,0.932
x0_Carnival,0.8414,0.412,2.043,0.043,0.027,1.656
x0_Celebrity,-0.4049,0.477,-0.850,0.397,-1.347,0.538
x0_Costa,-0.2392,0.461,-0.519,0.605,-1.151,0.672
x0_Crystal,-1.1183,1.035,-1.081,0.282,-3.165,0.928

0,1,2,3
Omnibus:,95.086,Durbin-Watson:,2.089
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1114.322
Skew:,1.881,Prob(JB):,1.0699999999999999e-242
Kurtosis:,15.454,Cond. No.,6.62e+17


In [None]:
# 그 뭐냐 그거 Cruise_line 불린으로 변환시킨거랑 concat 시키고 싶음
scaled_df

In [113]:
one_hot = pd.DataFrame(
    cruise_line_name.toarray(),
    columns = ohe.get_feature_names_out()
)

In [115]:
merge_df = pd.concat([scaled_df, one_hot], axis=1)
merge_df

Unnamed: 0,Age,Tonnage,passengers,length,cabins,passenger_density,crew,x0_Azamara,x0_Carnival,x0_Celebrity,...,x0_Oceania,x0_Orient,x0_P&O,x0_Princess,x0_Regent_Seven_Seas,x0_Royal_Caribbean,x0_Seabourn,x0_Silversea,x0_Star,x0_Windstar
0,-1.276402,-1.104984,-1.193956,-1.225331,-1.184588,0.318057,-1.215267,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.276402,-1.104984,-1.193956,-1.225331,-1.184588,0.318057,-1.215267,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.358105,-0.647310,-0.372926,-0.509363,-0.314095,-0.940676,-0.313304,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.617775,1.043215,1.169614,0.782736,1.357341,-0.338017,3.237281,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.172577,0.810215,0.825445,0.441533,0.982670,-0.178934,0.631610,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,0.831204,-1.830796,-1.844975,-2.987284,-1.907008,1.244688,-2.062826,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
154,-0.222599,0.148615,0.118448,0.368817,0.188457,-0.083716,1.204285,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
155,1.226380,-1.776662,-1.749603,-2.086730,-1.815023,-0.701471,-1.979788,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
156,1.489831,-1.776662,-1.740273,-2.086730,-1.815023,-0.912808,-1.979788,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [116]:
# 데이터 분할해 
x_train, x_test, y_train, y_test = train_test_split(
    merge_df.drop("crew", axis = 1),
    y,
    test_size = 0.25,
    random_state = 666
)

y_pred = lr_org.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
mse


103.71813056655144