# 선형회귀

현대중공업과 계약을 맺어 일부 선박에 대한 예측 모델을 구축하게 됐습니다. 현대중공업은 세계 최대의 선박 제조업체 중 하나로, 유람선을 제작하고 있습니다.
당신은 선박에 필요한 선원 수를 정확하게 예측할 수 있도록 돕기 위해 울산에 있는 본사에 도착했습니다.
그들은 현재 새로운 선박을 건조하고 있으며 예측 모델을 만들고, 이를 사용하여 선박에 필요한 승무원 수를 예측하기를 원합니다.

지금까지의 데이터는 다음과 같습니다.

    Description: Measurements of ship size, capacity, crew, and age for 158 cruise
    ships.


    Variables/Columns
    Ship Name     1-20
    Cruise Line   21-40
    Age (as of 2013)   46-48
    Tonnage (1000s of tons)   50-56
    passengers (100s)   58-64
    Length (100s of feet)  66-72
    Cabins  (100s)   74-80
    Passenger Density   82-88
    Crew  (100s)   90-96
    
위 데이터는 "cruise_ship_info.csv"라는 CSV 파일에 저장되어 있습니다. 귀하의 임무는 향후 선박에 필요한 선원 수를 예측하는 데 도움이 되는 회귀 모델을 만드는 것입니다. 고객은 또한 크루즈 라인마다 허용되는 승무원 수에 차이가 있음을 발견했으므로, 크루즈 라인이 분석에 반드시 포함해야 할 가장 중요한 특성(feature)이라고 언급했습니다!

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

In [3]:
# Load the raw cruise-ship table from the local data directory.
df = pd.read_csv(filepath_or_buffer="./data/cruise_ship_info_example.csv")

In [4]:
# Remove the leftover CSV index column and the ship name; neither is
# used as a model input in this notebook.
df = df.drop(columns=["Unnamed: 0", "Ship_name"])

In [5]:
# Preview the first rows of the trimmed frame (head() shows 5 by default).
df.head()

Unnamed: 0,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew
0,Azamara,6,30.277,6.94,5.94,3.55,42.64,
1,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55
2,Carnival,26,47.262,14.86,7.22,7.43,31.8,
3,Carnival,11,110.0,29.74,9.53,14.88,36.99,19.1
4,Carnival,17,101.353,26.42,8.92,13.21,38.36,10.0


In [6]:
# List the column labels that remain after the drop.
df.columns

Index(['Cruise_line', 'Age', 'Tonnage', 'passengers', 'length', 'cabins',
       'passenger_density', 'crew'],
      dtype='object')

In [7]:
# Numeric feature columns that get standardized; the categorical
# "Cruise_line" and the target "crew" are deliberately left out.
scale_col = [
    "Age",
    "Tonnage",
    "passengers",
    "length",
    "cabins",
    "passenger_density",
]

In [8]:
# Standardize the numeric features (zero mean, unit variance per column).
# NOTE(review): the scaler is fitted on every row of df, i.e. before the
# train/test split performed later in the notebook, so held-out rows
# contribute to the mean/std used for scaling. Consider fitting on the
# training rows only.
df_ss = StandardScaler()
df_ss.fit(df[scale_col])
scaled_df = df_ss.transform(df[scale_col])

In [9]:
# Echo the standardized values (still a plain NumPy array at this point).
scaled_df

array([[-1.27640208e+00, -1.10498441e+00, -1.19395611e+00,
        -1.22533080e+00, -1.18458832e+00,  3.18056583e-01],
       [-1.27640208e+00, -1.10498441e+00, -1.19395611e+00,
        -1.22533080e+00, -1.18458832e+00,  3.18056583e-01],
       [ 1.35810515e+00, -6.47310033e-01, -3.72926345e-01,
        -5.09362635e-01, -3.14095387e-01, -9.40676396e-01],
       [-6.17775272e-01,  1.04321543e+00,  1.16961443e+00,
         7.82736160e-01,  1.35734078e+00, -3.38017340e-01],
       [ 1.72576898e-01,  8.10215117e-01,  8.25445390e-01,
         4.41532582e-01,  9.82669853e-01, -1.78933929e-01],
       [ 8.31203706e-01, -2.47273742e-02,  2.13819677e-01,
         2.34573035e-01,  3.07364771e-01, -6.51539392e-01],
       [-9.08738254e-02, -2.47273742e-02,  2.13819677e-01,
         2.34573035e-01,  3.07364771e-01, -6.51539392e-01],
       [ 9.62929067e-01, -2.47273742e-02,  2.17966292e-01,
         2.34573035e-01,  3.11851848e-01, -6.58506549e-01],
       [ 4.36027621e-01, -2.47273742e-02,  2.138

In [10]:
# Wrap the scaled array in a DataFrame so the columns keep their names.
scaled_df = pd.DataFrame(data=scaled_df, columns=scale_col)

In [11]:
# Re-attach the (unscaled) target column to the scaled features.
# The target is selected by its label rather than by position, so a change
# in column order cannot silently attach the wrong column; later cells
# already address it as "crew".
# reset_index(drop=True) gives the target the same 0..n-1 row labels as
# scaled_df, so the column-wise concat lines up row for row.
scaled_df = pd.concat(
    [scaled_df, df["crew"].reset_index(drop=True)],
    axis=1,
)

In [12]:
# Pairwise correlations computed only on rows whose crew value is known.
scaled_df.dropna(subset=["crew"]).corr()

Unnamed: 0,Age,Tonnage,passengers,length,cabins,passenger_density,crew
Age,1.0,-0.611074,-0.530998,-0.521201,-0.543746,-0.281846,-0.5547
Tonnage,-0.611074,1.0,0.949099,0.935228,0.959566,0.019446,0.933967
passengers,-0.530998,0.949099,1.0,0.90049,0.984567,-0.224288,0.920679
length,-0.521201,0.935228,0.90049,1.0,0.919576,-0.026614,0.918819
cabins,-0.543746,0.959566,0.984567,0.919576,1.0,-0.187582,0.95022
passenger_density,-0.281846,0.019446,-0.224288,-0.026614,-0.187582,1.0,-0.094822
crew,-0.5547,0.933967,0.920679,0.918819,0.95022,-0.094822,1.0


In [13]:
# One-hot encoder with default settings, used to encode the cruise line.
# NOTE(review): with the defaults every category gets its own column, so
# the dummy columns sum to 1 and are collinear with the intercept of the
# linear model fitted later - consider drop="first"; confirm before changing.
ohe = OneHotEncoder()

In [14]:
# Pull the cruise-line labels out as a 1-D NumPy array.
cruise_arr = np.array(df.loc[:, "Cruise_line"])

In [15]:
# The encoder expects a 2-D input: one column, as many rows as needed.
cruise_arr = cruise_arr.reshape(-1, 1)

In [16]:
# Learn the set of cruise lines, then encode every row as a one-hot vector.
cruise_name = ohe.fit(cruise_arr).transform(cruise_arr)

In [17]:
# Densify the encoded matrix and label each column with the encoder's
# generated feature name (e.g. "x0_Carnival").
ohe_df = pd.DataFrame(
    data=cruise_name.toarray(),
    columns=ohe.get_feature_names_out(),
)

In [18]:
# Put the scaled numeric features, the target and the one-hot cruise-line
# columns side by side in a single frame.
cruise_df = pd.concat([scaled_df, ohe_df], axis="columns")

In [19]:
# Keep only the rows with a known crew value; these are the labelled
# examples available for fitting and evaluation.
cr_nn_df = cruise_df.dropna(subset=["crew"])

In [20]:
# Display the labelled (non-null crew) rows.
cr_nn_df

Unnamed: 0,Age,Tonnage,passengers,length,cabins,passenger_density,crew,x0_Azamara,x0_Carnival,x0_Celebrity,...,x0_Oceania,x0_Orient,x0_P&O,x0_Princess,x0_Regent_Seven_Seas,x0_Royal_Caribbean,x0_Seabourn,x0_Silversea,x0_Star,x0_Windstar
1,-1.276402,-1.104984,-1.193956,-1.225331,-1.184588,0.318057,3.55,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.617775,1.043215,1.169614,0.782736,1.357341,-0.338017,19.10,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.172577,0.810215,0.825445,0.441533,0.982670,-0.178934,10.00,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.831204,-0.024727,0.213820,0.234573,0.307365,-0.651539,9.20,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,-0.090874,-0.024727,0.213820,0.234573,0.307365,-0.651539,9.20,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,0.831204,-1.830796,-1.844975,-2.987284,-1.907008,1.244688,0.59,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
154,-0.222599,0.148615,0.118448,0.368817,0.188457,-0.083716,12.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
155,1.226380,-1.776662,-1.749603,-2.086730,-1.815023,-0.701471,0.88,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
156,1.489831,-1.776662,-1.740273,-2.086730,-1.815023,-0.912808,0.88,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [21]:
# Features: everything except the target and the two columns excluded from
# the model (Age, passenger_density).
x = cr_nn_df.drop(columns=["Age", "passenger_density", "crew"])
# Target: crew size.
y = cr_nn_df.loc[:, "crew"]

In [22]:
# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps
# the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [23]:
# Linear regression model with default settings.
lr = LinearRegression()

In [24]:
# Fit the model on the training split.
lr.fit(x_train, y_train)

In [25]:
# Score the fitted model on the held-out split (R^2 for regressors).
lr.score(x_test, y_test)

0.9534946877075166

In [26]:
# Inspect the fitted intercept term.
lr.intercept_

8.09092263822805

In [27]:
# Predict crew size for the held-out rows.
pred = lr.predict(x_test)

In [28]:
# Mean squared error between the held-out targets and their predictions.
mse_pr = mean_squared_error(y_true=y_test, y_pred=pred)
print(mse_pr)

0.48088212197730845


In [30]:
# Second, separate LinearRegression instance; it is fitted in the next cell
# on the same training split that lr was fitted on.
test_lr = LinearRegression()

In [31]:
# Fit on the same (x_train, y_train) pair used for lr.
test_lr.fit(x_train, y_train)

In [32]:
# Predict crew size for every labelled row (same feature columns as x).
# NOTE(review): this set includes the rows the model was trained on, so
# these values are not purely out-of-sample predictions.
test_pred = test_lr.predict(
    cr_nn_df.drop(columns=["Age", "passenger_density", "crew"])
)

In [33]:
# Display the predictions for the labelled rows.
test_pred

array([ 3.55      , 12.95168088, 11.56194745,  9.26707641,  9.26707641,
        9.27939876,  9.26707641, 12.93386049,  9.26707641, 10.1766506 ,
        9.26707641, 11.03411405,  9.26707641, 10.14677886, 11.43976743,
       11.92529046,  8.57969403, 10.00204892, 10.00204892,  9.25702625,
       10.00204892,  0.47559548,  6.6335864 ,  3.32667815,  9.36806397,
        6.62419065, 11.05103355, 11.05103355,  3.16155182,  5.6791326 ,
       11.40831572,  8.10885768,  6.36      ,  5.74781583,  9.21      ,
       11.2928281 ,  9.45      ,  8.99667501,  4.98593332,  3.62491757,
        5.39229726,  5.39229726,  6.2638129 ,  6.2638129 , 13.59482104,
        7.06236682, 11.14028514,  7.33763318,  2.86233113,  4.47328925,
       10.24056897,  7.40852866, 10.73940527, 10.26103324,  4.2893603 ,
        8.87322947,  9.66367046,  6.36820785,  3.53811444,  4.        ,
        4.        ,  4.        ,  3.5       ,  9.21836671,  5.10996885,
        8.64074827,  8.65160598,  8.21097014, 12.11005846, 11.21

In [34]:
# Reload the raw, unscaled table so predictions can be shown next to the
# original, human-readable values.
test_df = pd.read_csv(filepath_or_buffer="./data/cruise_ship_info_example.csv")

In [35]:
# Keep only the rows with a known crew value, matching the rows that were
# predicted into test_pred.
test_df = test_df.loc[test_df["crew"].notna()]

In [36]:
# Renumber the rows 0..n-1 (discarding the old labels) so the column-wise
# concat with the prediction column in the next cell aligns row for row.
test_df = test_df.reset_index(drop = True)

In [37]:
# Append the predictions as a new column next to the actual crew values.
test_df = pd.concat(
    [test_df, pd.Series(test_pred, name="예측")],
    axis="columns",
)

In [38]:
# Display actual crew values alongside the predicted column.
test_df

Unnamed: 0.1,Unnamed: 0,Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew,예측
0,1,Quest,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55,3.550000
1,3,Conquest,Carnival,11,110.000,29.74,9.53,14.88,36.99,19.10,12.951681
2,4,Destiny,Carnival,17,101.353,26.42,8.92,13.21,38.36,10.00,11.561947
3,5,Ecstasy,Carnival,22,70.367,20.52,8.55,10.20,34.29,9.20,9.267076
4,6,Elation,Carnival,15,70.367,20.52,8.55,10.20,34.29,9.20,9.267076
...,...,...,...,...,...,...,...,...,...,...,...
105,153,Taurus,Star,22,3.341,0.66,2.79,0.33,50.62,0.59,1.715650
106,154,Virgo,Star,14,76.800,19.60,8.79,9.67,39.18,12.00,10.874350
107,155,Spirit,Windstar,25,5.350,1.58,4.40,0.74,33.86,0.88,0.712167
108,156,Star,Windstar,27,5.350,1.67,4.40,0.74,32.04,0.88,0.702223


In [39]:
# Select the rows whose crew value is missing; these are the ships whose
# crew size still has to be predicted.
test_x = cruise_df.loc[cruise_df["crew"].isnull()]

In [40]:
# Display the unlabelled rows (crew is NaN).
test_x

Unnamed: 0,Age,Tonnage,passengers,length,cabins,passenger_density,crew,x0_Azamara,x0_Carnival,x0_Celebrity,...,x0_Oceania,x0_Orient,x0_P&O,x0_Princess,x0_Regent_Seven_Seas,x0_Royal_Caribbean,x0_Seabourn,x0_Silversea,x0_Star,x0_Windstar
0,-1.276402,-1.104984,-1.193956,-1.225331,-1.184588,0.318057,,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.358105,-0.64731,-0.372926,-0.509363,-0.314095,-0.940676,,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,-1.276402,1.049655,1.922225,0.771549,1.355097,-1.174076,,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,1.621556,-0.679914,-0.408173,-0.481395,-0.352236,-0.949966,,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.304302,-0.024727,0.21382,0.234573,0.307365,-0.651539,,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,-1.012951,1.043215,1.169614,0.771549,1.355097,-0.338017,,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,-0.881226,0.463881,0.288459,0.838671,0.401593,0.205421,,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22,-0.881226,1.043215,1.169614,0.777143,1.355097,-0.338017,,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,0.172577,0.173216,0.045882,0.296102,0.116664,0.141555,,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29,-0.354325,0.531245,0.193087,0.849858,0.206406,0.566552,,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Reduce to the same feature columns the model was trained on.
test_x = test_x.drop(columns=["Age", "passenger_density", "crew"])

In [42]:
# Predict crew size for the rows where it is missing.
nan_pred = test_lr.predict(test_x)

In [43]:
# Display the predicted crew sizes for the unlabelled ships.
nan_pred

array([ 3.55      ,  6.757192  , 12.13360887,  6.66622357,  9.26707641,
       12.93386049, 10.19700538, 12.93858514,  9.2481866 , 10.00204892,
        7.2609984 ,  5.50473456,  9.38534   ,  9.80528069,  9.44724803,
        6.06012559,  5.38613608,  8.53339572,  5.86688321,  5.39229726,
        6.45506305,  8.52190499,  7.20930938,  4.53402602,  7.20458472,
        8.88184029,  8.87322947,  7.40852866, 12.03835827, 11.16645239,
        2.9380116 ,  4.73248357,  3.07582744,  8.17114251,  4.9960881 ,
        7.80491927,  7.07763985,  8.4301187 , 12.40205492, 20.87731028,
        7.50901836,  8.17114251,  6.78941436, 12.40205492,  1.6       ,
        2.78395394,  1.72037475,  8.3025197 ])