# Palmer Archipelago (Antarctica) penguin - Linear Regression

### The dataset contains data for 344 penguins. There are 3 different species of penguins in this dataset, <br>collected from 3 islands in the Palmer Archipelago, Antarctica.

![image.png](../Images/Penguins.png)

해당 데이터는 2014년 남극 팽귄 공동체 내 생태학적 성적 이형성과 환경적 변동성에 관한 연구 데이터이며, 다양한 종의 펭귄들의 서식지와 주요 신체 크기와 무게에 대한 7가지 정도의 데이터로 이루어져 있다. Palmer's penguin이라는 데이터 세트로도 알려져 있으며 시각화 패키지중 하나인 seaborn 패키지에 예제 데이터 세트로 존재한다.

주어진 학습용 데이터(penguin_X_train.csv, penguin_X_test.csv)를 활용하여 해당 자동차 해당 펭귄의 무게를 예측하는 회귀 모형을 만든 후,<br> 이를 평가용 데이터(penguin_X_test.csv)에 적용하여 얻은 무게 예측값을 .csv 파일로 저장한다.

### Library & Data Import

In [2]:
import pandas as pd
import numpy as np

In [3]:
X_test = pd.read_csv('../Datasets/Penguin_X_test.csv')
X_train = pd.read_csv('../Datasets/Penguin_X_train.csv')
y_train = pd.read_csv('../Datasets/Penguin_y_train.csv')

### 1. 데이터 탐색

In [4]:
X_test

Unnamed: 0,species,island,sex,bill_length_mm,bill_depth_mm,flipper_length_mm
0,Adelie,Torgersen,MALE,42.1,19.1,195.0
1,Gentoo,Biscoe,MALE,45.5,15.0,220.0
2,Adelie,Biscoe,MALE,40.6,18.8,193.0
3,Adelie,Dream,FEMALE,39.5,17.8,188.0
4,Gentoo,Biscoe,FEMALE,45.1,14.5,207.0
...,...,...,...,...,...,...
96,Gentoo,Biscoe,MALE,45.2,15.8,215.0
97,Adelie,Torgersen,FEMALE,40.3,18.0,195.0
98,Gentoo,Biscoe,FEMALE,46.5,14.5,213.0
99,Gentoo,Biscoe,MALE,49.6,15.0,216.0


In [5]:
X_train

Unnamed: 0,species,island,sex,bill_length_mm,bill_depth_mm,flipper_length_mm
0,Adelie,Torgersen,,42.0,20.2,190.0
1,Gentoo,Biscoe,FEMALE,43.5,15.2,213.0
2,Adelie,Torgersen,MALE,42.8,18.5,195.0
3,Chinstrap,Dream,MALE,53.5,19.9,205.0
4,Gentoo,Biscoe,MALE,50.2,14.3,218.0
...,...,...,...,...,...,...
235,Chinstrap,Dream,FEMALE,46.6,17.8,193.0
236,Gentoo,Biscoe,MALE,49.8,15.9,229.0
237,Adelie,Torgersen,FEMALE,34.6,17.2,189.0
238,Chinstrap,Dream,FEMALE,45.9,17.1,190.0


In [6]:
y_train

Unnamed: 0,body_mass_g
0,4250.0
1,4650.0
2,4250.0
3,4500.0
4,5700.0
...,...
235,3800.0
236,5950.0
237,3200.0
238,3575.0


In [7]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            240 non-null    object 
 1   island             240 non-null    object 
 2   sex                232 non-null    object 
 3   bill_length_mm     238 non-null    float64
 4   bill_depth_mm      238 non-null    float64
 5   flipper_length_mm  238 non-null    float64
dtypes: float64(3), object(3)
memory usage: 11.4+ KB


In [8]:
train =  pd.concat([X_train, y_train], axis=1)
print(train.loc[(train.sex.isna()) | (train.bill_length_mm.isna()) | (train.bill_depth_mm.isna()) | (train.flipper_length_mm.isna()) | (train.body_mass_g.isna())])

    species     island  sex  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0    Adelie  Torgersen  NaN            42.0           20.2              190.0   
6    Gentoo     Biscoe  NaN            44.5           14.3              216.0   
43   Gentoo     Biscoe  NaN             NaN            NaN                NaN   
66   Adelie  Torgersen  NaN            37.8           17.3              180.0   
88   Gentoo     Biscoe  NaN            47.3           13.8              216.0   
89   Adelie  Torgersen  NaN            37.8           17.1              186.0   
110  Gentoo     Biscoe  NaN            44.5           15.7              217.0   
229  Adelie  Torgersen  NaN             NaN            NaN                NaN   

     body_mass_g  
0         4250.0  
6         4100.0  
43           NaN  
66        3700.0  
88        4725.0  
89        3300.0  
110       4875.0  
229          NaN  


In [9]:
train = train.dropna()
train.reset_index(drop=True, inplace=True)

In [10]:
X_train = train[['species','island', 'sex','bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']]
y_train = train[['body_mass_g']]

In [11]:
X_train.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm
count,232.0,232.0,232.0
mean,43.990948,17.226293,200.681034
std,5.50976,1.964677,14.064231
min,32.1,13.2,172.0
25%,39.2,15.7,190.0
50%,44.95,17.35,197.0
75%,48.775,18.725,212.25
max,58.0,21.5,231.0


In [12]:
COL_DEL = []
COL_NUM = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']
COL_CAT = ['species','island', 'sex']
COL_Y = ['body_mass_g']

In [13]:
X = pd.concat([X_train, X_test])

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown = 'ignore')
ohe.fit(X[COL_CAT])

X_train_res = ohe.transform(X_train[COL_CAT])
X_test_res = ohe.transform(X_test[COL_CAT])

In [14]:
print(X_train_res)

  (0, 2)	1.0
  (0, 3)	1.0
  (0, 6)	1.0
  (1, 0)	1.0
  (1, 5)	1.0
  (1, 7)	1.0
  (2, 1)	1.0
  (2, 4)	1.0
  (2, 7)	1.0
  (3, 2)	1.0
  (3, 3)	1.0
  (3, 7)	1.0
  (4, 0)	1.0
  (4, 4)	1.0
  (4, 6)	1.0
  (5, 2)	1.0
  (5, 3)	1.0
  (5, 7)	1.0
  (6, 2)	1.0
  (6, 3)	1.0
  (6, 6)	1.0
  (7, 0)	1.0
  (7, 5)	1.0
  (7, 6)	1.0
  (8, 2)	1.0
  :	:
  (223, 6)	1.0
  (224, 2)	1.0
  (224, 3)	1.0
  (224, 7)	1.0
  (225, 0)	1.0
  (225, 4)	1.0
  (225, 7)	1.0
  (226, 1)	1.0
  (226, 4)	1.0
  (226, 7)	1.0
  (227, 1)	1.0
  (227, 4)	1.0
  (227, 6)	1.0
  (228, 2)	1.0
  (228, 3)	1.0
  (228, 7)	1.0
  (229, 0)	1.0
  (229, 5)	1.0
  (229, 6)	1.0
  (230, 1)	1.0
  (230, 4)	1.0
  (230, 6)	1.0
  (231, 2)	1.0
  (231, 3)	1.0
  (231, 6)	1.0


In [15]:
X_train_ohe = pd.DataFrame(X_train_res.todense(), columns = ohe.get_feature_names())
X_test_ohe = pd.DataFrame(X_test_res.todense(), columns = ohe.get_feature_names())

print(X_train_ohe)

X_train_fin = pd.concat([X_train[COL_NUM], X_train_ohe], axis=1)
X_test_fin = pd.concat([X_test[COL_NUM], X_test_ohe], axis=1)

     x0_Adelie  x0_Chinstrap  x0_Gentoo  x1_Biscoe  x1_Dream  x1_Torgersen  \
0          0.0           0.0        1.0        1.0       0.0           0.0   
1          1.0           0.0        0.0        0.0       0.0           1.0   
2          0.0           1.0        0.0        0.0       1.0           0.0   
3          0.0           0.0        1.0        1.0       0.0           0.0   
4          1.0           0.0        0.0        0.0       1.0           0.0   
..         ...           ...        ...        ...       ...           ...   
227        0.0           1.0        0.0        0.0       1.0           0.0   
228        0.0           0.0        1.0        1.0       0.0           0.0   
229        1.0           0.0        0.0        0.0       0.0           1.0   
230        0.0           1.0        0.0        0.0       1.0           0.0   
231        0.0           0.0        1.0        1.0       0.0           0.0   

     x2_FEMALE  x2_MALE  
0          1.0      0.0  
1          



In [16]:
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X_train_fin, y_train, test_size=0.3)

In [17]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_tr[COL_NUM])
X_tr[COL_NUM]=scaler.transform(X_tr[COL_NUM])
X_val[COL_NUM]=scaler.transform(X_val[COL_NUM])
X_test_fin[COL_NUM]=scaler.transform(X_test_fin[COL_NUM])

In [18]:
from sklearn.linear_model import LinearRegression

modelLR = LinearRegression()
modelLR.fit(X_tr, y_tr)

y_val_pred = modelLR.predict(X_val)
print(y_val_pred)

[[3941.76741535]
 [5546.37801452]
 [5699.02567325]
 [4979.46050986]
 [4315.56050513]
 [4336.99355525]
 [4695.79109849]
 [3679.69523088]
 [4012.41811437]
 [3430.03666582]
 [3380.98994972]
 [5434.77191115]
 [4788.30382194]
 [5294.60519926]
 [3818.50672172]
 [3482.43612282]
 [4151.60241927]
 [3345.52127709]
 [5249.42419491]
 [4647.04394332]
 [4825.93424868]
 [3401.75185223]
 [4039.91812019]
 [3971.59318098]
 [4081.99271294]
 [3482.97770282]
 [3548.6370137 ]
 [3629.99227765]
 [4925.52805611]
 [3458.04668459]
 [3421.39417706]
 [5196.76385348]
 [3690.84773538]
 [3340.08333294]
 [5455.57043246]
 [3882.73073105]
 [4807.04542317]
 [4121.35940716]
 [4617.41883158]
 [5499.26866074]
 [5512.18737991]
 [3585.01439461]
 [3987.74408318]
 [3616.89766758]
 [3618.10939509]
 [3957.45703469]
 [4136.88476405]
 [4343.83836352]
 [4252.07975603]
 [4713.69056182]
 [4127.37701613]
 [4335.97942696]
 [4184.00497354]
 [4823.51554004]
 [5266.25610035]
 [3333.48134273]
 [3275.03889402]
 [4842.87385918]
 [4769.6868908

In [19]:
print(modelLR.intercept_)

coef = pd.Series(data=modelLR.coef_[0], index= X_train_fin.columns)
print(coef.sort_values())

[3311.68112137]
x0_Chinstrap         -347.696587
x0_Adelie            -225.133861
x2_FEMALE            -207.835264
x1_Dream              -82.521531
x1_Torgersen          -15.331894
x1_Biscoe              97.853425
x2_MALE               207.835264
bill_length_mm        383.153305
bill_depth_mm         389.933505
x0_Gentoo             572.830448
flipper_length_mm    1046.151034
dtype: float64


In [20]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_val, y_val_pred)
rmse = mean_squared_error(y_val, y_val_pred, squared=False)

print('MSE : {0:.3f} , RMSE : {1:.3F} '.format(mse , rmse))

MSE : 97284.682 , RMSE : 311.905 


In [21]:
y_pred = modelLR.predict(X_test_fin)
print(y_pred)

[[4120.79840263]
 [5341.8292133 ]
 [4158.30290368]
 [3407.46294635]
 [4665.19443548]
 [4016.23242298]
 [3391.04158767]
 [3779.01545533]
 [3573.90215396]
 [3475.42509689]
 [3343.92995011]
 [5495.51660915]
 [3675.27998696]
 [4102.03873503]
 [3801.51902006]
 [4143.8837518 ]
 [4415.30849204]
 [3563.95152135]
 [3598.9171375 ]
 [3371.26024932]
 [5327.49115755]
 [4182.86168735]
 [5858.66926506]
 [4690.45857563]
 [5376.4103031 ]
 [5506.65420316]
 [5509.6609662 ]
 [4211.30458094]
 [3458.26514939]
 [4011.80326867]
 [4134.36022255]
 [3386.66747971]
 [3572.69432698]
 [4621.48619733]
 [3314.21884474]
 [3486.25271521]
 [3355.09418117]
 [3386.73404882]
 [3934.1140333 ]
 [4967.21193821]
 [3456.12716256]
 [5598.48592338]
 [4951.70947521]
 [4702.95817611]
 [5758.76709868]
 [3803.51327156]
 [4771.57572105]
 [3383.26823502]
 [5403.42300439]
 [4289.98005072]
 [4678.58264826]
 [4728.36174263]
 [4224.58077842]
 [3872.59048263]
 [3225.59454145]
 [4369.92318874]
 [4853.74261345]
 [4617.63034119]
 [3896.4154207

In [22]:
pd.DataFrame({'body_mass_g': y_pred[:,0]}).to_csv('./result.csv', index=False)