# Palmer Archipelago (Antarctica) penguin

### The dataset contains data for 344 penguins. There are 3 different species of penguins in this dataset, <br>collected from 3 islands in the Palmer Archipelago, Antarctica.

![image.png](../Images/Penguins.png)

해당 데이터는 2014년 남극 펭귄 공동체 내 생태학적 성적 이형성과 환경적 변동성에 관한 연구 데이터이며, 다양한 종의 펭귄들의 서식지와 주요 신체 크기와 무게에 대한 7가지 정도의 데이터로 이루어져 있다. Palmer's penguin이라는 데이터 세트로도 알려져 있으며 시각화 패키지 중 하나인 seaborn 패키지에 예제 데이터 세트로 존재한다.

주어진 학습용 데이터(penguin_X_train.csv, penguin_y_train.csv)를 활용하여 펭귄의 무게를 예측하는 회귀 모형을 만든 후,<br> 이를 평가용 데이터(penguin_X_test.csv)에 적용하여 얻은 무게 예측값을 .csv 파일로 저장한다.

### Library & Data Import

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Training features and the training target live in separate files;
# for the evaluation set only the feature file is loaded.
X_train = pd.read_csv('../Datasets/Penguin_X_train.csv')
y_train = pd.read_csv('../Datasets/Penguin_y_train.csv')
X_test = pd.read_csv('../Datasets/Penguin_X_test.csv')

### 1. Data Exploration

In [3]:
# Preview the evaluation feature table read from Penguin_X_test.csv.
X_test

Unnamed: 0,species,island,sex,bill_length_mm,bill_depth_mm,flipper_length_mm
0,Adelie,Torgersen,MALE,42.1,19.1,195.0
1,Gentoo,Biscoe,MALE,45.5,15.0,220.0
2,Adelie,Biscoe,MALE,40.6,18.8,193.0
3,Adelie,Dream,FEMALE,39.5,17.8,188.0
4,Gentoo,Biscoe,FEMALE,45.1,14.5,207.0
...,...,...,...,...,...,...
96,Gentoo,Biscoe,MALE,45.2,15.8,215.0
97,Adelie,Torgersen,FEMALE,40.3,18.0,195.0
98,Gentoo,Biscoe,FEMALE,46.5,14.5,213.0
99,Gentoo,Biscoe,MALE,49.6,15.0,216.0


In [4]:
# Preview the training feature table read from Penguin_X_train.csv.
X_train

Unnamed: 0,species,island,sex,bill_length_mm,bill_depth_mm,flipper_length_mm
0,Adelie,Torgersen,,42.0,20.2,190.0
1,Gentoo,Biscoe,FEMALE,43.5,15.2,213.0
2,Adelie,Torgersen,MALE,42.8,18.5,195.0
3,Chinstrap,Dream,MALE,53.5,19.9,205.0
4,Gentoo,Biscoe,MALE,50.2,14.3,218.0
...,...,...,...,...,...,...
235,Chinstrap,Dream,FEMALE,46.6,17.8,193.0
236,Gentoo,Biscoe,MALE,49.8,15.9,229.0
237,Adelie,Torgersen,FEMALE,34.6,17.2,189.0
238,Chinstrap,Dream,FEMALE,45.9,17.1,190.0


In [5]:
# Preview the training target table read from Penguin_y_train.csv.
y_train

Unnamed: 0,body_mass_g
0,4250.0
1,4650.0
2,4250.0
3,4500.0
4,5700.0
...,...
235,3800.0
236,5950.0
237,3200.0
238,3575.0


In [6]:
# Column dtypes and non-null counts of the training features; used to spot missing values.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            240 non-null    object 
 1   island             240 non-null    object 
 2   sex                232 non-null    object 
 3   bill_length_mm     238 non-null    float64
 4   bill_depth_mm      238 non-null    float64
 5   flipper_length_mm  238 non-null    float64
dtypes: float64(3), object(3)
memory usage: 11.4+ KB


### 2. Data Preprocessing

#### (1) Missing Value

In [7]:
# Put features and target side by side so incomplete rows can be inspected together.
train = pd.concat([X_train, y_train], axis=1)

# Show every row with a missing value in sex, a body measurement, or the target.
print(
    train.loc[
        train[['sex', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
        .isna()
        .any(axis=1)
    ]
)

    species     island  sex  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0    Adelie  Torgersen  NaN            42.0           20.2              190.0   
6    Gentoo     Biscoe  NaN            44.5           14.3              216.0   
43   Gentoo     Biscoe  NaN             NaN            NaN                NaN   
66   Adelie  Torgersen  NaN            37.8           17.3              180.0   
88   Gentoo     Biscoe  NaN            47.3           13.8              216.0   
89   Adelie  Torgersen  NaN            37.8           17.1              186.0   
110  Gentoo     Biscoe  NaN            44.5           15.7              217.0   
229  Adelie  Torgersen  NaN             NaN            NaN                NaN   

     body_mass_g  
0         4250.0  
6         4100.0  
43           NaN  
66        3700.0  
88        4725.0  
89        3300.0  
110       4875.0  
229          NaN  


In [8]:
# Discard every row that has at least one missing value, then renumber the
# remaining rows from 0 so the index is contiguous again.
train = train.dropna().reset_index(drop=True)

### 3. Data Modeling

#### (1) Data Split

In [9]:
# Separate the cleaned table back into the regression target and the model inputs.
y_train = train.loc[:, ['body_mass_g']]
X_train = train.loc[
    :,
    ['species', 'island', 'sex', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm'],
]

In [10]:
# Summary statistics of the numeric feature columns after incomplete rows were dropped.
X_train.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm
count,232.0,232.0,232.0
mean,43.990948,17.226293,200.681034
std,5.50976,1.964677,14.064231
min,32.1,13.2,172.0
25%,39.2,15.7,190.0
50%,44.95,17.35,197.0
75%,48.775,18.725,212.25
max,58.0,21.5,231.0


In [11]:
# Column roles for the preprocessing steps.
COL_Y = ["body_mass_g"]  # regression target
COL_CAT = ["species", "island", "sex"]  # categorical features
COL_NUM = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm"]  # numeric features
COL_DEL = []  # presumably columns to discard; empty here -- TODO confirm it is still needed

#### (2) One-Hot Encoding

In [12]:
from sklearn.preprocessing import OneHotEncoder

# Learn the category vocabulary from the training rows only, so nothing about the
# evaluation set leaks into preprocessing. handle_unknown='ignore' makes transform()
# encode a category that was not seen during fit as all zeros instead of raising.
ohe = OneHotEncoder(handle_unknown = 'ignore')
ohe.fit(X_train[COL_CAT])

# Encode both sets with the same fitted encoder so their columns match.
X_train_res = ohe.transform(X_train[COL_CAT])
X_test_res = ohe.transform(X_test[COL_CAT])

In [13]:
# Print the encoded training matrix. It is stored sparsely, so the output lists the
# (row, column) position and value of each non-zero entry rather than a dense table.
print(X_train_res)

  (0, 2)	1.0
  (0, 3)	1.0
  (0, 6)	1.0
  (1, 0)	1.0
  (1, 5)	1.0
  (1, 7)	1.0
  (2, 1)	1.0
  (2, 4)	1.0
  (2, 7)	1.0
  (3, 2)	1.0
  (3, 3)	1.0
  (3, 7)	1.0
  (4, 0)	1.0
  (4, 4)	1.0
  (4, 6)	1.0
  (5, 2)	1.0
  (5, 3)	1.0
  (5, 7)	1.0
  (6, 2)	1.0
  (6, 3)	1.0
  (6, 6)	1.0
  (7, 0)	1.0
  (7, 5)	1.0
  (7, 6)	1.0
  (8, 2)	1.0
  :	:
  (223, 6)	1.0
  (224, 2)	1.0
  (224, 3)	1.0
  (224, 7)	1.0
  (225, 0)	1.0
  (225, 4)	1.0
  (225, 7)	1.0
  (226, 1)	1.0
  (226, 4)	1.0
  (226, 7)	1.0
  (227, 1)	1.0
  (227, 4)	1.0
  (227, 6)	1.0
  (228, 2)	1.0
  (228, 3)	1.0
  (228, 7)	1.0
  (229, 0)	1.0
  (229, 5)	1.0
  (229, 6)	1.0
  (230, 1)	1.0
  (230, 4)	1.0
  (230, 6)	1.0
  (231, 2)	1.0
  (231, 3)	1.0
  (231, 6)	1.0


In [14]:
# Turn the sparse encoder output into labelled DataFrames. toarray() yields a plain
# ndarray (todense() returns the discouraged np.matrix type). Reusing the source
# frames' indexes guarantees that the column-wise concat below pairs each encoded
# row with its own numeric features, instead of relying on both sides happening to
# share a default 0..n-1 index.
X_train_ohe = pd.DataFrame(X_train_res.toarray(), columns = ohe.get_feature_names_out(), index = X_train.index)
X_test_ohe = pd.DataFrame(X_test_res.toarray(), columns = ohe.get_feature_names_out(), index = X_test.index)

print(X_train_ohe)

# Final design matrices: numeric columns followed by the one-hot columns.
X_train_fin = pd.concat([X_train[COL_NUM], X_train_ohe], axis=1)
X_test_fin = pd.concat([X_test[COL_NUM], X_test_ohe], axis=1)

     species_Adelie  species_Chinstrap  species_Gentoo  island_Biscoe  \
0               0.0                0.0             1.0            1.0   
1               1.0                0.0             0.0            0.0   
2               0.0                1.0             0.0            0.0   
3               0.0                0.0             1.0            1.0   
4               1.0                0.0             0.0            0.0   
..              ...                ...             ...            ...   
227             0.0                1.0             0.0            0.0   
228             0.0                0.0             1.0            1.0   
229             1.0                0.0             0.0            0.0   
230             0.0                1.0             0.0            0.0   
231             0.0                0.0             1.0            1.0   

     island_Dream  island_Torgersen  sex_FEMALE  sex_MALE  
0             0.0               0.0         1.0       0.0  
1  

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for validation. The fixed random_state makes the
# split -- and therefore the fitted coefficients and reported errors -- identical on
# every Restart & Run All.
X_tr, X_val, y_tr, y_val = train_test_split(X_train_fin, y_train, test_size=0.3, random_state=42)

#### (3) Scaling

In [16]:
from sklearn.preprocessing import MinMaxScaler

# Learn the scaling of each numeric column from the training split only, then apply
# that same mapping to the training, validation and evaluation frames.
scaler = MinMaxScaler()
scaler.fit(X_tr[COL_NUM])
for frame in (X_tr, X_val, X_test_fin):
    frame[COL_NUM] = scaler.transform(frame[COL_NUM])

### 4. Modeling

In [17]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split (fit() returns the estimator itself).
modelLR = LinearRegression().fit(X_tr, y_tr)

# Predict the held-out validation rows. The target was passed as a one-column
# DataFrame, so the predictions are printed as a two-dimensional, one-column array.
y_val_pred = modelLR.predict(X_val)
print(y_val_pred)

[[4785.63962672]
 [5698.35608272]
 [5413.39871408]
 [3233.60163035]
 [3320.82185126]
 [4326.81846307]
 [3450.13116872]
 [3515.08200627]
 [3925.03418115]
 [4986.02952388]
 [4075.30929802]
 [5538.12018124]
 [4261.4174491 ]
 [4626.99478054]
 [4262.11942223]
 [4970.43749824]
 [5299.45636969]
 [5599.42117531]
 [4015.88768166]
 [4205.85466059]
 [3923.01765944]
 [4735.22933274]
 [4184.22876261]
 [4719.47982588]
 [5443.66287704]
 [4195.36893661]
 [4345.90483   ]
 [4611.30610042]
 [5268.61303243]
 [5293.53045443]
 [4189.48351088]
 [5397.98992124]
 [5211.03189628]
 [3686.75270698]
 [4294.3637333 ]
 [3394.33994709]
 [4266.47511509]
 [5165.25202495]
 [5386.04125136]
 [4040.51485124]
 [4096.94423061]
 [4166.53741027]
 [3408.74271507]
 [3911.09984469]
 [3753.24737721]
 [5390.04099655]
 [5366.57654837]
 [4637.64702035]
 [3403.64544801]
 [3381.18236292]
 [5474.67937094]
 [3711.24184409]
 [3483.28643729]
 [5250.85077719]
 [4107.52556752]
 [3584.39167841]
 [5209.32478675]
 [3424.91714247]
 [4189.2912434

In [18]:
# Fitted intercept of the linear model.
print(modelLR.intercept_)

# Label each coefficient with its feature name and list them in ascending order.
coef = pd.Series(modelLR.coef_[0], index=X_train_fin.columns)
print(coef.sort_values(ascending=True))

[3148.69796357]
species_Chinstrap   -462.699668
species_Adelie      -209.244540
sex_FEMALE          -177.806964
island_Dream         -68.778188
island_Torgersen      25.571056
island_Biscoe         43.207132
sex_MALE             177.806964
bill_depth_mm        624.295485
bill_length_mm       638.397402
species_Gentoo       671.944208
flipper_length_mm    934.923519
dtype: float64


### 5. Model Evaluation

In [19]:
from sklearn.metrics import mean_squared_error, r2_score

# Validation error. RMSE is derived from MSE with np.sqrt rather than
# mean_squared_error(..., squared=False): the `squared` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the old call fails on current releases.
mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(mse)

print('MSE : {0:.3f} , RMSE : {1:.3F} '.format(mse , rmse))

MSE : 100172.576 , RMSE : 316.501 


### 6. Save Result

In [20]:
# Predict body mass for the evaluation rows from the encoded and scaled feature frame.
y_pred = modelLR.predict(X_test_fin)
print(y_pred)

[[4197.55494572]
 [5267.9519416 ]
 [4123.96097932]
 [3474.80119963]
 [4658.87033601]
 [4008.07723121]
 [3421.84840331]
 [3854.91719595]
 [3575.22272171]
 [3518.63456368]
 [3409.49038421]
 [5473.07357768]
 [3663.42540282]
 [4082.50396175]
 [3871.67651251]
 [4103.85900706]
 [4430.24104987]
 [3502.32850973]
 [3618.02623934]
 [3426.83427793]
 [5303.50220367]
 [4260.26376229]
 [5924.39073382]
 [4632.15287433]
 [5360.86969365]
 [5490.58169722]
 [5501.46773552]
 [4305.40363258]
 [3438.9595398 ]
 [4029.55468258]
 [4213.47875457]
 [3427.74523543]
 [3587.12530281]
 [4579.90840297]
 [3385.06555819]
 [3551.71322631]
 [3311.65815614]
 [3498.46788757]
 [3924.5911093 ]
 [4948.45177547]
 [3532.15861345]
 [5598.03910438]
 [4987.23937563]
 [4626.10996738]
 [5801.49878225]
 [3838.24705971]
 [4739.45280026]
 [3327.79894271]
 [5351.11159058]
 [4279.43472851]
 [4606.09650347]
 [4727.15427767]
 [4185.65325309]
 [3838.95845935]
 [3239.04427321]
 [4401.80650659]
 [4802.86785899]
 [4563.45731576]
 [3823.7705213

In [21]:
# Take the first (only) column of the prediction array and write it, without the
# DataFrame index, as a single body_mass_g column to result.csv.
pd.DataFrame({'body_mass_g': y_pred[:,0]}).to_csv('./result.csv', index=False)