# Palmer Archipelago (Antarctica) penguin

### The dataset contains data for 344 penguins. There are 3 different species of penguins in this dataset, <br>collected from 3 islands in the Palmer Archipelago, Antarctica.

![image.png](../Images/Penguins.png)

해당 데이터는 2014년 남극 팽귄 공동체 내 생태학적 성적 이형성과 환경적 변동성에 관한 연구 데이터이며, 다양한 종의 펭귄들의 서식지와 주요 신체 크기와 무게에 대한 7가지 정도의 데이터로 이루어져 있다. Palmer's penguin이라는 데이터 세트로도 알려져 있으며 시각화 패키지중 하나인 seaborn 패키지에 예제 데이터 세트로 존재한다.

주어진 학습용 데이터(penguin_X_train.csv, penguin_X_test.csv)를 활용하여 해당 펭귄의 무게를 예측하는 회귀 모형을 만든 후,<br> 이를 평가용 데이터(penguin_X_test.csv)에 적용하여 얻은 무게 예측값을 .csv 파일로 저장한다.

### Library & Data Import

In [1]:
import pandas as pd
import numpy as np

In [2]:
X_test = pd.read_csv('../Datasets/Penguin_X_test.csv')
X_train = pd.read_csv('../Datasets/Penguin_X_train.csv')
y_train = pd.read_csv('../Datasets/Penguin_y_train.csv')

### 1. Data Exploration

In [3]:
X_test

Unnamed: 0,species,island,sex,bill_length_mm,bill_depth_mm,flipper_length_mm
0,Adelie,Torgersen,MALE,42.1,19.1,195.0
1,Gentoo,Biscoe,MALE,45.5,15.0,220.0
2,Adelie,Biscoe,MALE,40.6,18.8,193.0
3,Adelie,Dream,FEMALE,39.5,17.8,188.0
4,Gentoo,Biscoe,FEMALE,45.1,14.5,207.0
...,...,...,...,...,...,...
96,Gentoo,Biscoe,MALE,45.2,15.8,215.0
97,Adelie,Torgersen,FEMALE,40.3,18.0,195.0
98,Gentoo,Biscoe,FEMALE,46.5,14.5,213.0
99,Gentoo,Biscoe,MALE,49.6,15.0,216.0


In [4]:
X_train

Unnamed: 0,species,island,sex,bill_length_mm,bill_depth_mm,flipper_length_mm
0,Adelie,Torgersen,,42.0,20.2,190.0
1,Gentoo,Biscoe,FEMALE,43.5,15.2,213.0
2,Adelie,Torgersen,MALE,42.8,18.5,195.0
3,Chinstrap,Dream,MALE,53.5,19.9,205.0
4,Gentoo,Biscoe,MALE,50.2,14.3,218.0
...,...,...,...,...,...,...
235,Chinstrap,Dream,FEMALE,46.6,17.8,193.0
236,Gentoo,Biscoe,MALE,49.8,15.9,229.0
237,Adelie,Torgersen,FEMALE,34.6,17.2,189.0
238,Chinstrap,Dream,FEMALE,45.9,17.1,190.0


In [5]:
y_train

Unnamed: 0,body_mass_g
0,4250.0
1,4650.0
2,4250.0
3,4500.0
4,5700.0
...,...
235,3800.0
236,5950.0
237,3200.0
238,3575.0


In [6]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            240 non-null    object 
 1   island             240 non-null    object 
 2   sex                232 non-null    object 
 3   bill_length_mm     238 non-null    float64
 4   bill_depth_mm      238 non-null    float64
 5   flipper_length_mm  238 non-null    float64
dtypes: float64(3), object(3)
memory usage: 11.4+ KB


### 2. Data Preprocessing

#### (1) Missing Value

In [7]:
train =  pd.concat([X_train, y_train], axis=1)
print(train.loc[(train.sex.isna()) | (train.bill_length_mm.isna()) | (train.bill_depth_mm.isna()) | (train.flipper_length_mm.isna()) | (train.body_mass_g.isna())])

    species     island  sex  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0    Adelie  Torgersen  NaN            42.0           20.2              190.0   
6    Gentoo     Biscoe  NaN            44.5           14.3              216.0   
43   Gentoo     Biscoe  NaN             NaN            NaN                NaN   
66   Adelie  Torgersen  NaN            37.8           17.3              180.0   
88   Gentoo     Biscoe  NaN            47.3           13.8              216.0   
89   Adelie  Torgersen  NaN            37.8           17.1              186.0   
110  Gentoo     Biscoe  NaN            44.5           15.7              217.0   
229  Adelie  Torgersen  NaN             NaN            NaN                NaN   

     body_mass_g  
0         4250.0  
6         4100.0  
43           NaN  
66        3700.0  
88        4725.0  
89        3300.0  
110       4875.0  
229          NaN  


In [8]:
train = train.dropna()
train.reset_index(drop=True, inplace=True)

### 3. Data Modeling

#### (1) Data Split

In [9]:
Species = X_test['species'].copy()

X_train['sex'] = X_train['sex'].fillna(X_train['sex'].value_counts().idxmax())
X_train['bill_length_mm'] = X_train['bill_length_mm'].fillna(X_train['bill_length_mm'].mean())
X_train['bill_depth_mm'] = X_train['bill_depth_mm'].fillna(X_train['bill_depth_mm'].mean())
X_train['flipper_length_mm'] = X_train['flipper_length_mm'].fillna(X_train['flipper_length_mm'].mean())
y_train['body_mass_g'] = y_train['body_mass_g'].fillna(y_train['body_mass_g'].mean())

#### (2) One-Hot Encoding

In [10]:
from sklearn.preprocessing import OneHotEncoder

X_train_cat = X_train.select_dtypes('object').copy()
X_test_cat = X_test.select_dtypes('object').copy()

ohe = OneHotEncoder(sparse=False)
ohe.fit(X_train_cat)

X_train_ohe = ohe.transform(X_train_cat)
X_test_ohe = ohe.transform(X_test_cat)

#### (2) Scaling

In [11]:
from sklearn.preprocessing import MinMaxScaler

X_train_num = X_train.select_dtypes(exclude='object').copy()
X_test_num = X_test.select_dtypes(exclude='object').copy()

scaler = MinMaxScaler()
scaler.fit(X_train_num)

X_train_sca = scaler.transform(X_train_num)
X_test_sca = scaler.transform(X_test_num)

#### (3) Data Concat & Split

In [12]:
X_TRAIN = np.concatenate([X_train_ohe, X_train_sca], axis=1)
X_TEST = np.concatenate([X_test_ohe, X_test_sca], axis=1)

y_TRAIN = y_train['body_mass_g']

print(X_TRAIN.shape, X_TEST.shape, y_TRAIN.shape)

(240, 11) (101, 11) (240,)


In [13]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(X_TRAIN, y_TRAIN, test_size=0.25, random_state=2022)

print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

(180, 11) (60, 11) (180,) (60,)


### 4. Modeling

In [14]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

def make_models(xtrain, xtest, ytrain, ytest):
    model1 = DecisionTreeRegressor(random_state=2022).fit(xtrain, ytrain)
    print('model1 0', get_score(model1, xtrain, xtest, ytrain, ytest))
    
    for d in range(3,8):
        model1 = DecisionTreeRegressor(max_depth=d, random_state=2022).fit(xtrain, ytrain)
        print('model1', d, get_score(model1, xtrain, xtest, ytrain, ytest))
    
    base_model = DecisionTreeRegressor(random_state=2022)
    model2 = AdaBoostRegressor(n_estimators=500, base_estimator=base_model, random_state=2022).fit(xtrain, ytrain)
    print('model2', get_score(model2, xtrain, xtest, ytrain, ytest))
    
    model3 = GradientBoostingRegressor(random_state=2022).fit(xtrain, ytrain)
    print('model3', get_score(model3, xtrain, xtest, ytrain, ytest))
    
    model4 = RandomForestRegressor(random_state=2022).fit(xtrain, ytrain)
    print('model4', get_score(model4, xtrain, xtest, ytrain, ytest))
    
    for d in range(3,8):
        model1 = RandomForestRegressor(n_estimators=500, max_depth=d, random_state=2022).fit(xtrain, ytrain)
        print('model4', d, get_score(model4, xtrain, xtest, ytrain, ytest))

### 5. Model Evaluation

In [17]:
from sklearn.metrics import mean_squared_error

def get_score(model, xtrain, xtest, ytrain, ytest):
    A = model.score(xtrain, ytrain)
    ypred = model.predict(xtest)
    B = mean_squared_error(ytest, ypred, squared=False)
    
    return f'{A:.4} {B:.4}'

In [18]:
make_models(xtrain, xtest, ytrain, ytest)

model1 0 1.0 444.5
model1 3 0.8575 408.8
model1 4 0.8863 366.0
model1 5 0.9127 359.0
model1 6 0.9303 372.4
model1 7 0.9485 387.5
model2 0.9992 344.2
model3 0.9637 374.5
model4 0.9743 344.9
model4 3 0.9743 344.9
model4 4 0.9743 344.9
model4 5 0.9743 344.9
model4 6 0.9743 344.9
model4 7 0.9743 344.9


In [19]:
base_model = DecisionTreeRegressor(random_state=2022)
final_model = AdaBoostRegressor(n_estimators=500, base_estimator=base_model, random_state=2022).fit(xtrain, ytrain)
print('final_model', get_score(final_model, xtrain, xtest, ytrain, ytest))

final_model 0.9992 344.2


### 6. Save Result

In [20]:
y_pred = final_model.predict(X_TEST)

obj = {
    'Species' : Species,
    'Body_Mass_g' : y_pred
}

result = pd.DataFrame(obj)
result.to_csv('./result.csv', index=False)