In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
X_test = pd.read_csv('../Datasets/Penguin_X_test.csv')
X_train = pd.read_csv('../Datasets/Penguin_X_train.csv')
y_train = pd.read_csv('../Datasets/Penguin_y_train.csv')

In [3]:
X_train

Unnamed: 0,species,island,sex,bill_length_mm,bill_depth_mm,flipper_length_mm
0,Adelie,Torgersen,,42.0,20.2,190.0
1,Gentoo,Biscoe,FEMALE,43.5,15.2,213.0
2,Adelie,Torgersen,MALE,42.8,18.5,195.0
3,Chinstrap,Dream,MALE,53.5,19.9,205.0
4,Gentoo,Biscoe,MALE,50.2,14.3,218.0
...,...,...,...,...,...,...
235,Chinstrap,Dream,FEMALE,46.6,17.8,193.0
236,Gentoo,Biscoe,MALE,49.8,15.9,229.0
237,Adelie,Torgersen,FEMALE,34.6,17.2,189.0
238,Chinstrap,Dream,FEMALE,45.9,17.1,190.0


In [4]:
X_test

Unnamed: 0,species,island,sex,bill_length_mm,bill_depth_mm,flipper_length_mm
0,Adelie,Torgersen,MALE,42.1,19.1,195.0
1,Gentoo,Biscoe,MALE,45.5,15.0,220.0
2,Adelie,Biscoe,MALE,40.6,18.8,193.0
3,Adelie,Dream,FEMALE,39.5,17.8,188.0
4,Gentoo,Biscoe,FEMALE,45.1,14.5,207.0
...,...,...,...,...,...,...
96,Gentoo,Biscoe,MALE,45.2,15.8,215.0
97,Adelie,Torgersen,FEMALE,40.3,18.0,195.0
98,Gentoo,Biscoe,FEMALE,46.5,14.5,213.0
99,Gentoo,Biscoe,MALE,49.6,15.0,216.0


In [5]:
y_train

Unnamed: 0,body_mass_g
0,4250.0
1,4650.0
2,4250.0
3,4500.0
4,5700.0
...,...
235,3800.0
236,5950.0
237,3200.0
238,3575.0


In [6]:
X_train.isna().sum()

species              0
island               0
sex                  8
bill_length_mm       2
bill_depth_mm        2
flipper_length_mm    2
dtype: int64

In [7]:
X_test.isna().sum()

species              0
island               0
sex                  0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
dtype: int64

In [8]:
y_train.isna().sum()

body_mass_g    2
dtype: int64

In [26]:
Species = X_test['species'].copy()

X_train['sex'] = X_train['sex'].fillna(X_train['sex'].value_counts().idxmax())
X_train['bill_length_mm'] = X_train['bill_length_mm'].fillna(X_train['bill_length_mm'].mean())
X_train['bill_depth_mm'] = X_train['bill_depth_mm'].fillna(X_train['bill_depth_mm'].mean())
X_train['flipper_length_mm'] = X_train['flipper_length_mm'].fillna(X_train['flipper_length_mm'].mean())
y_train['body_mass_g'] = y_train['body_mass_g'].fillna(y_train['body_mass_g'].mean())

X_train_cat = X_train.select_dtypes('object').copy()
X_test_cat = X_test.select_dtypes('object').copy()

ohe = OneHotEncoder(sparse=False)
ohe.fit(X_train_cat)

X_train_ohe = ohe.transform(X_train_cat)
X_test_ohe = ohe.transform(X_test_cat)

X_train_num = X_train.select_dtypes(exclude='object').copy()
X_test_num = X_test.select_dtypes(exclude='object').copy()

scaler = MinMaxScaler()
scaler.fit(X_train_num)

X_train_sca = scaler.transform(X_train_num)
X_test_sca = scaler.transform(X_test_num)

X_TRAIN = np.concatenate([X_train_ohe, X_train_sca], axis=1)
X_TEST = np.concatenate([X_test_ohe, X_test_sca], axis=1)

y_TRAIN = y_train['body_mass_g']

print(X_TRAIN.shape, X_TEST.shape, y_TRAIN.shape)

xtrain, xtest, ytrain, ytest = train_test_split(X_TRAIN, y_TRAIN, test_size=0.25, random_state=2022)

print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

def make_models(xtrain, xtest, ytrain, ytest):
    model1 = DecisionTreeRegressor(random_state=2022).fit(xtrain, ytrain)
    print('model1 0', get_score(model1, xtrain, xtest, ytrain, ytest))
    
    for d in range(3,8):
        model1 = DecisionTreeRegressor(max_depth=d, random_state=2022).fit(xtrain, ytrain)
        print('model1', d, get_score(model1, xtrain, xtest, ytrain, ytest))
    
    base_model = DecisionTreeRegressor(random_state=2022)
    model2 = AdaBoostRegressor(n_estimators=500, base_estimator=base_model, random_state=2022).fit(xtrain, ytrain)
    print('model2', get_score(model2, xtrain, xtest, ytrain, ytest))
    
    model3 = GradientBoostingRegressor(random_state=2022).fit(xtrain, ytrain)
    print('model3', get_score(model3, xtrain, xtest, ytrain, ytest))
    
    model4 = RandomForestRegressor(random_state=2022).fit(xtrain, ytrain)
    print('model4', get_score(model4, xtrain, xtest, ytrain, ytest))
    
    for d in range(3,8):
        model1 = RandomForestRegressor(n_estimators=500, max_depth=d, random_state=2022).fit(xtrain, ytrain)
        print('model4', d, get_score(model4, xtrain, xtest, ytrain, ytest))
        
def get_score(model, xtrain, xtest, ytrain, ytest):
    A = model.score(xtrain, ytrain)
    ypred = model.predict(xtest)
    B = mean_squared_error(ytest, ypred)
    
    return f'{A:.4} {B:.4}'

make_models(xtrain, xtest, ytrain, ytest)

base_model = DecisionTreeRegressor(random_state=2022)
final_model = AdaBoostRegressor(n_estimators=500, base_estimator=base_model, random_state=2022).fit(xtrain, ytrain)
print('final_model', get_score(final_model, xtrain, xtest, ytrain, ytest))

y_pred = final_model.predict(X_TEST)

obj = {
    'Species' : Species,
    'Body_Mass_g' : y_pred
}

result = pd.DataFrame(obj)
result.to_csv('./result.csv', index=False)

(240, 11) (101, 11) (240,)
(180, 11) (60, 11) (180,) (60,)
model1 0 1.0 1.975e+05
model1 3 0.8575 1.671e+05
model1 4 0.8863 1.34e+05
model1 5 0.9127 1.289e+05
model1 6 0.9303 1.387e+05
model1 7 0.9485 1.502e+05
model2 0.9992 1.185e+05
model3 0.9637 1.402e+05
model4 0.9743 1.189e+05
model4 3 0.9743 1.189e+05
model4 4 0.9743 1.189e+05
model4 5 0.9743 1.189e+05
model4 6 0.9743 1.189e+05
model4 7 0.9743 1.189e+05
final_model 0.9992 1.185e+05
