# Palmer Penguins database ML implementation with Neural Networks
By: Francisco Olvera Hernández

In [104]:
import sklearn as sk
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

In [105]:
path = r"D:\Proyectos Machine Learning\Palmer Penguin database analysis with Python\penguins.csv" 
# Make sure to write here your CSV file path

## Reading data from the csv file

In [106]:
data = pd.read_csv(path)
data.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [107]:
data.shape # Just to know how many rows and columns my dataset has

(344, 8)

In [108]:
data.columns.values

array(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex', 'year'], dtype=object)

With pandas.describe() it's easy to know some interesting values from the dataframe

In [109]:
data.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year
count,342.0,342.0,342.0,342.0,344.0
mean,43.92193,17.15117,200.915205,4201.754386,2008.02907
std,5.459584,1.974793,14.061714,801.954536,0.818356
min,32.1,13.1,172.0,2700.0,2007.0
25%,39.225,15.6,190.0,3550.0,2007.0
50%,44.45,17.3,197.0,4050.0,2008.0
75%,48.5,18.7,213.0,4750.0,2009.0
max,59.6,21.5,231.0,6300.0,2009.0


Need to know column types

In [110]:
data.dtypes

species               object
island                object
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
year                   int64
dtype: object

## Data cleaning

There are a few missing values (NaN) in the database so I'll check how many there are in order to clean them.

In [111]:
pd.isnull(data["bill_length_mm"]) # Let's take this column as an example, then clean rows with missing values.

0      False
1      False
2      False
3       True
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
314    False
315    False
316    False
317    False
318    False
319    False
320    False
321    False
322    False
323    False
324    False
325    False
326    False
327    False
328    False
329    False
330    False
331    False
332    False
333    False
334    False
335    False
336    False
337    False
338    False
339    False
340    False
341    False
342    False
343    False
Name: bill_length_mm, Length: 344, dtype: bool

In [112]:
pd.isnull(data["bill_length_mm"]).values.ravel().sum() #There are two rows with NaN values only in that column

2

Drop all rows which have null values.

In [113]:
data = data.dropna()  # Cleaning according to NaN values in all rows
data

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007
6,Adelie,Torgersen,38.9,17.8,181.0,3625.0,female,2007
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,male,2007
12,Adelie,Torgersen,41.1,17.6,182.0,3200.0,female,2007
13,Adelie,Torgersen,38.6,21.2,191.0,3800.0,male,2007
14,Adelie,Torgersen,34.6,21.1,198.0,4400.0,male,2007


In [114]:
pd.isnull(data["bill_length_mm"]) # Check how many rows have valid data

0      False
1      False
2      False
4      False
5      False
6      False
7      False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
30     False
31     False
32     False
33     False
34     False
       ...  
314    False
315    False
316    False
317    False
318    False
319    False
320    False
321    False
322    False
323    False
324    False
325    False
326    False
327    False
328    False
329    False
330    False
331    False
332    False
333    False
334    False
335    False
336    False
337    False
338    False
339    False
340    False
341    False
342    False
343    False
Name: bill_length_mm, Length: 333, dtype: bool

### My training model will use 70% of the dataframe, rows are sorted by species so I'll shuffle the dataframe. 

In [115]:
data = sk.utils.shuffle(data)
data.reset_index(inplace = True, drop = True)  # Need to reset indices too
data

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Chinstrap,Dream,51.3,18.2,197.0,3750.0,male,2007
1,Gentoo,Biscoe,45.2,15.8,215.0,5300.0,male,2008
2,Adelie,Dream,41.1,17.5,190.0,3900.0,male,2009
3,Adelie,Biscoe,42.0,19.5,200.0,4050.0,male,2008
4,Adelie,Biscoe,36.4,17.1,184.0,2850.0,female,2008
5,Chinstrap,Dream,52.0,19.0,197.0,4150.0,male,2007
6,Gentoo,Biscoe,43.5,15.2,213.0,4650.0,female,2009
7,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,female,2009
8,Adelie,Biscoe,36.5,16.6,181.0,2850.0,female,2008
9,Adelie,Dream,41.5,18.5,201.0,4000.0,male,2009


### Need to make a backup to compare with all the rows after all experimentation and select only 70% of rows for training.

In [116]:
backup = data.copy() # Saving the 100% dataframe in backup

In [117]:
data = data.sample(frac = 0.7)
data.reset_index(inplace = True, drop = True)
data

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Gentoo,Biscoe,42.6,13.7,213.0,4950.0,female,2008
1,Gentoo,Biscoe,47.8,15.0,215.0,5650.0,male,2007
2,Chinstrap,Dream,48.5,17.5,191.0,3400.0,male,2007
3,Adelie,Dream,39.5,17.8,188.0,3300.0,female,2007
4,Chinstrap,Dream,52.0,18.1,201.0,4050.0,male,2007
5,Adelie,Dream,39.2,18.6,190.0,4250.0,male,2009
6,Gentoo,Biscoe,48.2,14.3,210.0,4600.0,female,2007
7,Gentoo,Biscoe,48.4,16.3,220.0,5400.0,male,2008
8,Adelie,Biscoe,39.7,17.7,193.0,3200.0,female,2009
9,Gentoo,Biscoe,45.1,14.5,207.0,5050.0,female,2007


### MLPClassifier will need X (bill_length_mm and flipper_length_mm). The neural network will predict penguin species according to sp column.

In [118]:
encoder = LabelEncoder() # To change raw data to numerical values.
# For 100% dataframe
backup['sp'] = encoder.fit_transform(backup.species.values) # Adding a new column with species numerical values
# For 70% dataframe
data['sp'] = encoder.fit_transform(data.species.values)  
print(data.sp.unique()) # Gentoo - 2   Chinstrap - 1    Adelie - 0
data 

[2 1 0]


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year,sp
0,Gentoo,Biscoe,42.6,13.7,213.0,4950.0,female,2008,2
1,Gentoo,Biscoe,47.8,15.0,215.0,5650.0,male,2007,2
2,Chinstrap,Dream,48.5,17.5,191.0,3400.0,male,2007,1
3,Adelie,Dream,39.5,17.8,188.0,3300.0,female,2007,0
4,Chinstrap,Dream,52.0,18.1,201.0,4050.0,male,2007,1
5,Adelie,Dream,39.2,18.6,190.0,4250.0,male,2009,0
6,Gentoo,Biscoe,48.2,14.3,210.0,4600.0,female,2007,2
7,Gentoo,Biscoe,48.4,16.3,220.0,5400.0,male,2008,2
8,Adelie,Biscoe,39.7,17.7,193.0,3200.0,female,2009,0
9,Gentoo,Biscoe,45.1,14.5,207.0,5050.0,female,2007,2


## Setting the Multi-Layer Perceptron

In [119]:
Xtrain = data[['bill_length_mm', 'flipper_length_mm']]  # only 70%
Xtest = backup[['bill_length_mm', 'flipper_length_mm']] # all data

In [120]:
Ytrain = data['sp']  # only 70%
Ytest = backup['sp'] # all data

### Using StandardScaler to fit with mean and std values for preprocessing.

In [121]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(Xtrain)
Xtrain = scaler.transform(Xtrain) # IF YOU DON'T PREPROCESS DATA, PREDICTIONS MIGHT BE WRONG
Xtest = scaler.transform(Xtest)

## Training phase with train data 70%

In [122]:
# For small datasets it's better to use lbfgs than adam solver. 
# My neural network will have 3 hidden layers with 15 perceptrons, sometimes alpha is used as 0.00001 but according to some books
# and theory on neural networks a 0.01 alpha will optimize the learning rate and get final weights faster.
mlp = MLPClassifier(hidden_layer_sizes=(15, 15, 15), max_iter=1000, alpha=0.01, solver='lbfgs', random_state=None, tol=1e-4)
mlp.fit(Xtrain, Ytrain)

MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(15, 15, 15), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

## A few predictions with trained data

In [123]:
predictions = mlp.predict(Xtrain) # With all trained bill and flipper lengths.
predict_example = mlp.predict(Xtrain[[0, 232]])  # Let's check what species the first and last penguin are.
predict_random_values = mlp.predict([[47.4,212.0]]) # This is a random penguin
                                                    # Remember: Gentoo - 2   Chinstrap - 1    Adelie - 0

print(predict_example)
print(predict_random_values)
print(predictions)
print("\n",classification_report(Ytrain, predictions))

[2 0]
[0]
[2 2 1 0 1 0 2 2 0 2 2 0 2 2 2 2 2 1 2 0 0 1 0 0 1 2 0 1 2 0 0 1 0 1 2 0 0
 1 2 1 0 2 0 0 0 2 2 1 2 2 0 2 0 2 2 2 0 0 0 2 2 1 0 0 1 0 2 1 2 0 1 2 0 2
 2 2 0 2 1 0 0 0 2 0 0 1 0 0 1 0 0 0 1 1 1 0 1 2 2 0 0 0 0 0 2 1 0 0 0 2 0
 0 0 1 0 1 2 0 0 0 1 2 2 2 0 2 1 0 0 0 0 0 0 0 0 1 2 0 2 0 0 2 1 2 2 1 0 2
 2 1 2 2 2 2 0 2 1 0 2 1 2 2 2 2 0 1 2 0 1 1 2 2 2 1 0 2 1 0 2 2 0 0 0 0 0
 2 1 1 1 0 0 2 2 2 2 0 2 2 0 2 2 0 0 0 0 2 0 2 2 0 0 2 2 1 0 0 1 1 2 2 0 1
 2 1 1 1 1 0 0 1 1 1 0]

               precision    recall  f1-score   support

           0       0.99      0.99      0.99        96
           1       0.98      0.98      0.98        52
           2       1.00      1.00      1.00        85

   micro avg       0.99      0.99      0.99       233
   macro avg       0.99      0.99      0.99       233
weighted avg       0.99      0.99      0.99       233



## Predictions with 100% dataset

In [124]:
final_predictions = mlp.predict(Xtest)
print(final_predictions)
print("\n",classification_report(Ytest, final_predictions))

[1 2 0 0 0 1 2 2 0 0 0 0 2 0 0 0 0 0 2 2 0 0 0 1 0 1 2 1 2 2 0 2 2 2 0 0 0
 2 1 1 1 1 2 2 0 1 1 0 2 1 2 0 0 0 1 1 1 1 2 2 2 0 1 0 0 2 0 1 0 2 0 0 0 1
 2 1 2 2 1 1 0 0 2 2 1 0 0 1 2 2 0 0 2 0 2 2 2 2 2 0 0 1 1 0 0 2 1 2 0 2 0
 1 2 0 2 2 0 1 2 2 2 2 1 2 2 0 0 2 0 0 1 1 0 2 2 0 0 0 1 0 2 0 0 0 1 0 1 2
 1 0 2 0 0 1 1 2 2 2 1 2 1 2 1 2 0 0 2 1 0 0 1 1 2 0 2 1 1 0 0 2 2 2 2 0 0
 1 0 2 2 0 2 0 1 2 1 2 2 0 0 2 0 1 2 0 2 1 1 1 2 2 0 1 0 2 2 0 0 1 0 0 2 0
 0 1 2 2 1 1 2 1 0 0 0 1 0 0 1 0 2 1 0 2 2 2 2 1 2 2 2 0 0 0 0 0 2 2 1 0 0
 1 1 0 0 0 0 0 2 2 2 0 0 0 2 0 2 0 2 0 2 0 2 2 0 2 2 2 2 1 2 0 2 0 2 0 2 0
 2 0 0 2 0 0 0 0 0 0 0 0 0 2 2 0 0 2 0 0 0 0 0 1 0 1 0 1 2 0 2 0 2 0 0 0 2]

               precision    recall  f1-score   support

           0       0.98      0.97      0.97       146
           1       0.93      0.96      0.94        68
           2       1.00      1.00      1.00       119

   micro avg       0.98      0.98      0.98       333
   macro avg       0.97      0.97      0.97       

I need to make a list to compare prediction output with dataset species values.
Remember that my 100% dataset is shuffled, so let's compare it with backup.

In [125]:
# Original dataset with species from csv
_list = [] # A list with two lengths and species id
species_backup = []
for row in backup.itertuples(index=True, name='Pandas'):
    dataset_list = []
    dataset_list.append(row.bill_length_mm)
    dataset_list.append(row.flipper_length_mm)
    dataset_list.append(row.sp)
    species_backup.append(row.sp)
    _list.append(dataset_list)
print(_list)

[[51.3, 197.0, 1], [45.2, 215.0, 2], [41.1, 190.0, 0], [42.0, 200.0, 0], [36.4, 184.0, 0], [52.0, 197.0, 1], [43.5, 213.0, 2], [47.2, 214.0, 2], [36.5, 181.0, 0], [41.5, 201.0, 0], [42.4, 181.0, 1], [40.2, 193.0, 0], [48.7, 222.0, 2], [36.7, 193.0, 0], [38.3, 189.0, 0], [37.3, 191.0, 0], [35.0, 192.0, 0], [41.5, 195.0, 0], [42.6, 213.0, 2], [50.2, 218.0, 2], [38.9, 190.0, 0], [40.8, 208.0, 0], [39.5, 178.0, 0], [50.0, 196.0, 1], [38.1, 187.0, 0], [47.0, 185.0, 1], [43.3, 209.0, 2], [55.8, 207.0, 1], [45.3, 210.0, 2], [47.4, 212.0, 2], [40.3, 195.0, 0], [49.8, 230.0, 2], [47.5, 209.0, 2], [53.4, 219.0, 2], [37.0, 185.0, 0], [41.4, 191.0, 0], [35.6, 191.0, 0], [46.4, 221.0, 2], [50.8, 210.0, 1], [47.6, 195.0, 1], [50.5, 200.0, 1], [52.2, 197.0, 1], [50.0, 224.0, 2], [46.1, 215.0, 2], [36.3, 190.0, 0], [43.2, 187.0, 1], [50.9, 196.0, 1], [38.1, 190.0, 0], [49.6, 216.0, 2], [51.0, 203.0, 1], [46.2, 221.0, 2], [38.1, 181.0, 0], [39.2, 196.0, 0], [38.6, 188.0, 0], [42.5, 187.0, 1], [47.5, 19

In [126]:
# Original dataset with species from final predictions
_list_predictions = [] # A list with two lengths and species id
i=0
for row in backup.itertuples(index=True, name='Pandas'):
    dataset_list_p = []
    dataset_list_p.append(row.bill_length_mm)
    dataset_list_p.append(row.flipper_length_mm)
    dataset_list_p.append(final_predictions[i])
    _list_predictions.append(dataset_list_p)
    i+=1
print(_list_predictions)

[[51.3, 197.0, 1], [45.2, 215.0, 2], [41.1, 190.0, 0], [42.0, 200.0, 0], [36.4, 184.0, 0], [52.0, 197.0, 1], [43.5, 213.0, 2], [47.2, 214.0, 2], [36.5, 181.0, 0], [41.5, 201.0, 0], [42.4, 181.0, 0], [40.2, 193.0, 0], [48.7, 222.0, 2], [36.7, 193.0, 0], [38.3, 189.0, 0], [37.3, 191.0, 0], [35.0, 192.0, 0], [41.5, 195.0, 0], [42.6, 213.0, 2], [50.2, 218.0, 2], [38.9, 190.0, 0], [40.8, 208.0, 0], [39.5, 178.0, 0], [50.0, 196.0, 1], [38.1, 187.0, 0], [47.0, 185.0, 1], [43.3, 209.0, 2], [55.8, 207.0, 1], [45.3, 210.0, 2], [47.4, 212.0, 2], [40.3, 195.0, 0], [49.8, 230.0, 2], [47.5, 209.0, 2], [53.4, 219.0, 2], [37.0, 185.0, 0], [41.4, 191.0, 0], [35.6, 191.0, 0], [46.4, 221.0, 2], [50.8, 210.0, 1], [47.6, 195.0, 1], [50.5, 200.0, 1], [52.2, 197.0, 1], [50.0, 224.0, 2], [46.1, 215.0, 2], [36.3, 190.0, 0], [43.2, 187.0, 1], [50.9, 196.0, 1], [38.1, 190.0, 0], [49.6, 216.0, 2], [51.0, 203.0, 1], [46.2, 221.0, 2], [38.1, 181.0, 0], [39.2, 196.0, 0], [38.6, 188.0, 0], [42.5, 187.0, 1], [47.5, 19

In [127]:
final_predictions.tolist() # Just to be able to compare both lists
final_predictions

array([1, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0,
       0, 1, 0, 1, 2, 1, 2, 2, 0, 2, 2, 2, 0, 0, 0, 2, 1, 1, 1, 1, 2, 2,
       0, 1, 1, 0, 2, 1, 2, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 0, 1, 0, 0, 2,
       0, 1, 0, 2, 0, 0, 0, 1, 2, 1, 2, 2, 1, 1, 0, 0, 2, 2, 1, 0, 0, 1,
       2, 2, 0, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 1, 1, 0, 0, 2, 1, 2, 0, 2,
       0, 1, 2, 0, 2, 2, 0, 1, 2, 2, 2, 2, 1, 2, 2, 0, 0, 2, 0, 0, 1, 1,
       0, 2, 2, 0, 0, 0, 1, 0, 2, 0, 0, 0, 1, 0, 1, 2, 1, 0, 2, 0, 0, 1,
       1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 0, 0, 2, 1, 0, 0, 1, 1, 2, 0, 2, 1,
       1, 0, 0, 2, 2, 2, 2, 0, 0, 1, 0, 2, 2, 0, 2, 0, 1, 2, 1, 2, 2, 0,
       0, 2, 0, 1, 2, 0, 2, 1, 1, 1, 2, 2, 0, 1, 0, 2, 2, 0, 0, 1, 0, 0,
       2, 0, 0, 1, 2, 2, 1, 1, 2, 1, 0, 0, 0, 1, 0, 0, 1, 0, 2, 1, 0, 2,
       2, 2, 2, 1, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 2, 2, 2, 0, 0, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 2, 0, 2, 2, 2,
       2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 0, 2, 0,

In [128]:
percentage = len(set(final_predictions) & set(species_backup)) / float(len(set(final_predictions) | set(species_backup))) * 100
print("Precision percentage amongst lists: ",percentage)

Precision percentage amongst lists:  100.0
