In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 


#loading the data

In [2]:
# Load seaborn's "penguins" example dataset and preview the first five rows
df = sns.load_dataset("penguins")
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [3]:
# Dimensions of the raw dataset as a (rows, columns) tuple
df.shape

(344, 7)

In [4]:
# Per-column summary: dtype, non-null count and overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [5]:
# Count the missing values in each column
df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

#drop null values

In [6]:
# Remove every row containing at least one missing value. The result is
# rebound to `df` instead of mutating the frame in place, which keeps the
# step chainable and avoids the discouraged `inplace=True` idiom.
df = df.dropna()

In [7]:
# Confirm that no missing values remain in any column
df.isna().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

#feature engineering 

#Using One hot encoding to transform categorical data into numeric

In [8]:
# Distinct categories present in the `sex` column
df['sex'].unique()

array(['Male', 'Female'], dtype=object)

In [9]:
# Preview the one-hot encoding of `sex` (one indicator column per category).
# dtype=int forces 0/1 integers; pandas >= 2.0 would otherwise emit booleans.
pd.get_dummies(df['sex'], dtype=int).head()

Unnamed: 0,Female,Male
0,0,1
1,1,0
2,1,0
4,1,0
5,0,1


In [10]:
# One-hot encode `sex`. drop_first=True removes the first category column
# ('Female'), leaving a single 'Male' indicator that carries the same
# information. dtype=int keeps the indicator numeric (0/1); pandas >= 2.0
# would otherwise return True/False.
sex = pd.get_dummies(df['sex'], drop_first=True, dtype=int)
sex.head()

Unnamed: 0,Male
0,1
1,0
2,0
4,0
5,1


In [11]:
# Inspect the distinct `island` categories before one-hot encoding them
df['island'].unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [12]:
# Preview the one-hot encoding of `island` (one indicator column per island).
# dtype=int forces 0/1 integers; pandas >= 2.0 would otherwise emit booleans.
pd.get_dummies(df['island'], dtype=int).head()

Unnamed: 0,Biscoe,Dream,Torgersen
0,0,0,1
1,0,0,1
2,0,0,1
4,0,0,1
5,0,0,1


In [13]:
# One-hot encode `island`. drop_first=True removes the first category column
# ('Biscoe'), which is implied when both remaining indicators are 0.
# dtype=int keeps the indicators numeric (0/1) on pandas >= 2.0 as well.
island = pd.get_dummies(df['island'], drop_first=True, dtype=int)

In [14]:
# First five rows of the island indicator columns
island.head()

Unnamed: 0,Dream,Torgersen
0,0,1
1,0,1
2,0,1
4,0,1
5,0,1


In [15]:
# Append the island and sex indicator columns to the cleaned frame, side by side
new_data = pd.concat([df, island, sex], axis='columns')
new_data.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Dream,Torgersen,Male
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,0,1,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,0,1,0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,0,1,0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,0,1,0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male,0,1,1


In [16]:
# The original string columns are now redundant with their indicator columns.
# drop(columns=...) with rebinding replaces the in-place, axis-numbered call.
new_data = new_data.drop(columns=['sex', 'island'])
new_data.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Dream,Torgersen,Male
0,Adelie,39.1,18.7,181.0,3750.0,0,1,1
1,Adelie,39.5,17.4,186.0,3800.0,0,1,0
2,Adelie,40.3,18.0,195.0,3250.0,0,1,0
4,Adelie,36.7,19.3,193.0,3450.0,0,1,0
5,Adelie,39.3,20.6,190.0,3650.0,0,1,1


##creating separate target variable 


In [17]:
# Target variable: the species label of each penguin
y = new_data['species']
y.head()

0    Adelie
1    Adelie
2    Adelie
4    Adelie
5    Adelie
Name: species, dtype: object

In [18]:
# Distinct class labels present in the target
y.unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [19]:
# Encode each species name as an integer class label.
# The labels are derived from df['species'] (same rows and index as new_data,
# which was built by concatenating onto df) rather than from `y` itself, so
# re-running this cell does not map already-encoded integers to NaN.
species_codes = {'Adelie': 0, 'Chinstrap': 1, 'Gentoo': 2}
y = df['species'].map(species_codes)
y.head()

0    0
1    0
2    0
4    0
5    0
Name: species, dtype: int64

In [21]:
# Remove the target column so that only predictor columns remain.
# drop(columns=...) with rebinding replaces the in-place, axis-numbered call.
new_data = new_data.drop(columns='species')
new_data.head()


Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Dream,Torgersen,Male
0,39.1,18.7,181.0,3750.0,0,1,1
1,39.5,17.4,186.0,3800.0,0,1,0
2,40.3,18.0,195.0,3250.0,0,1,0
4,36.7,19.3,193.0,3450.0,0,1,0
5,39.3,20.6,190.0,3650.0,0,1,1


In [22]:
# Feature matrix: all remaining columns. Note that `x` is another name for the
# same DataFrame object as `new_data`, not a copy.
x=new_data

In [23]:
# Hold out 30% of the rows as a test set; the remainder is used for training
from sklearn.model_selection import train_test_split
# A fixed random_state makes the shuffle reproducible, so re-running the
# notebook yields the same split and therefore the same results
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)


##Training a random forest classifier on the training set with the entropy criterion

In [26]:
from sklearn.ensemble import RandomForestClassifier
# Forest of 5 trees that splits on entropy; random_state fixes the randomness
# so repeated fits give the same model
classifier = RandomForestClassifier(
    n_estimators=5,
    criterion='entropy',
    random_state=0,
)
classifier.fit(x_train, y_train)


#Predicting the test results 

In [28]:
# Predicted class labels (0, 1 or 2) for every row of the held-out test set
y_pred = classifier.predict(x_test)
y_pred

array([0, 0, 2, 0, 0, 0, 1, 2, 2, 1, 2, 0, 0, 1, 0, 0, 2, 0, 1, 0, 0, 0,
       2, 2, 2, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 1, 0, 2, 2, 0, 0,
       0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 0, 0,
       2, 2, 1, 2, 2, 1, 2, 1, 0, 2, 0, 2, 0, 2, 1, 2, 2, 2, 1, 2, 1, 0,
       0, 2, 2, 0, 2, 0, 2, 0, 2, 0, 2, 2])

#Checking the accuracy of the model using a confusion matrix

In [30]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score


In [31]:
# Rows are the true classes, columns the predicted classes
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[48  0  0]
 [ 2 14  0]
 [ 0  0 36]]


In [33]:
# Share of correctly classified test rows, expressed as a percentage
100 * accuracy_score(y_test, y_pred)

98.0

In [34]:
# Per-class precision, recall, F1 and support on the test set
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        48
           1       1.00      0.88      0.93        16
           2       1.00      1.00      1.00        36

    accuracy                           0.98       100
   macro avg       0.99      0.96      0.97       100
weighted avg       0.98      0.98      0.98       100



#Trying a different number of trees and the gini criterion

In [36]:
from sklearn.ensemble import RandomForestClassifier
# Second experiment: 7 trees with the gini criterion. Both n_estimators and
# criterion differ from the first model, and `classifier` is rebound, so the
# entropy model is no longer reachable after this cell.
classifier = RandomForestClassifier(
    n_estimators=7,
    criterion='gini',
    random_state=0,
)
classifier.fit(x_train, y_train)

In [37]:
# Predictions of the 7-tree gini model; this overwrites the earlier `y_pred`
y_pred = classifier.predict(x_test)

In [39]:
# Share of correctly classified test rows for the gini model, as a percentage
100 * accuracy_score(y_test, y_pred)

99.0

In [40]:
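#The 7-tree gini forest scored 99% versus 98% for the 5-tree entropy forest on this split.
#That is a difference of a single test row out of 100, and both the number of trees and the
#criterion were changed at once, so this result alone does not show that more trees make the
#model more accurate. Varying one setting at a time with cross-validation would be needed.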