In [81]:
# Importing pandas and numpy

import pandas as pd
import numpy as np

In [82]:
# Read the penguin measurements from the CSV file in the working directory.

df = pd.read_csv(filepath_or_buffer="penguins_data.csv")

In [83]:
# Preview the first five rows to check that the file loaded as expected.

df.head(n=5)

Unnamed: 0,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
0,39.1,18.7,181.0,3750.0,0
1,39.5,17.4,186.0,3800.0,0
2,40.3,18.0,195.0,3250.0,0
3,,,,,0
4,36.7,19.3,193.0,3450.0,0


In [84]:
# Count the missing values in each column.

df.isnull().sum()

CulmenLength     2
CulmenDepth      2
FlipperLength    2
BodyMass         2
Species          0
dtype: int64

In [85]:
# Discard every row that contains at least one NaN. Only two rows are
# affected, so dropping them loses very little data.

df = df.dropna(axis=0, how="any")

In [86]:
# Preview the data again now that the incomplete rows have been removed.
df.head(n=5)

Unnamed: 0,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
0,39.1,18.7,181.0,3750.0,0
1,39.5,17.4,186.0,3800.0,0
2,40.3,18.0,195.0,3250.0,0
4,36.7,19.3,193.0,3450.0,0
5,39.3,20.6,190.0,3650.0,0


In [87]:
# Count how many rows belong to each species label.

df.loc[:, "Species"].value_counts()

0    151
1    123
2     68
Name: Species, dtype: int64

In [88]:
# Columns to include in the statistical summary: the four measurements plus
# the Species label.

features = [
    "CulmenLength",
    "CulmenDepth",
    "FlipperLength",
    "BodyMass",
    "Species",
]

In [89]:
# describe() returns a statistical summary (count, mean, std, quartiles, ...)
# of the selected columns.

df.loc[:, features].describe()

Unnamed: 0,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
count,342.0,342.0,342.0,342.0,342.0
mean,43.92193,17.15117,200.915205,4201.754386,0.75731
std,5.459584,1.974793,14.061714,801.954536,0.763648
min,32.1,13.1,172.0,2700.0,0.0
25%,39.225,15.6,190.0,3550.0,0.0
50%,44.45,17.3,197.0,4050.0,1.0
75%,48.5,18.7,213.0,4750.0,1.0
max,59.6,21.5,231.0,6300.0,2.0


In [90]:
# Number of (rows, columns) in the frame after the NaN rows were dropped.
df.shape


(342, 5)

In [91]:
# Confirm that no column has missing values any more.

df.isnull().sum()

CulmenLength     0
CulmenDepth      0
FlipperLength    0
BodyMass         0
Species          0
dtype: int64

In [92]:
from sklearn.preprocessing import MinMaxScaler

In [93]:
# Rescale the four measurement columns to the [0, 1] range.
#
# A single MinMaxScaler is fitted on all four columns at once; it scales each
# column independently, so the values match fitting one scaler per column.
# The fitted scaler is kept in `scaler` so that exactly the same
# transformation can be applied to new samples before calling predict().
#
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split later in the notebook, so the min/max of the test rows
# leak into the training features. Consider fitting on the training split only.
scaled_columns = ["CulmenLength", "CulmenDepth", "FlipperLength", "BodyMass"]
scaler = MinMaxScaler()
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

In [109]:
# Preview the frame after the measurement columns were rescaled.
df.head(n=5)

Unnamed: 0,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
0,0.254545,0.666667,0.152542,0.291667,0
1,0.269091,0.511905,0.237288,0.305556,0
2,0.298182,0.583333,0.389831,0.152778,0
4,0.167273,0.738095,0.355932,0.208333,0
5,0.261818,0.892857,0.305085,0.263889,0


In [94]:
# Names of the input (feature) columns; the target column "Species" is
# handled separately.

features = [
    "CulmenLength",
    "CulmenDepth",
    "FlipperLength",
    "BodyMass",
]

In [95]:
# Feature matrix: the selected feature columns as a NumPy array.
X = df[features].to_numpy()

# Target vector: the Species label of each row as a NumPy array.
Y = df["Species"].to_numpy()

In [96]:
# Peek at the first five rows of the feature matrix.
X[:5, :]

array([[0.25454545, 0.66666667, 0.15254237, 0.29166667],
       [0.26909091, 0.51190476, 0.23728814, 0.30555556],
       [0.29818182, 0.58333333, 0.38983051, 0.15277778],
       [0.16727273, 0.73809524, 0.3559322 , 0.20833333],
       [0.26181818, 0.89285714, 0.30508475, 0.26388889]])

In [97]:
# Peek at the first five target labels.
Y[0:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [98]:
# Import the helper used to split the data into training and testing sets.

from sklearn.model_selection import train_test_split

In [99]:
# Hold out 30% of the rows for testing.
# random_state fixes the shuffle so the split (and the reported accuracies)
# is the same on every run; stratify=Y keeps the species proportions the
# same in both subsets, which matters because the classes are imbalanced.

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42, stratify=Y
)

In [100]:
# We use the KNN, Decision Tree (DT) and Random Forest (RF) algorithms for training the model

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 

In [101]:
# Create and fit one classifier of each kind on the training split.
# random_state is fixed for the two tree-based models, which are randomized,
# so that re-running the notebook reproduces the same fitted models.
# KNN involves no randomness here and takes no seed.

model_RF = RandomForestClassifier(random_state=42).fit(X_train, Y_train)
model_DT = DecisionTreeClassifier(random_state=42).fit(X_train, Y_train)
model_KNN = KNeighborsClassifier(n_neighbors=12).fit(X_train, Y_train)

In [102]:
# Predict the species of every test row with each fitted model.
Y_pred_RF, Y_pred_DT, Y_pred_KNN = (
    fitted.predict(X_test) for fitted in (model_RF, model_DT, model_KNN)
)

In [103]:
#evaluation metric

from sklearn.metrics import accuracy_score

In [110]:
# Accuracy of each model on the test split, expressed as a percentage.

acc_RF, acc_DT, acc_KNN = (
    accuracy_score(Y_test, predicted) * 100
    for predicted in (Y_pred_RF, Y_pred_DT, Y_pred_KNN)
)

print("The accuracy for Random Forest is: ", acc_RF)
print("The accuracy for Decision Tree is: ", acc_DT)
print("The accuracy for KNN is: ", acc_KNN)

The accuracy for Random Forest is:  98.05825242718447
The accuracy for Decision Tree is:  99.02912621359224
The accuracy for KNN is:  98.05825242718447


In [105]:
# Show the kernel's working directory, i.e. where relative paths such as
# "penguin_model.pkl" are resolved. os.getcwd() is used instead of the bare
# `pwd` automagic so the cell also works where IPython automagic is unavailable.
import os

os.getcwd()

'C:\\Users\\Lenovo\\OneDrive\\Desktop\\Juypter_projects'

In [106]:
import joblib

In [107]:
# Persist the fitted Random Forest to disk with joblib (a pickle-based format).
# The model was trained on min-max scaled features, so inputs must be scaled
# the same way before this saved model is used for prediction.
joblib.dump(value=model_RF, filename="penguin_model.pkl")

['penguin_model.pkl']

In [108]:
# Restore the saved model from disk.
# joblib files are pickle-based: only load files from a trusted source.
loaded_model = joblib.load(filename="penguin_model.pkl")