# Penguin Datasets EDA & its Predictive Model #

Create a predictive model using any supervised ML techniques that can predict
the species using the features.
Describe the characteristics of the dataset and look for underlying patterns, using
exploratory data analysis, in order to build a hypothesis.
Implement the necessary pre-processing techniques to handle missing values or encode
the data into usable formats.

In [None]:
import numpy as np
import pandas as pd

: 

In [None]:
# Load the penguin data from the relative path "penguins.csv" and preview the first rows.
df=pd.read_csv("penguins.csv")
df.head()

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Summary statistics (count, mean, std, quartiles) of the columns describe() selects by default.
df.describe()

In [None]:
# Report the dataset dimensions: shape[0] is the row count, shape[1] the column count.
print(f'Total rows: {df.shape[0]}\nTotal columns: {df.shape[1]}')

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Frequency of each value in the sex column (value_counts skips missing values by default).
df['sex'].value_counts()

In [None]:
# Frequency of each island.
df['island'].value_counts()

In [None]:
# Frequency of each species (the prediction target), to check class balance.
df['species'].value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Histogram of every numeric column, drawn on a 10x8 inch figure.
df.hist(figsize=(10,8))
plt.show()

In [None]:
# Kernel density estimate for three of the numeric measurements, one figure per column.
for col in ['flipper_length_mm', 'bill_length_mm', 'body_mass_g']:
    sns.kdeplot(df[col])
    plt.title(f"Distribution of {col}")
    # show() inside the loop so each column gets its own figure
    plt.show()

In [None]:
# Box plot for three of the numeric measurements, one figure per column, to spot outliers.
for col in ['flipper_length_mm', 'bill_length_mm', 'body_mass_g']:
    sns.boxplot(x=df[col])
    plt.title(f"Boxplot of {col}")
    # show() inside the loop so each column gets its own figure
    plt.show()

In [None]:
# Bar chart of the number of rows per sex value.
sns.countplot(x='sex',data=df)
plt.show()

In [None]:
# Bar chart of the number of rows per island.
sns.countplot(x='island',data=df)
plt.show()

In [None]:
# Bar chart of the number of rows per species.
sns.countplot(x='species',data=df)
plt.show()

In [None]:
# Pairwise scatter plots of the numeric columns, coloured by species, with KDE curves
# on the diagonal; used to see which feature pairs separate the species.
sns.pairplot(df,hue="species",diag_kind='kde')
plt.show()

In [None]:
# Correlation heatmap of the numeric measurements.
# df still contains string columns at this point (species, island, sex), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so the
# correlation is computed on the numeric columns only.
plt.figure(figsize=(10,8))
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(),annot=True,cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Count the missing fields in each row, then display the rows missing more than two.
missing_per_row = df.isna().sum(axis=1)
rows_missing = df.loc[missing_per_row > 2]
print(rows_missing)

In [None]:
# Keep only rows with at most two missing values.
# .copy() makes df an independent frame rather than a slice of the original,
# so the column assignments in later cells do not raise SettingWithCopyWarning.
df = df[df.isnull().sum(axis=1) <= 2].copy()

In [None]:
# Impute the remaining missing values: median for the numeric measurements,
# most frequent value (mode) for sex.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df[col]: that form is chained assignment, which is
# deprecated and does not modify df under pandas copy-on-write.
for col in ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']:
    df[col] = df[col].fillna(df[col].median())
df['sex'] = df['sex'].fillna(df['sex'].mode()[0])

In [None]:
# Preview the first rows after cleaning.
df.head()

### Categorical encoding of species ###

In [None]:
# Replace each species name with an integer class label.
# Series.map turns any name missing from the mapping into NaN.
species_map = dict(Adelie=0, Gentoo=1, Chinstrap=2)
df['species'] = df['species'].map(species_map)

: 

### Binary encoding of sex ###

In [None]:
# Encode sex as a binary feature: MALE -> 1, FEMALE -> 0.
# NOTE(review): the keys are upper-case; any other spelling in the CSV would
# map to NaN - confirm against the values shown by df['sex'].value_counts().
s_map = dict(MALE=1, FEMALE=0)
df['sex'] = df['sex'].map(s_map)

### One hot encoding ###

In [None]:
# One-hot encode island; drop_first=True omits the first category's indicator
# column, so that category is represented by all remaining indicators being 0.
df = pd.get_dummies(
    data=df,
    columns=['island'],
    drop_first=True,
)

: 

### Separating features and target ###

In [None]:
# Split the frame into the feature matrix x and the target vector y.
# The features are requested as float explicitly: recent pandas versions emit
# bool dummy columns, and mixing bool with float columns makes `.values`
# return an object array, on which the distance arithmetic is slow and fragile.
x = df.drop('species', axis=1).to_numpy(dtype=float)
y = df['species'].to_numpy()

: 

### Splitting into train and test 80/20 ###

In [None]:
# Shuffle the row indices reproducibly and hold out the last 20% as a test set.
np.random.seed(42)  # fixed seed so the split is the same on every run
indices = np.random.permutation(len(x))
train_size = int(0.8 * len(x))
train_idx = indices[:train_size]
# Was misspelled `text_idx`, which left `test_idx` undefined (NameError below).
test_idx = indices[train_size:]
x_train, x_test = x[train_idx], x[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Implementing KNN #

In [None]:
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between two equal-length vectors.

    Args:
        a: 1-D array-like of numbers.
        b: 1-D array-like of numbers, same length as ``a``.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    # Square each difference *before* summing; squaring the sum instead lets
    # positive and negative differences cancel each other out.
    return np.sqrt(np.sum(diff ** 2))


def knn_predict(x_train, y_train, x_test, k=5):
    """Classify each row of ``x_test`` by majority vote of its k nearest neighbours.

    Args:
        x_train: 2-D array-like of training features, one row per sample.
        y_train: 1-D array-like of non-negative integer class labels
            (a requirement of ``np.bincount``), aligned with ``x_train``.
        x_test: 2-D array-like of samples to classify.
        k: Number of nearest training samples that vote.

    Returns:
        A NumPy array holding one predicted label per row of ``x_test``.
        When several labels tie, the smallest label wins, because ``argmax``
        returns the first maximum.
    """
    # Coerce once up front so object-dtype input is handled as plain floats.
    x_train = np.asarray(x_train, dtype=float)
    y_train = np.asarray(y_train)
    predictions = []
    for query in np.asarray(x_test, dtype=float):
        distances = [euclidean_distance(query, sample) for sample in x_train]
        nearest = np.argsort(distances)[:k]
        votes = np.bincount(y_train[nearest])
        predictions.append(votes.argmax())
    return np.array(predictions)

## Making predictions ##

In [None]:
# Classify the held-out test rows using the 5 nearest training neighbours.
y_pred=knn_predict(x_train,y_train,x_test,k=5)

## Evaluation ##

In [None]:
# Accuracy is the fraction of test predictions that equal the true label.
is_correct = y_pred == y_test
accuracy = np.mean(is_correct)
print(f"Accuracy: {accuracy*100:.2f}%")