# MATH 608 Week 16 Worksheet
## Penguins Dataset Analysis

In [33]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.pipeline import make_pipeline

In [40]:
# Load the dataset, drop rows with any missing value, and add the target
# column, all in one chain. Using .assign() builds the new column on the frame
# returned by dropna() instead of writing into it afterwards, which avoids
# pandas' chained-assignment (SettingWithCopy) warning in versions that emit it.
url = "https://raw.githubusercontent.com/roualdes/data/refs/heads/master/penguins.csv"
penguins = (
    pd.read_csv(url)
    .dropna()
    # Binary target for Biscoe island: 1 if island == 'Biscoe', otherwise 0.
    .assign(is_biscoe=lambda df: (df['island'] == 'Biscoe').astype(int))
)

# Prepare the features and target.
# X is kept as a one-column DataFrame (double brackets) because scikit-learn
# expects 2-D features; y is a single column selection and therefore already
# 1-D, so no reshape is needed before splitting or fitting.
X = penguins[['bill_depth_mm']]
y = penguins['is_biscoe']

# Split the data: 80% train / 20% test, seeded so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=608)
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year,is_biscoe
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007,0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007,0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007,0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007,0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007,0


## 1. Simple Logistic Regression

In [38]:
# Fit logistic regression model on the single feature bill_depth_mm.
# X_train / y_train come from the train/test split in the data-loading cell.
lr_model = LogisticRegression(random_state=608)
lr_model.fit(X_train, y_train)

# Predict the Biscoe probability and class for each bill depth of interest.
# The model was fitted on a DataFrame, so each query is a one-row DataFrame
# with the same column name; passing a bare NumPy array instead makes
# scikit-learn warn that X does not have valid feature names.
for bill_depth in (17.1, 18.1):
    new_penguin = pd.DataFrame({'bill_depth_mm': [bill_depth]})
    # predict_proba returns one row per sample; column 1 is the probability
    # of the positive class (is_biscoe == 1).
    prob_biscoe = lr_model.predict_proba(new_penguin)[0][1]
    print(f"Probability of being from Biscoe island for bill depth of {bill_depth} mm: {prob_biscoe}")
    # Predict class
    prediction = lr_model.predict(new_penguin)[0]
    print(f"Predicted class: {'Biscoe' if prediction == 1 else 'Not Biscoe'}")

# Confusion matrix and accuracy on the held-out test set
# (rows of the matrix are true classes, columns are predicted classes).
y_pred = lr_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
print(f"Accuracy: {accuracy}")

Probability of being from Biscoe island for bill depth of 17.1 mm: 0.5052002966072995
Predicted class: Biscoe
Probability of being from Biscoe island for bill depth of 18.1 mm: 0.27457712279904906
Predicted class: Not Biscoe
Confusion Matrix:
[[24  8]
 [10 25]]
Accuracy: 0.7313432835820896




## 2. Penalized Logistic Regression

### Prepare features with interaction term

In [42]:
# Prepare features with interaction term: the two bill measurements plus
# their product, in the column order depth, length, interaction.
X_interaction = penguins[['bill_depth_mm', 'bill_length_mm']].assign(
    interaction=lambda df: df['bill_depth_mm'] * df['bill_length_mm']
)

# Split the data. y, test_size and random_state match the earlier split, so
# y_train / y_test are rebound to the same rows as before.
X_train_inter, X_test_inter, y_train, y_test = train_test_split(X_interaction, y, test_size=0.2, random_state=608)

# Create pipeline with scaling and logistic regression.
# Standardising first puts all three features on a comparable scale before
# the L2 penalty is applied to their coefficients.
penalized_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000)
)

# Fit the model
penalized_model.fit(X_train_inter, y_train)

# Predict for specific penguin (bill depth 17.1 mm, bill length 45 mm).
# The query row is built with the same column names and the same interaction
# formula as the training frame, so the value cannot drift from the training
# definition and scikit-learn does not warn about missing feature names.
new_penguin_inter = pd.DataFrame(
    {'bill_depth_mm': [17.1], 'bill_length_mm': [45.0]}
).assign(interaction=lambda df: df['bill_depth_mm'] * df['bill_length_mm'])
prob_biscoe_inter = penalized_model.predict_proba(new_penguin_inter)[0][1]
print(f"Probability of being from Biscoe island bill depth of 17.1 mm and bill length of 45 mm: {prob_biscoe_inter:.4f}")

# Predict class
prediction_inter = penalized_model.predict(new_penguin_inter)[0]
print(f"Predicted class: {'Biscoe' if prediction_inter == 1 else 'Not Biscoe'}")

# Confusion matrix and accuracy on the held-out test set
y_pred_inter = penalized_model.predict(X_test_inter)
cm_inter = confusion_matrix(y_test, y_pred_inter)
accuracy_inter = accuracy_score(y_test, y_pred_inter)
print("Confusion Matrix:")
print(cm_inter)
print(f"Accuracy: {accuracy_inter:.4f}")


Probability of being from Biscoe island bill depth of 17.1 mm and bill length of 45 mm: 0.5099
Predicted class: Biscoe
Confusion Matrix:
[[25  7]
 [10 25]]
Accuracy: 0.7463




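Based on the comparison of test accuracies, the penalized model (0.7463) performs slightly better than the simple model (0.7313). According to the confusion matrices, this difference is a single test observation (50 vs. 49 correct out of 67).
The penalized model likely does better because it considers more columns when making predictions: both `bill_depth_mm` and `bill_length_mm`, plus their interaction term.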


## 3. Exploring Alternative Models

In [41]:
# Try polynomial features
from sklearn.preprocessing import PolynomialFeatures

# Raw measurements that will be expanded into degree-2 polynomial features
# (squares and pairwise products; include_bias=False omits the constant column).
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = penguins[['bill_depth_mm', 'bill_length_mm','flipper_length_mm']]

# Split the data BEFORE any feature engineering so that every preprocessing
# step is fitted on the training rows only. y and random_state match the
# earlier splits, so the same rows land in train and test.
X_train_poly, X_test_poly, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=608)

# Create pipeline with polynomial expansion, scaling and logistic regression.
# Keeping the expansion inside the pipeline means poly_model accepts the three
# raw measurements directly. The expansion is computed row by row, so the
# fitted model matches one trained on features expanded before the split.
poly_model = make_pipeline(
    poly,
    StandardScaler(),
    LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000, C=1.0)
)

# Fit the model
poly_model.fit(X_train_poly, y_train)

# Predict and evaluate on the held-out test set
y_pred_poly = poly_model.predict(X_test_poly)
accuracy_poly = accuracy_score(y_test, y_pred_poly)
cm_poly = confusion_matrix(y_test, y_pred_poly)

print("Polynomial Features Model:")
print("Confusion Matrix:")
print(cm_poly)
print(f"Accuracy: {accuracy_poly:.4f}")

# Print comparison of model accuracies.
# `accuracy` and `accuracy_inter` are produced by the two earlier modelling
# cells, so those cells must have been run before this one.
print("\nModel Comparison:")
print(f"Simple Logistic Regression Accuracy: {accuracy:.4f}")
print(f"Interaction Term Model Accuracy: {accuracy_inter:.4f}")
print(f"Polynomial Features Model Accuracy: {accuracy_poly:.4f}")

Polynomial Features Model:
Confusion Matrix:
[[30  2]
 [10 25]]
Accuracy: 0.8209

Model Comparison:
Simple Logistic Regression Accuracy: 0.7313
Interaction Term Model Accuracy: 0.7463
Polynomial Features Model Accuracy: 0.8209
