In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Use seaborn's "whitegrid" style for every figure drawn in this notebook.
sns.set_style("whitegrid")

# Load the Palmer Penguins dataset through seaborn's example-dataset loader.
penguins = sns.load_dataset("penguins")

# Column dtypes and non-null counts. DataFrame.info() writes its report
# straight to stdout and returns None, so it must not be wrapped in print():
# doing so emits a stray "None" line after the report.
penguins.info()

# Preview the first rows. Leaving this as the cell's last expression lets the
# notebook render the frame as a formatted table instead of plain text.
penguins.head()

## Task 1: Exploring Distributions with Histograms

### Create histograms for ‘bill_length_mm’, ‘bill_depth_mm’, ‘flipper_length_mm’, and ‘body_mass_g’. 

In [None]:

# One histogram (with a KDE overlay) per numeric measurement, laid out 2x2.
measurement_columns = [
    "bill_length_mm",
    "bill_depth_mm",
    "flipper_length_mm",
    "body_mass_g",
]
fig, axes = plt.subplots(2, 2, figsize=(8, 8))
# axes.flat walks the grid row by row: top-left, top-right, bottom-left,
# bottom-right, which is the order the columns are listed in above.
for panel, column in zip(axes.flat, measurement_columns):
    sns.histplot(data=penguins, x=column, kde=True, ax=panel)
fig.tight_layout()
plt.show()

# Body mass again, this time with an explicit bin count of 35.
mass_ax = sns.histplot(data=penguins, x="body_mass_g", bins=35, kde=True)
mass_ax.set_title("Body Mass Distribution (35 bins)")
plt.show()

### Experiment with different numbers of bins to see how it affects the visualization.

### Try using sns.histplot() with the ‘kde’ parameter set to True to overlay a kernel density estimate.

In [None]:
# (column, grid position, bar colour) for each panel, listed in drawing order.
panel_specs = [
    ("bill_length_mm", (0, 0), "cornflowerblue"),
    ("bill_depth_mm", (1, 0), "darkorange"),
    ("body_mass_g", (0, 1), "forestgreen"),
    ("flipper_length_mm", (1, 1), "hotpink"),
]

fig, axes = plt.subplots(2, 2, figsize=(8, 8))
for column, position, color in panel_specs:
    # Histogram with a KDE overlay, drawn on the panel at (row, col).
    sns.histplot(data=penguins, x=column, kde=True, ax=axes[position], color=color)
fig.tight_layout()
plt.show()

## Task 2: Examining Correlations

#### 1. Use sns.pairplot() to create a grid of scatter plots for all numeric variables.

In [None]:
# Pairwise scatter plots for the numeric columns, with points coloured by
# species (hue) so per-species clusters are visible in every panel.
sns.pairplot(data=penguins, hue="species")
# Finish with plt.show(), like the histogram cells above, so the cell output
# is just the figure and not the text repr of the returned grid object.
plt.show()

#### 2. Modify the pairplot to show the species information using different colors.

#### 3. Interpret the pairplot: which variables seem to be most strongly correlated? Do you notice any patterns related to species?

Adelie and Chinstrap penguins appear to be the most similar to each other, based on the bill depth vs. body mass and bill depth vs. flipper length panels.

## Task 3: Investigating Relationships with Regression Plots

#### 1. Create a regression plot (sns.regplot) showing the relationship between ‘flipper_length_mm’ and ‘body_mass_g’.

In [None]:
# Scatter plot of body mass against flipper length with a fitted linear
# regression line drawn on top.
ax = sns.regplot(data=penguins, x="flipper_length_mm", y="body_mass_g")
# A title lets the figure stand on its own when the notebook is skimmed.
ax.set_title("Body mass vs. flipper length")
# plt.show() keeps the Axes repr out of the cell output.
plt.show()

#### 2. Create another regplot showing the relationship between ‘bill_length_mm’ and ‘bill_depth_mm’.

In [None]:
# Single regression line fitted over all penguins, ignoring species.
ax = sns.regplot(data=penguins, x="bill_length_mm", y="bill_depth_mm")
ax.set_title("Bill depth vs. bill length (all penguins)")
plt.show()

# Same variables with hue="species": lmplot builds its own figure and fits
# one regression line per species, each in its own colour.
sns.lmplot(data=penguins, x="bill_length_mm", y="bill_depth_mm", hue="species")
# plt.show() keeps the returned grid object's repr out of the cell output.
plt.show()

#### 3. Try adding the ‘species’ information to one of these plots using different colors. Hint: You might want to use sns.lmplot for this.

## Task 4: Joint Distribution Plots

#### 1. Create a joint plot for ‘flipper_length_mm’ and ‘body_mass_g’. 

In [None]:
# Joint distribution of flipper length and body mass, split by species.
# The y variable is body_mass_g, the pair the task statement asks for
# (the cell previously plotted bill_depth_mm on the y axis instead).
# kind="kde" draws density estimates rather than individual points.
sns.jointplot(
    data=penguins,
    x="flipper_length_mm",
    y="body_mass_g",
    hue="species",
    kind="kde",
)
# plt.show() keeps the returned grid object's repr out of the cell output.
plt.show()

#### 2. Experiment with different kind parameters in the joint plot (e.g., ‘scatter’, ‘kde’, ‘hex’).

#### 3. Create another joint plot, this time for ‘bill_length_mm’ and ‘bill_depth_mm’, colored by species.

In [None]:
# Hexbin view of bill length vs. bill depth for all penguins together.
# seaborn does not accept `hue` together with kind="hex", so this version
# cannot show species.
sns.jointplot(data=penguins, x="bill_length_mm", y="bill_depth_mm", kind="hex")
plt.show()

# Species-coloured version of the same pair, as the task asks for. The
# default scatter kind supports `hue`.
sns.jointplot(data=penguins, x="bill_length_mm", y="bill_depth_mm", hue="species")
# plt.show() keeps the returned grid object's repr out of the cell output.
plt.show()

# BONUS 
### Create a correlation matrix using the numerical columns in the dataset.

In [None]:
# Keep only the numeric columns; non-numeric ones cannot enter a correlation.
numeric_columns = penguins.select_dtypes(include="number")
# Pairwise correlation between every pair of numeric columns
# (DataFrame.corr() with its default settings).
corr_matrix = numeric_columns.corr()

In [None]:
# Correlation coefficients are signed and bounded by [-1, 1]. Pin the colour
# scale to that range and use a diverging palette so the neutral midpoint sits
# at 0 and positive and negative relationships get visually distinct hues.
ax = sns.heatmap(
    data=corr_matrix,
    annot=True,  # write each coefficient inside its cell
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
)
ax.set_title("Correlation matrix of numeric penguin measurements")
# plt.show() keeps the Axes repr out of the cell output.
plt.show()