### Libraries

In [None]:
# Get libraries
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
from pycaret.classification import *
from pycaret.regression import *
from scipy.stats import chi2_contingency

### Data

In [None]:
# Fetch the "penguins" example dataset through seaborn and keep it as df
df = sns.load_dataset("penguins")
# Show the first rows as the cell output
df.head()

In [None]:
# Dimensions of the loaded frame, shown as the cell output
df.shape

In [None]:
# df.info()

### Relationships between variables

#### General

In [None]:
# Plot categorical variables: one horizontal bar chart of value counts per
# column, laid out side by side in a single row.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 2))
for axis, column in zip((ax1, ax2, ax3), ("species", "island", "sex")):
    # Draw on the intended axes explicitly instead of relying on the
    # "current axes" state.
    df[column].value_counts().plot(kind="barh", ax=axis)
    axis.set_title(column)
plt.tight_layout()

##### Co-occurrence

In [None]:
# Cooccurrence of species and island, including a "Total" margin row/column:
# raw counts on the left, normalized values on the right.
crosstab_options = dict(margins=True, margins_name="Total")
counts = pd.crosstab(df["species"], df["island"], normalize=False, **crosstab_options)
shares = pd.crosstab(df["species"], df["island"], normalize=True, **crosstab_options)

# Styling shared by both heatmaps; only the number format differs.
heatmap_style = dict(cmap="Blues", annot=True, cbar=False)

fig = plt.figure(figsize=(12, 4))

ax1 = fig.add_subplot(1, 2, 1)
sns.heatmap(counts, fmt="g", ax=ax1, **heatmap_style)
ax1.set_title("Cooccurrence - absolute")

ax2 = fig.add_subplot(1, 2, 2)
sns.heatmap(shares, fmt=".3g", ax=ax2, **heatmap_style)
ax2.set_title("Cooccurrence - normalized")

plt.tight_layout()

##### Chi^2 test for independence

In [None]:
# Observed counts with species as rows and island as columns (no margins),
# kept as contingency_table and shown as the cell output.
contingency_table = pd.crosstab(index=df["species"], columns=df["island"])
contingency_table

In [None]:
# Chi^2 test on the species/island contingency table.
alpha = 0.05  # threshold the p-value is compared against
stat, p, dof, expected = chi2_contingency(contingency_table)
print("p value is", p)
# interpret p-value
outcome = 'Dependent (reject H0)' if p <= alpha else 'Independent (H0 holds true)'
print(outcome)

##### Correlation

In [None]:
# Correlation
# Pearson correlation between the two measurements, read from the off-diagonal
# cell of the 2x2 correlation matrix with a single positional lookup
# (`.iloc[0, 1]` rather than the chained `.iloc[0][1]`, which indexes a
# label-indexed Series with an integer key).
pearson_r = df[["flipper_length_mm", "body_mass_g"]].corr().iloc[0, 1]
# The annotation below is labelled R^2, so square r: for a linear fit with a
# single predictor, the coefficient of determination equals r squared.
r_2 = pearson_r ** 2

fig = plt.figure(figsize=(7,5))
fig.add_subplot(111)
p1 = sns.regplot(data=df, x="flipper_length_mm", y="body_mass_g", fit_reg=True, marker="o", color="royalblue", scatter_kws={'s':40})
# Annotation is positioned in data coordinates (x=185, y=5750).
p1.text(185, 5750, f"R^2 = {r_2:.2f}", horizontalalignment='left', size='large', color='dimgray', weight='normal')
plt.grid()
plt.title("Correlation")
plt.tight_layout()

In [None]:
# Chi^2 for independence

#### Machine learning

##### Given flipper_length_mm, can we predict body_mass_g?

In [None]:
# Bind the regression API explicitly. pycaret.classification and
# pycaret.regression are both star-imported at the top of the notebook and
# export the same function names, so without this the functions used in this
# section would depend on import / cell execution order.
from pycaret.regression import compare_models, create_model, models, plot_model, setup

# Set up the regression experiment on the two-column frame, with body_mass_g
# as the target and flipper_length_mm as the only feature. The run is logged
# under the experiment name 'regression_test'.
reg1 = setup(df[["flipper_length_mm", "body_mass_g"]], target='body_mass_g', session_id=123, log_experiment=True, experiment_name='regression_test')

In [None]:
# Run PyCaret's model comparison with fold=5; the returned object is kept as best_model
best_model = compare_models(fold=5)

In [None]:
# Show what models() returns as the cell output
# (presumably the table of model IDs such as 'lr' used below — confirm)
models()

In [None]:
# Create the model identified by 'lr' and keep it as lr for the plots below
lr = create_model('lr')

In [None]:
# Open a 12x5 figure with a single axes before plotting
# (presumably so plot_model draws into it — TODO confirm), then draw the
# default plot for lr.
fig = plt.figure(figsize=(12, 5))
fig.add_subplot(1, 1, 1)
plot_model(lr)

In [None]:
# Open a square 7x7 figure with a single axes before plotting
# (presumably so plot_model draws into it — TODO confirm), then draw the
# 'error' plot for lr.
fig = plt.figure(figsize=(7, 7))
fig.add_subplot(1, 1, 1)
plot_model(lr, plot="error")

##### Given the island, can we predict the species?

In [None]:
# Switch to the classification API. The notebook star-imports
# pycaret.classification and then pycaret.regression, so the bare names
# (setup, compare_models, create_model, ...) refer to the regression versions
# at this point. Re-import the classification functions so this cell and the
# ones below operate on a classification experiment.
from pycaret.classification import compare_models, create_model, models, plot_model, setup, tune_model

# Set up the classification experiment on the two-column frame, with species
# as the target and island as the only feature. The run is logged under the
# experiment name 'corr_test'.
clf1 = setup(df[["island", "species"]], target='species', session_id=42, log_experiment=True, experiment_name='corr_test')

In [None]:
# Run the model comparison with default arguments; rebinds best_model,
# which earlier held the result from the body-mass section
best_model = compare_models()

In [None]:
# Create the model identified by 'lr' for this experiment; rebinds lr,
# which earlier held the model from the body-mass section
lr = create_model('lr')

In [None]:
# Tune lr and keep the result as tuned_lr.
# NOTE(review): the plot cells below pass lr, not tuned_lr — confirm that is intended.
tuned_lr = tune_model(lr)

In [None]:
# Default plot for lr (no plot= argument given)
plot_model(lr)

In [None]:
# Confusion-matrix plot for lr
plot_model(lr, plot="confusion_matrix")

In [None]:
# Class-report plot for lr
plot_model(lr, plot="class_report")

In [None]:
# 'pr' plot for lr
plot_model(lr, plot="pr")