# Penguins dataset

https://allisonhorst.github.io/palmerpenguins/

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error
import sklearn.tree

In [None]:
# Load the "penguins" example dataset through seaborn (fetched on first use,
# then served from seaborn's local cache).
penguins = sns.load_dataset("penguins")

In [None]:
# Print each column's dtype and non-null count to spot missing values.
penguins.info()

In [None]:
# Drop rows that are missing either modelling column: flipper_length_mm is the
# feature and body_mass_g is the regression target used below, and
# LinearRegression cannot be fit on NaN values in either one.
penguins = penguins.dropna(subset=['flipper_length_mm', 'body_mass_g'])

In [None]:
# Re-check the non-null counts now that incomplete rows have been dropped.
penguins.info()

In [None]:
# Grid of pairwise plots of the numeric columns, colored by species.
sns.pairplot(data=penguins, hue='species')

In [None]:
# Flipper length (feature) against body mass (target) for every penguin.
sns.scatterplot(x='flipper_length_mm', y='body_mass_g', data=penguins)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Hold out a test split (scikit-learn's default size) with a fixed seed so
# the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    penguins[['flipper_length_mm']],  # double brackets keep the feature 2-D
    penguins['body_mass_g'],
    random_state=0,
)

In [None]:
# Least-squares fit of body mass on flipper length, using the training split only.
lin_reg = LinearRegression()
lin_reg.fit(X=x_train, y=y_train)

In [None]:
# Every penguin in seaborn's default color ...
sns.scatterplot(x='flipper_length_mm', y='body_mass_g', data=penguins)

# ... with the held-out test points drawn on top as black circles.
plt.plot(x_test, y_test, 'ko')

# A straight line is fixed by two points, so predicting at 170 mm and
# 230 mm is enough to draw the fitted model.
x_sample = pd.DataFrame({'flipper_length_mm': [170, 230]})
y_pred = lin_reg.predict(x_sample)
plt.plot(x_sample, y_pred)

In [None]:
# For a regressor, score() is the R^2 coefficient; evaluate it on the test split.
test_score = lin_reg.score(X=x_test, y=y_test)
print(f"R2 of Linear Regression: {test_score:.2f}")

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree on the same training split, with depth capped at 20.
tree_reg = DecisionTreeRegressor(max_depth=20)
tree_reg.fit(X=x_train, y=y_train)

In [None]:
# Display the test features ordered by flipper length; sort_values returns a
# sorted copy, so x_test itself is left unchanged.
x_test.sort_values('flipper_length_mm')

In [None]:
# Full dataset, then the held-out test points on top as black circles.
sns.scatterplot(x='flipper_length_mm', y='body_mass_g', data=penguins)
plt.plot(x_test, y_test, 'ko')

# Dense grid 170.0, 170.1, ..., 229.9 mm (600 points) so the tree's
# step-shaped prediction is drawn with visible jumps.
x_sample = pd.DataFrame(
    {'flipper_length_mm': [tenth / 10 for tenth in range(1700, 2300)]}
)
y_pred = tree_reg.predict(x_sample)
plt.plot(x_sample, y_pred, 'r', linewidth=3)

In [None]:
# R^2 of the decision tree on the held-out test split; the label names the
# model actually being scored (tree_reg, not lin_reg).
test_score = tree_reg.score(x_test, y_test)
print(f"R2 of Decision Tree: {test_score:.2f}")

In [None]:
# The same test-split evaluation through sklearn.metrics: R^2 and mean squared error.
r2score = r2_score(y_true=y_test, y_pred=tree_reg.predict(x_test))
msescore = mean_squared_error(y_true=y_test, y_pred=tree_reg.predict(x_test))

In [None]:
print("Testing score R2 : ", r2score)
# sqrt(MSE) is the root-mean-squared error, expressed in the same units as
# body_mass_g; it is not a standard deviation, so label it as RMSE.
print("Testing score RMSE : ", np.sqrt(msescore))

In [None]:
import ipywidgets

In [None]:
# Dense prediction grid: 170.0, 170.1, ..., 229.9 mm (600 points).
x_sample = pd.DataFrame(
    {'flipper_length_mm': [tenth / 10 for tenth in range(1700, 2300)]}
)

def treeline(md=1):
    """Fit a regression tree of depth ``md`` and plot its predictions.

    Trains a fresh DecisionTreeRegressor on the notebook-level
    ``x_train``/``y_train``, draws the full dataset with the test points
    (black circles) and the tree's prediction over ``x_sample`` (red line),
    prints the test-split R^2 and shows the figure.

    Args:
        md: Value passed as ``max_depth`` to the tree.
    """
    model = DecisionTreeRegressor(max_depth=md)
    model.fit(x_train, y_train)

    # Data first, then the held-out points, then the model's curve on top.
    sns.scatterplot(x='flipper_length_mm', y='body_mass_g', data=penguins)
    plt.plot(x_test, y_test, 'ko')
    curve = model.predict(x_sample)
    plt.plot(x_sample, curve, 'r', linewidth=3)

    test_r2 = r2_score(y_test, model.predict(x_test))
    print("Testing score R2 : ", test_r2)
    plt.show()
    
# Build a slider for md over 1..30 that re-runs treeline on every change; as
# the cell's last expression, the returned widget is what gets displayed.
ipywidgets.interactive(treeline,md=(1,30))

In [None]:
# Refit with a shallow tree (depth 3) so its structure is small enough to read.
tree_reg = DecisionTreeRegressor(max_depth=3)
tree_reg.fit(x_train, y_train)

sns.scatterplot(data=penguins,
                x='flipper_length_mm',
                y='body_mass_g')
plt.plot(x_test, y_test, 'ko')
# x_sample is the dense flipper-length grid built in an earlier cell.
y_pred = tree_reg.predict(x_sample)
plt.plot(x_sample, y_pred, 'r', linewidth=3)
r2score = r2_score(y_test, tree_reg.predict(x_test))
print("Testing score R2 : ", r2score)
plt.show()

# Text dump of the split rules, labelled with the real feature name so it
# matches the diagram drawn below.
text_representation = sklearn.tree.export_text(
    tree_reg, feature_names=['flipper_length_mm'])
print(text_representation)

plt.figure(figsize=(12, 8))
# class_names is not passed: it only applies to classifiers, and tree_reg is
# a regressor. The trailing ';' suppresses the echoed return value.
sklearn.tree.plot_tree(tree_reg,
                       feature_names=['flipper_length_mm'],
                       filled=True);


In [None]:
# !pip install dtreeviz

In [None]:
import dtreeviz

In [None]:
%%capture --no-display

# Wrap the fitted tree together with its training data for dtreeviz, then
# render the tree visualization.
# NOTE(review): tree_reg is a regressor, so class_names looks unnecessary
# here; confirm against the dtreeviz documentation before removing it.
vizmodel = dtreeviz.model(tree_reg, 
         x_train, 
         y_train,
         feature_names=['flipper_length_mm'],
         class_names=['body_mass_g'],
         target_name="body_mass_g")
vizmodel.view()

## Exercise

Adapt the above to do Linear Regression and Decision Tree Regression on last week's breast cancer dataset.
* Use "mean radius" as the feature variable and "mean concave points" as the target variable
* I have included a couple of cells to get you started.

In [None]:
import sklearn.datasets
import sklearn.model_selection

# return_X_y=True gives (features, target) instead of a Bunch, and
# as_frame=True makes them a pandas DataFrame and Series.
x, y = sklearn.datasets.load_breast_cancer(as_frame=True, return_X_y=True)

In [None]:
# Attach the target to the feature frame as a 'target' column. This mutates
# x in place, so x now holds the label alongside the features.
x['target'] = y

In [None]:
# Correlation (pandas default: Pearson) of every column with 'mean radius'.
x.corr()['mean radius']

In [None]:
# Feature ('mean radius') against target ('mean concave points') for the exercise.
sns.scatterplot(x='mean radius', y='mean concave points', data=x)

In [None]:
# split into training/test sets



In [None]:
# initialize the linear regression model


In [None]:
# train the linear regression model


In [None]:
# plot the trained linear model with the data


In [None]:
# print the R2 score of the model


## Decision Tree

In [None]:
# initialize the decision tree regression model


In [None]:
# train the decision tree regression model


In [None]:
# plot the decision tree's predictions with the data


In [None]:
# print out a text representation of the tree


In [None]:
# plot a visualization of the tree


In [None]:
# print the R2 score of the model


In [None]:
# make a sample data point of mean radius
# and predict its corresponding mean concave points value
