<a href="https://colab.research.google.com/github/asantucci/Python-Workshop/blob/main/Appendix_Scikit_learn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scikit-learn



In [None]:
!pip install --upgrade scikit-learn==0.23.2
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import pandas as pd

## Linear Regression


Let's start slow!



In [None]:
# Synthetic data set: N noisy samples of a quadratic in x, with x drawn in [1, 2).
# A seeded generator makes the samples (and every plot/fit built on them)
# identical on each "Restart & Run All".
rng = np.random.default_rng(0)
N = 10
x = 1.0 + rng.uniform(size=N)
y = 20.0 - 10.0 * x + 2.3 * x**2 + 0.2 * rng.normal(size=N)

In [None]:
# TODO: Plot the data. x are heights (in m), y are hair thickness (in mm) (yes this is very realistic)

In [None]:
#@title Solution
# Scatter plot of the samples, drawn through an explicit Axes object
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set_xlabel("Height [m]")
ax.set_ylabel("Hair thickness [mm]")
plt.show()

In [None]:
from sklearn import linear_model

# Learn: the estimator is fitted on a 2-D (samples x 1 feature) column matrix
X = x[:, np.newaxis]
model = linear_model.LinearRegression()
model.fit(X, y)

# Predict on 10 evenly spaced heights between 1 and 2
xx = np.linspace(1, 2, 10)
XX = xx[:, np.newaxis]
yy = model.predict(XX)

# Plot the samples together with the fitted line
fig, ax = plt.subplots()
ax.scatter(x, y, c='r', label="Data points")
ax.plot(xx, yy, "-*b", label="Predictions")
ax.set_xlabel("Height [m]")
ax.set_ylabel("Hair thickness [mm]")
ax.legend()
plt.show()

## Exercise: COVID-19 example

In [None]:
# Load the state-level table from the NYT dataset; the first column is parsed as dates
nyt_states_url = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"
df = pd.read_csv(nyt_states_url, parse_dates=[0])
df.head()

In [None]:
# Pivot to one column per state, indexed by date (missing entries become 0)
cases_states = df.pivot(index='date', columns='state', values='cases').fillna(0)
# Compute the daily change, then smooth across 7 days with a trailing rolling mean.
# (The smoothing step was missing although the axis label announces a 7-day average.)
# min_periods=1 averages over the days available so the first rows are not NaN.
cases_states = cases_states.diff().fillna(0).rolling(window=7, min_periods=1).mean()
# Extract CA data: x is the number of days elapsed since 2020-01-21
x = (cases_states.index - pd.to_datetime('2020-01-21')).total_seconds().to_numpy() / (3600.0 * 24.0)
y = cases_states['California'].to_numpy()
# Plot
plt.plot(x, y)
plt.xlabel('Days since 2020-01-21')
plt.ylabel('Cases per day (7-days average)')
plt.show()

In [None]:
from sklearn import linear_model

# Create train & test sets: rows [start, end) are used for training,
# everything from `end` onwards is held out for testing.
start, end = 50, 130
train_rows = slice(start, end)
test_rows = slice(end, None)

# Features are reshaped into a single-column 2-D array; targets stay 1-D
x_train = x[train_rows].reshape((-1, 1))
y_train = y[train_rows]
x_test = x[test_rows].reshape((-1, 1))
y_test = y[test_rows]

In [None]:
# 1. Use `linear_model.PoissonRegressor()` as a model
# 2. Train the model on x_train and y_train
# 3. Compute `y_pred`, the prediction based on `x_test`

In [None]:
#@title Solution

# Fit (construct the regressor and train it in a single expression)
model = linear_model.PoissonRegressor().fit(x_train, y_train)

# Predict on the held-out days
y_pred = model.predict(x_test)

In [None]:
# Plot the training data, the model predictions and the held-out truth.
# The y values are case counts (not deaths), so the axis is labelled accordingly;
# the title no longer ends with a dangling " (".
plt.figure()
plt.plot(x_train,y_train,'-r',label="Training set")
plt.plot(x_test,y_pred,'-b',label="Predictions")
plt.plot(x_test,y_test,'-g',label="Truth")
plt.xlabel("Days since 2020-01-21")
plt.ylabel("New cases per day")
plt.legend()
plt.title("California COVID-19 data")
plt.show()

## Exercise (Linear Regression): What if we have outliers?

In [None]:
# This creates data with an outlier.
# A seeded generator keeps the samples identical on every run, so the
# LinearRegression / HuberRegressor comparison is reproducible.
rng = np.random.default_rng(0)
N = 10
x = 1.0 + rng.uniform(size=N)
y = 20.0 - 10.0 * x + 2.3 * x**2 + 0.2 * rng.normal(size=N)
# Overwrite one sample with a value far from the others: this is the outlier
y[5] = 30

In [None]:
# TODO (1)
# - Plot the data and notice the outlier.
#   Use plt.scatter

In [None]:
#@title Solution
# Scatter the samples; the outlier stands out from the rest of the points
fig, ax = plt.subplots()
ax.scatter(x, y, c='r', label="Data points")
ax.set(xlabel="Height [m]", ylabel="Hair thickness [mm]")
ax.legend()
plt.show()

In [None]:
# TODO (2)
# - Repeat the code above using LinearRegression: fit, predict and plot the predictions (literally copy/paste the code here)
# Does it work well ?

In [None]:
#@title Solution
# Learn: ordinary least squares on the data that contains the outlier
X = x.reshape(len(x), 1)
model = linear_model.LinearRegression()
model.fit(X, y)

# Predict on 10 evenly spaced heights between 1 and 2
xx = np.linspace(1, 2, 10)
XX = xx.reshape(len(xx), 1)
yy = model.predict(XX)

# Plot the samples and the fitted line
fig, ax = plt.subplots()
ax.scatter(x, y, c='r', label="Data points")
ax.plot(xx, yy, "-*b", label="Predictions")
ax.set(xlabel="Height [m]", ylabel="Hair thickness [mm]")
ax.legend()
plt.show()

In [None]:
# TODO (3)
# - Now use `HuberRegressor` (i.e. simply use `linear_model.HuberRegressor()` instead of `linear_model.LinearRegression()`)
# How does it compare ?

In [None]:
#@title Solution

# Learn, this time with a HuberRegressor instead of LinearRegression
X = x[:, None]
model = linear_model.HuberRegressor()
model.fit(X, y)

# Predict on the same grid of 10 heights between 1 and 2
xx = np.linspace(1, 2, 10)
XX = xx[:, None]
yy = model.predict(XX)

# Plot the samples and the fitted line
axes = plt.figure().add_subplot()
axes.scatter(x, y, c='r', label="Data points")
axes.plot(xx, yy, "-*b", label="Predictions")
axes.set_xlabel("Height [m]")
axes.set_ylabel("Hair thickness [mm]")
axes.legend()
plt.show()

## Trees


In [None]:
from sklearn import tree

# Read the cancer data set straight from its URL and preview the first rows
cancer_csv_url = "https://web.stanford.edu/~lcambier/pc/cancer_data.csv"
df_all = pd.read_csv(cancer_csv_url)
df_all.head()

In [None]:
# TODO: 
# - remove the columns "Unnamed: 32" and "id". Use `df.drop(...)`.
# - column 'diagnosis' has 'B' (Benign) and 'M' (Malignant). Transform this into -1 and 1, respectively. Use `df.loc[...] = -1` for instance.
# - convert the whole dataframe to floats. Use `df.astype(...)`

In [None]:
#@title Solution

# Drop the two columns that carry no information for the classifier
df = df_all.drop(columns=["Unnamed: 32", "id"])

# Encode the diagnosis numerically: 'B' -> -1, 'M' -> 1
is_benign = df['diagnosis'] == 'B'
is_malign = df['diagnosis'] == 'M'
df.loc[is_benign, 'diagnosis'] = -1.0
df.loc[is_malign, 'diagnosis'] = 1.0

# Every column is numeric now, so the whole frame can be cast to float
df = df.astype(float)

In [None]:
# Convert to numpy arrays: every column except "diagnosis" is a feature,
# and "diagnosis" itself is the label.
X = df.loc[:,df.columns != "diagnosis"].to_numpy()
Y = df.loc[:,"diagnosis"].to_numpy()

# Train on the first N rows
N = 500
X_train = X[0:N,:]   # N x 30 matrix (samples x features)
Y_train = Y[0:N]     # N      vector (samples)
# random_state fixes the estimator's internal randomness, so the fitted tree
# (and any accuracy computed from its predictions) is the same on every run.
model = tree.DecisionTreeClassifier(random_state=0)
model.fit(X_train, Y_train)

# Test on the remaining rows
X_test = X[N:,:]    # M x 30 matrix (samples x features)
Y_test = Y[N:]      # M      vector (samples)
Y_pred = model.predict(X_test) # Predicted values

In [None]:
# How can we compute the accuracy ?
# Y_test is full of 1/-1, and same for Y_pred

In [None]:
#@title Solution
# Accuracy = fraction of test samples whose predicted label matches the true one
accuracy = np.mean(Y_test == Y_pred)
print(f"Accuracy is {100*accuracy}%")

## Unsupervised learning

![Old Faithful](https://cdn.yellowstoneparknet.com/images/content/2829_186_Old_Faithful_Geyser_Yellowstone_National_Park_lg.jpg)

In [None]:
# Data originally from https://www.stat.cmu.edu/~larry/all-of-statistics/=data/faithful.dat
faithful_csv_url = "https://stanford.edu/~lcambier/pc/old_faithful.csv"
df = pd.read_csv(faithful_csv_url)
df.head()

In [None]:
# Feature matrix: one row per observation, columns are (waiting, eruptions)
X = df[['waiting', 'eruptions']].to_numpy()

In [None]:
# 1. What is the shape of X ?

In [None]:
#@title Solution
# (number of rows, number of columns) of the feature matrix
print(np.shape(X))

In [None]:
# 2. Plot the X's, i.e., for every row in X, plot a point in 2D space

In [None]:
#@title Solution
# One point per row of X: first column on the x-axis, second on the y-axis
waiting, eruptions = X[:, 0], X[:, 1]
fig, ax = plt.subplots()
ax.scatter(waiting, eruptions)
ax.set_xlabel('Waiting (mins)')
ax.set_ylabel('Eruption time (mins)')
ax.set_title('Old Faithful Geyser')
plt.show()

In [None]:
# Standardize each column of X: centered around 0 with a std of 1
from sklearn import preprocessing
X_scaled = preprocessing.scale(X)

# Same scatter plot as before, now in scaled units
fig, ax = plt.subplots()
ax.scatter(X_scaled[:, 0], X_scaled[:, 1])
ax.set(xlabel='Waiting', ylabel='Eruption time', title='Old Faithful Geyser (scaled)')
plt.show()

In [None]:
# 3. Use the Scikit-learn function KMeans from sklearn.cluster to perform a Kmeans on X_scaled
# See https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
# Yes, it looks a little scary; that's normal :-)

In [None]:
#@title Solution
import sklearn.cluster
# random_state fixes the random centroid initialisation, so the cluster labels
# (0 / 1) do not swap from one run to the next. n_init=10 is spelled out so the
# number of restarts does not depend on the library's default.
model = sklearn.cluster.KMeans(n_clusters=2, n_init=10, random_state=0)
# fit() returns the fitted estimator, so `kmeans` and `model` are the same object
kmeans = model.fit(X_scaled)

In [None]:
# 4. Plot the centers over the data
# You can find them in kmeans.cluster_centers_

In [None]:
#@title Solution
# Cluster centers found by KMeans (in scaled units), one row per cluster
Xcenters = kmeans.cluster_centers_

# Draw the scaled data first, then the centers on top as large black stars
fig, ax = plt.subplots()
ax.scatter(X_scaled[:, 0], X_scaled[:, 1])
ax.scatter(Xcenters[:, 0], Xcenters[:, 1], s=400, marker='*', c='black')
ax.set_title('Old Faithful Geyser (scaled)')
plt.show()

In [None]:
# This generates 1000 random points over [40,100] x [1.5,5.5] (in original units).
# A seeded generator keeps the points identical on every run.
rng = np.random.default_rng(0)
Xpred = rng.uniform(low=(40,1.5),high=(100,5.5),size=(1000,2))

# 5. Compute their cluster, make a nice plot:
# - Scale Xpred down (using preprocessing.scale)
# - Predict their cluster (using kmeans.predict)
# - Plot everything using red for cluster 0 and blue for cluster 1
# - Overlay the original data on top of it in black

In [None]:
#@title Solution
colors_map = {0:'red',1:'blue'}
# Scale the new points with the mean/std of the ORIGINAL data X, i.e. the same
# transformation that produced X_scaled (the data KMeans was fitted on).
# Calling preprocessing.scale(Xpred) would instead standardize Xpred with its
# own mean/std, putting the new points in a different space than the centers.
scaler = preprocessing.StandardScaler().fit(X)
Xpred_scaled = scaler.transform(Xpred)
ypred = kmeans.predict(Xpred_scaled)
# Plot in original units: predicted points colored by cluster, data in black on top
plt.scatter(Xpred[:,0], Xpred[:,1], c=[colors_map[i] for i in ypred])
plt.scatter(X[:,0], X[:,1], c='black')
plt.xlabel('Waiting')
plt.ylabel('Eruption time')
plt.title('Old Faithful Geyser')
plt.show()