# Linear Regression

In [39]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [40]:
df = pd.read_csv('../data/weight-height.csv')

In [41]:
df.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [42]:
df.plot(kind='scatter',
        x='Height',
        y='Weight',
        title='Weight and Height in adults')

<Axes: title={'center': 'Weight and Height in adults'}, xlabel='Height', ylabel='Weight'>

In [43]:
df.plot(kind='scatter',
        x='Height',
        y='Weight',
        title='Weight and Height in adults')

# Here we're plotting the red line 'by hand' with fixed values
# We'll try to learn this line with an algorithm below
plt.plot([55, 78], [75, 250], color='red', linewidth=3)

[<matplotlib.lines.Line2D at 0x314f00dd0>]

In [44]:
def line(x, w=0, b=0):
    return x * w + b

In [45]:
x = np.linspace(55, 80, 100)

In [46]:
x

array([55.        , 55.25252525, 55.50505051, 55.75757576, 56.01010101,
       56.26262626, 56.51515152, 56.76767677, 57.02020202, 57.27272727,
       57.52525253, 57.77777778, 58.03030303, 58.28282828, 58.53535354,
       58.78787879, 59.04040404, 59.29292929, 59.54545455, 59.7979798 ,
       60.05050505, 60.3030303 , 60.55555556, 60.80808081, 61.06060606,
       61.31313131, 61.56565657, 61.81818182, 62.07070707, 62.32323232,
       62.57575758, 62.82828283, 63.08080808, 63.33333333, 63.58585859,
       63.83838384, 64.09090909, 64.34343434, 64.5959596 , 64.84848485,
       65.1010101 , 65.35353535, 65.60606061, 65.85858586, 66.11111111,
       66.36363636, 66.61616162, 66.86868687, 67.12121212, 67.37373737,
       67.62626263, 67.87878788, 68.13131313, 68.38383838, 68.63636364,
       68.88888889, 69.14141414, 69.39393939, 69.64646465, 69.8989899 ,
       70.15151515, 70.4040404 , 70.65656566, 70.90909091, 71.16161616,
       71.41414141, 71.66666667, 71.91919192, 72.17171717, 72.42

In [47]:
yhat = line(x, w=0, b=0)

In [48]:
yhat

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [49]:
df.plot(kind='scatter',
        x='Height',
        y='Weight',
        title='Weight and Height in adults')
plt.plot(x, yhat, color='red', linewidth=3)

[<matplotlib.lines.Line2D at 0x314f072d0>]

### Cost Function

In [50]:
def mean_squared_error(y_true, y_pred):
    s = (y_true - y_pred)**2
    return s.mean()

In [51]:
X = df[['Height']].values
y_true = df['Weight'].values

In [52]:
y_true

array([241.89356318, 162.31047252, 212.74085556, ..., 128.47531878,
       163.85246135, 113.64910268])

In [53]:
y_pred = line(X)

In [54]:
y_pred

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [55]:
mean_squared_error(y_true, y_pred.ravel())

27093.83757456157

### you do it!

Try changing the values of the parameters b and w in the line above and plot it again to see how the plot and the cost  change.

In [56]:
plt.figure(figsize=(10, 5))

# we are going to draw 2 plots in the same figure
# first plot, data and a few lines
ax1 = plt.subplot(121)
df.plot(kind='scatter',
        x='Height',
        y='Weight',
        title='Weight and Height in adults', ax=ax1)

# let's explore the cost function for a few values of b between -100 and +150
bbs = np.array([-100, -50, 0, 50, 100, 150])
mses = []  # we will append the values of the cost here, for each line
for b in bbs:
    y_pred = line(X, w=2, b=b)
    mse = mean_squared_error(y_true, y_pred)
    mses.append(mse)
    plt.plot(X, y_pred)

# second plot: Cost function
ax2 = plt.subplot(122)
plt.plot(bbs, mses, 'o-')
plt.title('Cost as a function of b')
plt.xlabel('b');

## Linear Regression with Keras

In [57]:
import tensorflow as tf 
print(tf.__version__)

2.17.0


In [58]:
from keras.models import Sequential

In [59]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam, SGD

In [60]:
model = Sequential()

In [78]:
model.add(Dense(1, input_shape=(1,2)))

In [79]:
model.summary()

In [80]:
model.compile(Adam(learning_rate=0.8), 'mean_squared_error')

In [81]:
model.fit(X, y_true, epochs=40)

Epoch 1/40
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 375us/step - loss: 12992.3262
Epoch 2/40
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 313us/step - loss: 1273.5742
Epoch 3/40
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 306us/step - loss: 1304.3790
Epoch 4/40
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 314us/step - loss: 1291.8102
Epoch 5/40
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 306us/step - loss: 1243.9067
Epoch 6/40
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 300us/step - loss: 1047.0220
Epoch 7/40
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 442us/step - loss: 1041.4600
Epoch 8/40
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 310us/step - loss: 1058.0093
Epoch 9/40
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 316us/step - loss: 1042.3549
Epoch 10/40
[1m313/313[0m [32m━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x319bc5cd0>

In [82]:
y_pred = model.predict(X)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 296us/step


In [83]:
print(y_pred)

[[[154.482  ]]

 [[152.60408]]

 [[154.57954]]

 ...

 [[150.78221]]

 [[152.69763]]

 [[150.06897]]]


In [84]:
df.plot(kind='scatter',
        x='Height',
        y='Weight',
        title='Weight and Height in adults')
plt.plot(X, y_pred, color='red')

ValueError: x and y can be no greater than 2D, but have shapes (10000, 1) and (10000, 1, 1)

In [33]:
W, B = model.get_weights()

In [34]:
w[0].shape

NameError: name 'w' is not defined

In [None]:
B

## Evaluating Model Performance

In [8]:
from sklearn.metrics import r2_score

In [9]:
print("The R2 score is {:0.3f}".format(r2_score(y_true, y_pred)))

NameError: name 'y_true' is not defined

### Train Test Split

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y_true,
                                                    test_size=0.2)

NameError: name 'X' is not defined

In [12]:
len(X_train)

NameError: name 'X_train' is not defined

In [None]:
len(X_test)

In [None]:
W[0, 0] = 0.0
B[0] = 0.0
model.set_weights((W, B))

In [3]:
model.fit(X_train, y_train, epochs=50, verbose=0)

NameError: name 'model' is not defined

In [13]:
y_train_pred = model.predict(X_train).ravel()
y_test_pred = model.predict(X_test).ravel()

NameError: name 'model' is not defined

In [14]:
from sklearn.metrics import mean_squared_error as mse

In [15]:
print("The Mean Squared Error on the Train set is:\t{:0.1f}".format(mse(y_train, y_train_pred)))
print("The Mean Squared Error on the Test set is:\t{:0.1f}".format(mse(y_test, y_test_pred)))

NameError: name 'y_train' is not defined

In [16]:
print("The R2 score on the Train set is:\t{:0.3f}".format(r2_score(y_train, y_train_pred)))
print("The R2 score on the Test set is:\t{:0.3f}".format(r2_score(y_test, y_test_pred)))

NameError: name 'y_train' is not defined

# Classification

In [25]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam, SGD

ModuleNotFoundError: No module named 'tensorflow.keras'

In [26]:
df = pd.read_csv('../data/user_visit_duration.csv')

In [27]:
df.head()

Unnamed: 0,Time (min),Buy
0,2.0,0
1,0.683333,0
2,3.216667,1
3,0.9,0
4,1.533333,1


In [28]:
df.plot(kind='scatter', x='Time (min)', y='Buy');

In [29]:
model = Sequential()
model.add(Dense(1, input_shape=(1,), activation='sigmoid'))

NameError: name 'Sequential' is not defined

In [30]:
model.compile(SGD(learning_rate=0.5), 'binary_crossentropy', metrics=['accuracy'])

NameError: name 'model' is not defined

In [None]:
model.summary()

In [None]:
X = df[['Time (min)']].values
y = df['Buy'].values

model.fit(X, y, epochs=25)

In [None]:
ax = df.plot(kind='scatter', x='Time (min)', y='Buy',
             title='Purchase behavior VS time spent on site')

temp = np.linspace(0, 4)
ax.plot(temp, model.predict(temp), color='orange')
plt.legend(['model', 'data'])

In [None]:
temp_class = model.predict(temp) > 0.5

In [None]:
ax = df.plot(kind='scatter', x='Time (min)', y='Buy',
             title='Purchase behavior VS time spent on site')

temp = np.linspace(0, 4)
ax.plot(temp, temp_class, color='orange')
plt.legend(['model', 'data'])

In [None]:
y_pred = model.predict(X)
y_class_pred = y_pred > 0.5

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
print("The accuracy score is {:0.3f}".format(accuracy_score(y, y_class_pred)))

### Train/Test split


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
params = model.get_weights()
params = [np.zeros(w.shape) for w in params]
model.set_weights(params)

In [None]:
print("The accuracy score is {:0.3f}".format(accuracy_score(y, model.predict(X) > 0.5)))

In [None]:
model.fit(X_train, y_train, epochs=25, verbose=0)

In [None]:
print("The train accuracy score is {:0.3f}".format(accuracy_score(y_train, model.predict(X_train) > 0.5)))
print("The test accuracy score is {:0.3f}".format(accuracy_score(y_test, model.predict(X_test) > 0.5)))

## Cross Validation

In [1]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

ModuleNotFoundError: No module named 'tensorflow.keras'

In [None]:
def build_logistic_regression_model():
    model = Sequential()
    model.add(Dense(1, input_shape=(1,), activation='sigmoid'))
    model.compile(SGD(learning_rate=0.5),
                  'binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [None]:
model = KerasClassifier(build_fn=build_logistic_regression_model,
                        epochs=25,
                        verbose=0)

In [None]:
from sklearn.model_selection import cross_val_score, KFold

In [None]:
cv = KFold(3, shuffle=True)

In [None]:
scores = cross_val_score(model, X, y, cv=cv)

In [None]:
scores

In [None]:
print("The cross validation accuracy is {:0.4f} ± {:0.4f}".format(scores.mean(), scores.std()))

## Confusion Matrix

In [2]:
from sklearn.metrics import confusion_matrix

In [3]:
confusion_matrix(y, y_class_pred)

NameError: name 'y' is not defined

In [None]:
def pretty_confusion_matrix(y_true, y_pred, labels=["False", "True"]):
    cm = confusion_matrix(y_true, y_pred)
    pred_labels = ['Predicted '+ l for l in labels]
    df = pd.DataFrame(cm, index=labels, columns=pred_labels)
    return df

In [None]:
pretty_confusion_matrix(y, y_class_pred, ['Not Buy', 'Buy'])

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
print("Precision:\t{:0.3f}".format(precision_score(y, y_class_pred)))
print("Recall:  \t{:0.3f}".format(recall_score(y, y_class_pred)))
print("F1 Score:\t{:0.3f}".format(f1_score(y, y_class_pred)))

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y, y_class_pred))

## Feature Preprocessing

### Categorical Features

In [85]:
df = pd.read_csv('../data/weight-height.csv')
df.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [86]:
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [87]:
pd.get_dummies(df['Gender'], prefix='Gender').head()

Unnamed: 0,Gender_Female,Gender_Male
0,False,True
1,False,True
2,False,True
3,False,True
4,False,True


## Feature Transformations

#### 1) Rescale with fixed factor

In [88]:
df['Height (feet)'] = df['Height']/12.0
df['Weight (100 lbs)'] = df['Weight']/100.0

In [89]:
df.describe().round(2)

Unnamed: 0,Height,Weight,Height (feet),Weight (100 lbs)
count,10000.0,10000.0,10000.0,10000.0
mean,66.37,161.44,5.53,1.61
std,3.85,32.11,0.32,0.32
min,54.26,64.7,4.52,0.65
25%,63.51,135.82,5.29,1.36
50%,66.32,161.21,5.53,1.61
75%,69.17,187.17,5.76,1.87
max,79.0,269.99,6.58,2.7


#### MinMax normalization

In [90]:
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
df['Weight_mms'] = mms.fit_transform(df[['Weight']])
df['Height_mms'] = mms.fit_transform(df[['Height']])
df.describe().round(2)

Unnamed: 0,Height,Weight,Height (feet),Weight (100 lbs),Weight_mms,Height_mms
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,66.37,161.44,5.53,1.61,0.47,0.49
std,3.85,32.11,0.32,0.32,0.16,0.16
min,54.26,64.7,4.52,0.65,0.0,0.0
25%,63.51,135.82,5.29,1.36,0.35,0.37
50%,66.32,161.21,5.53,1.61,0.47,0.49
75%,69.17,187.17,5.76,1.87,0.6,0.6
max,79.0,269.99,6.58,2.7,1.0,1.0


#### 3) Standard normalization

In [91]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
df['Weight_ss'] = ss.fit_transform(df[['Weight']])
df['Height_ss'] = ss.fit_transform(df[['Height']])
df.describe().round(2)

Unnamed: 0,Height,Weight,Height (feet),Weight (100 lbs),Weight_mms,Height_mms,Weight_ss,Height_ss
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,66.37,161.44,5.53,1.61,0.47,0.49,0.0,0.0
std,3.85,32.11,0.32,0.32,0.16,0.16,1.0,1.0
min,54.26,64.7,4.52,0.65,0.0,0.0,-3.01,-3.15
25%,63.51,135.82,5.29,1.36,0.35,0.37,-0.8,-0.74
50%,66.32,161.21,5.53,1.61,0.47,0.49,-0.01,-0.01
75%,69.17,187.17,5.76,1.87,0.6,0.6,0.8,0.73
max,79.0,269.99,6.58,2.7,1.0,1.0,3.38,3.28


In [94]:
plt.figure(figsize=(15, 5))

for i, feature in enumerate(['Height', 'Height (feet)', 'Height_mms', 'Height_ss']):
    plt.subplot(1, 4, i+1)
    df[feature].plot(kind='hist', title=feature)
    plt.xlabel(feature);

# Machine Learning Exercises

## Exercise 1

You've just been hired at a real estate investment firm and they would like you to build a model for pricing houses. You are given a dataset that contains data for house prices and a few features like number of bedrooms, size in square feet and age of the house. Let's see if you can build a model that is able to predict the price. In this exercise we extend what we have learned about linear regression to a dataset with more than one feature. Here are the steps to complete it:

1. Load the dataset ../data/housing-data.csv
- plot the histograms for each feature
- create 2 variables called X and y: X shall be a matrix with 3 columns (sqft,bdrms,age) and y shall be a vector with 1 column (price)
- create a linear regression model in Keras with the appropriate number of inputs and output
- split the data into train and test with a 20% test size
- train the model on the training set and check its accuracy on training and test set
- how's your model doing? Is the loss growing smaller?
- try to improve your model with these experiments:
    - normalize the input features with one of the rescaling techniques mentioned above
    - use a different value for the learning rate of your model
    - use a different optimizer
- once you're satisfied with training, check the R2score on the test set

## Exercise 2

Your boss was extremely happy with your work on the housing price prediction model and decided to entrust you with a more challenging task. They've seen a lot of people leave the company recently and they would like to understand why that's happening. They have collected historical data on employees and they would like you to build a model that is able to predict which employee will leave next. They would like a model that is better than random guessing. They also prefer false negatives than false positives, in this first phase. Fields in the dataset include:

- Employee satisfaction level
- Last evaluation
- Number of projects
- Average monthly hours
- Time spent at the company
- Whether they have had a work accident
- Whether they have had a promotion in the last 5 years
- Department
- Salary
- Whether the employee has left

Your goal is to predict the binary outcome variable `left` using the rest of the data. Since the outcome is binary, this is a classification problem. Here are some things you may want to try out:

1. load the dataset at ../data/HR_comma_sep.csv, inspect it with `.head()`, `.info()` and `.describe()`.
- Establish a benchmark: what would be your accuracy score if you predicted everyone stay?
- Check if any feature needs rescaling. You may plot a histogram of the feature to decide which rescaling method is more appropriate.
- convert the categorical features into binary dummy columns. You will then have to combine them with the numerical features using `pd.concat`.
- do the usual train/test split with a 20% test size
- play around with learning rate and optimizer
- check the confusion matrix, precision and recall
- check if you still get the same results if you use a 5-Fold cross validation on all the data
- Is the model good enough for your boss?

As you will see in this exercise, the a logistic regression model is not good enough to help your boss. In the next chapter we will learn how to go beyond linear models.

This dataset comes from https://www.kaggle.com/ludobenistant/hr-analytics/ and is released under [CC BY-SA 4.0 License](https://creativecommons.org/licenses/by-sa/4.0/).

In [1]:
import streamlit as st
from streamlit_jupyter import streamlit_patcher
streamlit_patcher.jupyter()


ModuleNotFoundError: No module named 'streamlit_jupyter'

In [2]:
import pandas as pd
import numpy as np

df = pd.DataFrame(
    np.random.randn(50, 3),
    columns=['a', 'b', 'c']
)

st.line_chart(df)


2024-12-23 11:07:56.035 
  command:

    streamlit run /opt/miniconda3/envs/ztdltest/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()