In [21]:
# Chapter 14: Data Science From Scratch

from typing import Tuple
from scratch.linear_algebra import Vector
from scratch.statistics import correlation, standard_deviation, mean
from scratch.statistics import num_friends_good, daily_minutes_good
from scratch.statistics import de_mean

In [10]:
def predict(alpha: float, beta: float, x_i: float) -> float:
    '''
    Evaluate the line with intercept alpha and slope beta at the point x_i
    '''
    prediction = beta * x_i + alpha
    return prediction

In [11]:
def error(alpha: float, beta: float, x_i: float, y_i: float) -> float:
    '''
    Signed residual: the model's prediction for x_i (beta * x_i + alpha)
    minus the observed value y_i
    '''
    predicted = predict(alpha, beta, x_i)
    return predicted - y_i

In [12]:
def sum_of_sqerrors(alpha: float, beta: float, x:Vector, y:Vector) -> float:
    '''
    Sum of the squared prediction errors over the paired points of x and y
    '''
    squared_residuals = (error(alpha, beta, x_i, y_i) ** 2
                         for x_i, y_i in zip(x, y))
    return sum(squared_residuals)

In [15]:
def least_squares_fit(x: Vector, y: Vector) -> Tuple[float,float]:
    '''
    Compute the least-squares intercept and slope for the vectors x and y.

    Returns the pair (alpha, beta), where beta is
    correlation(x, y) * standard_deviation(y) / standard_deviation(x)
    and alpha is mean(y) - beta * mean(x).
    '''
    r = correlation(x, y)
    spread_y = standard_deviation(y)
    spread_x = standard_deviation(x)
    slope = r * spread_y / spread_x

    y_bar = mean(y)
    x_bar = mean(x)
    intercept = y_bar - slope * x_bar
    return intercept, slope

In [16]:
# Synthetic data lying exactly on the line y = 3x - 5
x = list(range(-11, 110, 10))
y = [3 * x_i - 5 for x_i in x]

# The fit should recover alpha = -5 and beta = 3. The coefficients come out of
# floating-point arithmetic, so compare within a small tolerance instead of
# demanding exact equality (a result such as 3.0000000000000004 is still right).
check_alpha, check_beta = least_squares_fit(x, y)
assert abs(check_alpha - (-5)) < 1e-9, check_alpha
assert abs(check_beta - 3) < 1e-9, check_beta

In [20]:
# Closed-form least-squares fit of daily minutes against number of friends
alpha, beta = least_squares_fit(num_friends_good, daily_minutes_good)

# Sanity-check the fitted intercept and slope against their expected ranges
assert alpha > 22.9 and alpha < 23.0
assert beta > 0.9 and beta < 0.905

# Display the fitted coefficients as the cell output
alpha, beta

(22.94755241346903, 0.903865945605865)

In [22]:
def total_sum_of_squares(y: Vector) -> float:
    '''
    Sum of the squares of the values returned by de_mean(y), i.e. the total
    squared variation of the y_i's from their mean
    '''
    deviations = de_mean(y)
    return sum(deviation ** 2 for deviation in deviations)

In [24]:
def r_squared(alpha: float, beta: float, x: Vector, y: Vector) -> float:
    '''
    Fraction of the variation in y that the model captures: 1 minus the
    ratio of the sum of squared errors to the total sum of squares of y
    '''
    unexplained = sum_of_sqerrors(alpha, beta, x, y)
    total = total_sum_of_squares(y)
    return 1.0 - (unexplained / total)

In [25]:
# R-squared of the fitted line on the friends / daily-minutes data
rsq = r_squared(alpha, beta, num_friends_good, daily_minutes_good)
assert rsq > 0.328 and rsq < 0.330

In [26]:
import random
import tqdm
from scratch.gradient_descent import gradient_step

In [29]:
num_epochs = 10000
random.seed(0)  # fixed seed so the random starting point is reproducible

guess = [random.random(), random.random()]  # random starting [alpha, beta]
learning_rate = 0.00001

with tqdm.trange(num_epochs) as t:
    for _ in t:
        alpha, beta = guess

        # Residuals for the current guess. They are computed once per epoch
        # and shared by both gradients and the loss, instead of re-evaluating
        # error() over the whole dataset three times.
        errors = [error(alpha, beta, x_i, y_i)
                  for x_i, y_i in zip(num_friends_good,
                                      daily_minutes_good)]

        # Partial derivative of the loss with respect to alpha
        grad_a = sum(2 * err for err in errors)

        # Partial derivative of the loss with respect to beta
        grad_b = sum(2 * err * x_i
                     for err, x_i in zip(errors, num_friends_good))

        # Sum of squared errors, shown in the tqdm description
        loss = sum(err ** 2 for err in errors)
        t.set_description(f"loss: {loss:.3f}")

        # Finally, update the guess. The step size is negated; presumably
        # gradient_step moves along step_size * gradient, so this steps
        # against the gradient (confirm in scratch.gradient_descent).
        guess = gradient_step(guess, [grad_a, grad_b], -learning_rate)


loss: 13196.619: 100%|██████████████████████████████████████████████████████████| 10000/10000 [00:11<00:00, 877.98it/s]


In [30]:
# Unpack the final gradient-descent estimate
alpha, beta = guess

# Gradient descent should land on essentially the same coefficients as the
# closed-form least-squares fit, so hold it to the same bounds used for that
# fit (22.9 < alpha < 23.0) rather than a looser lower bound of 22.0.
assert 22.9 < alpha < 23.0
assert 0.9 < beta < 0.905

alpha, beta

(22.947552155340915, 0.9038659662765034)