In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

### The Validation Set Approach

In [2]:
auto_path="../../Data/Auto.csv"
auto=pd.read_csv(auto_path)
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [3]:
auto.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,23.445918,5.471939,194.41199,104.469388,2977.584184,15.541327,75.979592,1.576531
std,7.805007,1.705783,104.644004,38.49116,849.40256,2.758864,3.683737,0.805518
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.0,4.0,105.0,75.0,2225.25,13.775,73.0,1.0
50%,22.75,4.0,151.0,93.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,275.75,126.0,3614.75,17.025,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [4]:
auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    int64  
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   year          392 non-null    int64  
 7   origin        392 non-null    int64  
 8   name          392 non-null    object 
dtypes: float64(3), int64(5), object(1)
memory usage: 27.7+ KB


In [5]:
from sklearn.model_selection import train_test_split

auto["horsepower2"] = np.power(auto["horsepower"],2)
auto["horsepower3"] = np.power(auto["horsepower"],3)
auto_train, auto_validation = train_test_split(auto, test_size=196, random_state=42)

X_train = auto_train["horsepower"]
X_validation = auto_validation["horsepower"]

Y_train = auto_train["mpg"]
Y_validation = auto_validation["mpg"]

In [6]:
from sklearn.linear_model import LinearRegression

lr=LinearRegression()
lr.fit(X_train.to_numpy().reshape(-1,1),Y_train)

LinearRegression()

In [7]:
from sklearn.metrics import mean_squared_error

print("Training MSE:\t" + str(mean_squared_error(Y_train,lr.predict(X_train.to_numpy().reshape(-1,1)))))
print("Validation MSE:\t" + str(mean_squared_error(Y_validation,lr.predict(X_validation.to_numpy().reshape(-1,1)))))
print("Full Data Set MSE: " + str(mean_squared_error(auto["mpg"],lr.predict(auto["horsepower"].to_numpy().reshape(-1,1)))))

Training MSE:	23.161729199293692
Validation MSE:	25.5738781896844
Full Data Set MSE: 24.36780369448904


In [8]:
X_train = auto_train[["horsepower","horsepower2"]]
X_validation = auto_validation[["horsepower","horsepower2"]]

lr=LinearRegression()
lr.fit(X_train,Y_train)

LinearRegression()

In [9]:
print("Training MSE:\t" + str(mean_squared_error(Y_train,lr.predict(X_train))))
print("Validation MSE:\t" + str(mean_squared_error(Y_validation,lr.predict(X_validation))))
print("Full Data Set MSE: " + str(mean_squared_error(auto["mpg"],lr.predict(auto[["horsepower","horsepower2"]]))))

Training MSE:	16.839716144803923
Validation MSE:	22.21802005003286
Full Data Set MSE: 19.528868097418393


In [10]:
X_train = auto_train[["horsepower","horsepower2","horsepower3"]]
X_validation = auto_validation[["horsepower","horsepower2","horsepower3"]]

lr=LinearRegression()
lr.fit(X_train,Y_train)

LinearRegression()

In [11]:
print("Training MSE:\t" + str(mean_squared_error(Y_train,lr.predict(X_train))))
print("Validation MSE:\t" + str(mean_squared_error(Y_validation,lr.predict(X_validation))))
print("Full Data Set MSE: " + str(mean_squared_error(auto["mpg"],lr.predict(auto[["horsepower","horsepower2","horsepower3"]]))))

Training MSE:	16.585602858565903
Validation MSE:	22.667675435534484
Full Data Set MSE: 19.6266391470502


### LOOCV

In [12]:
from sklearn.model_selection import LeaveOneOut,cross_val_score

loocv=LeaveOneOut()

X = auto["horsepower"]
Y = auto["mpg"]

scores = cross_val_score(lr,X.to_numpy().reshape(-1,1),Y,cv=loocv,scoring='neg_mean_squared_error')
print(scores.mean()*(-1))

24.231513517929226


In [13]:
X = auto[["horsepower","horsepower2"]]
Y = auto["mpg"]

scores = cross_val_score(lr,X,Y,cv=loocv,scoring='neg_mean_squared_error')
print(scores.mean()*(-1))

19.248213124489677


In [14]:
X = auto[["horsepower","horsepower2","horsepower3"]]
Y = auto["mpg"]

scores = cross_val_score(lr,X,Y,cv=loocv,scoring='neg_mean_squared_error')
print(scores.mean()*(-1))

19.334984064029538


### K-Fold Cross-Validation

In [15]:
X = auto["horsepower"]
Y = auto["mpg"]

scores = cross_val_score(lr,X.to_numpy().reshape(-1,1),Y,cv=10,scoring='neg_mean_squared_error')
print(scores.mean()*(-1))

27.43993365233988


In [16]:
X = auto[["horsepower","horsepower2"]]
Y = auto["mpg"]

scores = cross_val_score(lr,X,Y,cv=10,scoring='neg_mean_squared_error')
print(scores.mean()*(-1))

21.235840055802235


In [17]:
X = auto[["horsepower","horsepower2","horsepower3"]]
Y = auto["mpg"]

scores = cross_val_score(lr,X,Y,cv=10,scoring='neg_mean_squared_error')
print(scores.mean()*(-1))

21.336606183228163


### The Bootstrap

In [18]:
portfolio_path="../../Data/Portfolio.csv"
portfolio=pd.read_csv(portfolio_path)
portfolio.head()

Unnamed: 0,X,Y
0,-0.895251,-0.234924
1,-1.562454,-0.885176
2,-0.41709,0.271888
3,1.044356,-0.734198
4,-0.315568,0.841983


In [19]:
portfolio.describe()

Unnamed: 0,X,Y
count,100.0,100.0
mean,-0.077132,-0.096945
std,1.062376,1.143782
min,-2.432764,-2.725281
25%,-0.888474,-0.885722
50%,-0.268889,-0.228708
75%,0.558093,0.806708
max,2.460336,2.565985


In [20]:
portfolio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       100 non-null    float64
 1   Y       100 non-null    float64
dtypes: float64(2)
memory usage: 1.7 KB


In [21]:
def findAlpha(data):
    X_var=np.power(np.std(data["X"]),2)
    Y_var=np.power(np.std(data["Y"]),2)
    cov=data.cov()["X"]["Y"]
    alpha=(Y_var-cov)/(X_var+Y_var-2*cov)
    return alpha

In [22]:
print(findAlpha(portfolio))

0.5766511516108044


In [23]:
from sklearn.utils import resample

alphas = np.zeros(shape=(1000,1))
for count in range(1,1000):
    portfolio_resampled = resample(portfolio,n_samples=portfolio.shape[0])
    alphas[count-1] = findAlpha(portfolio_resampled)
    
print("Mean of Alpha: " + str(alphas.mean()))
print("Standard Deviation of Alpha: " + str(alphas.std()))

Mean of Alpha: 0.5801161372611958
Standard Deviation of Alpha: 0.09406852886183222


In [24]:
def findParameters(X,Y):
    lr=LinearRegression()
    lr.fit(X,Y)
    return [lr.intercept_] + list(lr.coef_)

In [25]:
X=auto["horsepower"]
Y=auto["mpg"]

print(findParameters(X.to_numpy().reshape(-1,1),Y))

[39.93586102117047, -0.15784473335365365]


In [26]:
parameters = np.zeros(shape=(1000,2))
for count in range(1,1000):
    auto_resampled = resample(auto,n_samples=auto.shape[0])
    X=auto_resampled["horsepower"]
    Y=auto_resampled["mpg"]
    parameters[count-1] = findParameters(X.to_numpy().reshape(-1,1),Y)
    
print("Mean of Intercept: " + str(parameters[:,0].mean()))
print("Standard Deviation of Intercept: " + str(parameters[:,0].std()))
print("Mean of Coefficient: " + str(parameters[:,1].mean()))
print("Standard Deviation of Coefficient: " + str(parameters[:,1].std()))

Mean of Intercept: 39.93092064244086
Standard Deviation of Intercept: 1.518590959600501
Mean of Coefficient: -0.1581588727925583
Standard Deviation of Coefficient: 0.008815965854272098
