In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the housing dataset from its raw GitHub URL into a DataFrame.
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'
df = pd.read_csv(DATA_URL)

In [3]:
# Count how many rows fall into each ocean_proximity category.
df.ocean_proximity.value_counts()


<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [4]:
# Keep only the rows whose ocean_proximity is '<1H OCEAN' or 'INLAND'.
kept_proximities = ['<1H OCEAN', 'INLAND']
df = df[df.ocean_proximity.isin(kept_proximities)]


In [5]:
# Columns kept for modelling: the numeric features plus the target
# (median_house_value), which is split off from the features later.
columns = [
    "longitude", "latitude",
    "housing_median_age",
    "total_rooms", "total_bedrooms",
    "population", "households",
    "median_income",
    "median_house_value",
]

df = df[columns]

In [6]:
# Number of missing values in each column.
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

In [7]:
# Median of the population column.
df.population.median()

1195.0

Here I take the length of the entire dataframe and name it `n`.

Then I compute the sizes of the validation, test and training datasets.
To make sure the sizes are whole numbers, I convert them with `int()`, which truncates (rounds down) rather than rounding to the nearest integer.
The size of the train dataset is simply `n` minus the validation and test sizes.

If I computed the train size as `int(0.6 * n)` instead, the three truncated sizes could add up to less than `n`, and some rows would be left out of every split.



In [8]:
# Split sizes: 20% validation, 20% test, and whatever remains for training.
# int() truncates, so the training size is taken as the remainder to make
# the three sizes add up to exactly n.
n = len(df)

n_val = n_test = int(n * 0.2)
n_train = n - n_val - n_test

`np.arange(n)` creates an array of the integers from 0 to `n - 1`, one for each row of the dataframe. `np.random.shuffle(idx)` then shuffles that array in place. Setting the seed makes the shuffle reproducible, so the same split can be recreated in the future. The instructor requested a seed of 42 so that we all get the same results in the end.

In [9]:
# Fix the seed first so the shuffle is reproducible, then shuffle the
# row positions 0..n-1 in place.
np.random.seed(42)
idx = np.arange(n)
np.random.shuffle(idx)


df.iloc[idx]: This selects rows from df based on the order of indices in idx. Essentially, it's reordering or shuffling the rows of df based on the shuffled indices in idx.
The result is a new DataFrame df_shuffled where the rows have been shuffled or reordered based on the shuffled indices in idx.

For example, if df has 5 rows and idx is [3, 1, 4, 2, 0], then the first row of df_shuffled will be the fourth row of df, the second row of df_shuffled will be the second row of df, and so on.


`n_train`, `n_val` and `n_test` are only numbers. To apply the 60%/20%/20% split to the data, we use `iloc` to slice the shuffled dataframe `df_shuffled` at those integer positions.

Each slice keeps the shuffled index labels of the original dataframe, so we reset the index and drop the old one.

In [10]:
# Reorder the rows according to the shuffled positions.
df_shuffled = df.iloc[idx]

# Slice into train / validation / test and renumber each part from 0.
val_end = n_train + n_val
df_train = df_shuffled.iloc[:n_train].copy().reset_index(drop=True)
df_val = df_shuffled.iloc[n_train:val_end].copy().reset_index(drop=True)
df_test = df_shuffled.iloc[val_end:].copy().reset_index(drop=True)

Now we are creating numpy arrays (essentially vectors) holding the y values, which are the values we want to predict. These are the ACTUAL values, not the predictions; the model's predictions will be compared against them later. We are also applying a logarithm to these values and then deleting median_house_value from the feature dataframes, so that the target is not used as an input to the regression model.


`log1p` can reduce the impact of outliers. This makes sense because the distribution of prices is often long-tailed, and a long tail means a few very large values. `log1p(x)` computes `log(1 + x)`: the `1p` ("one plus") means that a value of 0 maps to 0 instead of the undefined `log(0)`.

In [11]:
# Raw target values for each split, as numpy arrays.
y_train_orig = df_train['median_house_value'].values
y_val_orig = df_val['median_house_value'].values
y_test_orig = df_test['median_house_value'].values

# Log-transformed targets: log1p(x) computes log(1 + x).
y_train, y_val, y_test = (
    np.log1p(target) for target in (y_train_orig, y_val_orig, y_test_orig)
)

# Remove the target from the feature frames so it is not used as an input.
for frame in (df_train, df_val, df_test):
    del frame['median_house_value']

Now we are creating the Linear Regression Model.
First we create a vector of ones and add it to the feature matrix as an extra column; the weight learned for this column is the bias term. The bias term is needed so that the model can shift its predictions by a constant amount. My understanding is that without the column of ones, the regression line is forced to go through the origin (0,0) of the diagram, so the predictions cannot be offset from it.

From ChatGPT
Bias Term (Intercept):
The bias term (often referred to as the intercept) is indeed crucial for the linear regression model. It allows the regression line to not necessarily pass through the origin (0,0). Without the bias term, the regression line would be forced to go through the origin, which might not provide a good fit to the data in many cases.

Variance in Predictions:
The bias term doesn't directly provide "variance in predictions" in the sense of making predictions more diverse or spread out. Instead, it provides an offset. It allows the model to make predictions that are offset from what they would be if the regression line passed through the origin. This can be essential for capturing the underlying patterns in the data.



`np.column_stack` turns the 1-D array of ones into a column and places it in front of the feature matrix X.

`X.T` is the transpose of X (X is the feature matrix with the column of ones added, and `T` is a numpy attribute that gives you the transpose). Multiplying `X.T` by `X` gives $X^T X$, the Gram matrix. We then compute its inverse, multiply the inverse by `X.T`, and multiply the result by `y` to get the weights: $w = (X^T X)^{-1} X^T y$.

The function returns the weights split into two parts: `w[0]`, the first element of the weight vector, which is the bias term, and `w[1:]`, the remaining elements, which are the weights of the features.

In [12]:
def train_linear_regression(X, y):
    """Fit a linear regression with the normal equation.

    A column of ones is prepended to ``X`` so that the first weight acts as
    the bias (intercept). The weights are computed as
    ``inv(X^T X) . X^T . y``.

    Parameters
    ----------
    X : 2-D numpy array, one row per observation.
    y : 1-D numpy array of targets, one per row of ``X``.

    Returns
    -------
    (w_0, w) : the bias term and the array of feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X^T X`` is singular and cannot be inverted.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack((bias_column, X))

    gram = np.dot(design.T, design)
    gram_inv = np.linalg.inv(gram)
    weights = np.dot(np.dot(gram_inv, design.T), y)

    return weights[0], weights[1:]


What's happening here is a function that fills all of the null values in the dataset with a given value instead of leaving them blank. This allows the linear regression model to run. We could fill the nulls with 0 or with the mean of the column; the value to use is passed in as `fillna_value`. `X = df.values` simply gives us the numpy array (the matrix) that we will run the regression with.

In [13]:
def prepare_X(df, fillna_value):
    """Return the feature matrix of ``df`` with missing values filled in.

    Every missing value in ``df`` is replaced with ``fillna_value``; the
    input frame itself is left unmodified.

    Parameters
    ----------
    df : pandas DataFrame of features.
    fillna_value : value substituted for each missing entry.

    Returns
    -------
    numpy array holding the values of the filled frame.
    """
    return df.fillna(fillna_value).values

This portion is to solve for the level of error between the prediction and actual target variable.

It is squared to ensure the numbers are positive, and squaring also means the largest errors contribute the most. Taking the mean then gives us a general idea of how far off our model is, and the square root brings the result back to the same units as the target. The higher the number, the worse our model is. Note that `** 2` in the code squares the error; it does not multiply it by 2.

Chatgpt

Multiplying by 2: Ah, the mysterious multiplication by 2! In the context of gradient descent optimization (a method to tweak the model to reduce error), the derivative of the MSE includes a factor of 2. To simplify calculations, some folks include this factor directly in the MSE. However, for evaluation purposes, this factor is often unnecessary. It's a bit like wearing two watches to ensure punctuality – a tad overzealous, but it has its reasons. (Note: the `rmse` function in this notebook contains no factor of 2; its `** 2` is the squaring step.)

Squaring the error ensures that it is positive.



In [14]:
def rmse(y, y_pred):
    """Root mean squared error between targets ``y`` and predictions ``y_pred``.

    Both arguments are expected to be array-like objects of the same shape
    that support element-wise arithmetic (e.g. numpy arrays).
    """
    diff = y_pred - y
    mean_squared = (diff * diff).mean()
    return np.sqrt(mean_squared)

Mean of Total Bedrooms: The choice to fill missing values in the 'total_bedrooms' feature with its mean is likely based on domain knowledge or exploratory data analysis. 
Perhaps 'total_bedrooms' has missing values that need addressing, while other features might not have missing values or might be handled differently. 
Using the mean is a common strategy to fill in missing data because it keeps the feature's mean unchanged, although it slightly reduces the feature's variance.

Preparing X_train: Spot on! You're using the prepare_X function to clean the training dataset. By filling in missing values in 'total_bedrooms' with its mean, 
you're ensuring that the model doesn't get tripped up by any gaps in the data. And yes, the train_linear_regression function is geared towards finding the optimal weights 
for the model based on the provided data. It's not the entire linear regression algorithm but rather a crucial part of it that determines how each feature influences the prediction.

Preparing X_val and Predicting: Absolutely right! You're preparing the validation dataset in the same way as the training dataset, ensuring consistency. With the weights (w_0 and w) obtained from training, you're making predictions on this validation set. The equation y_pred = w_0 + X_val.dot(w) is the linear regression prediction formula in action.

Calculating RMSE: Precisely! The rmse function computes the Root Mean Squared Error, a measure of how far off your predictions are from the actual values. By using it here, you're assessing the model's performance on the validation set, especially after addressing the missing values with the mean.



In [15]:
# Impute missing values with the mean of total_bedrooms, computed on the
# training split only, and score the model on the validation split.
mean = df_train['total_bedrooms'].mean()

X_train = prepare_X(df_train, fillna_value=mean)
X_val = prepare_X(df_val, fillna_value=mean)

w_0, w = train_linear_regression(X_train, y_train)
y_pred = X_val.dot(w) + w_0

rmse(y_val, y_pred)

0.34056998014452095

In [16]:
# Same experiment, but with missing values replaced by 0.
X_train = prepare_X(df_train, fillna_value=0)
X_val = prepare_X(df_val, fillna_value=0)

w_0, w = train_linear_regression(X_train, y_train)
y_pred = X_val.dot(w) + w_0

rmse(y_val, y_pred)

0.34084790341630966

## Regularization in Linear Regression

In our linear regression model, we introduce a **regularization** term. The primary purpose of this term is to mitigate **overfitting**. Overfitting occurs when our regression model performs exceptionally well on the training data but fails to generalize effectively to new, unseen data.

### The Role of `np.eye` and Regularization

The function `np.eye` in numpy is used to generate an **identity matrix**. This identity matrix, when scaled by our regularization parameter (often denoted as $\lambda$, or `r` in our context), is added to the **Gram matrix** $X^T X$. The Gram matrix is the result of multiplying the transpose of matrix $X$ with $X$ itself, capturing the relationships between the features across all data points.

By adding the scaled identity matrix to $X^T X$, we incorporate the regularization penalty into our weight calculation process. This penalty ensures that our model doesn't rely too heavily on any single feature and is less prone to overfitting.

### Conclusion

Regularization is a powerful technique in machine learning, ensuring our models are robust and perform consistently across both training and unseen data.


In [17]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit a regularized linear regression with the normal equation.

    A column of ones is prepended to ``X`` so that the first weight acts as
    the bias. ``r`` times the identity matrix is added to ``X^T X`` before
    it is inverted; with ``r=0.0`` no regularization is applied.

    Parameters
    ----------
    X : 2-D numpy array, one row per observation.
    y : 1-D numpy array of targets, one per row of ``X``.
    r : regularization strength added to the diagonal of ``X^T X``.

    Returns
    -------
    (w_0, w) : the bias term and the array of feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the (regularized) ``X^T X`` is singular and cannot be inverted.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack((bias_column, X))

    gram = np.dot(design.T, design)
    gram_reg = gram + r * np.eye(gram.shape[0])

    gram_inv = np.linalg.inv(gram_reg)
    weights = np.dot(np.dot(gram_inv, design.T), y)

    return weights[0], weights[1:]

This is a loop over r, the regularization strength. For each value of r in the list, the loop does the following:

`w_0, w` are computed by running the regularized training function on X_train and y_train, with r set to the current value from the list.
Then we make predictions using w_0 plus X_val multiplied by the weights. X_val could technically be replaced by any other feature matrix prepared in the same way.
Then we compute the RMSE so that we can see the size of the error, and we print the two values: the regularization strength and the RMSE.

In [19]:
# Try several regularization strengths and print the validation RMSE for each.
reg_strengths = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

for r in reg_strengths:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = X_val.dot(w) + w_0
    rmse_val = rmse(y_val, y_pred)
    print('%06s %0.5f' % (r, rmse_val))

     0 0.34085
 1e-06 0.34085
0.0001 0.34085
 0.001 0.34085
  0.01 0.34088
   0.1 0.34129
     1 0.34490
     5 0.34774
    10 0.34831


Is this just the entire code being placed in a function?

Not exactly a function, but rather a loop. The loop is iterating over different values of s and, for each value, it's shuffling the data, splitting it into training, validation, and test sets, training a linear regression model, making predictions, and then computing the RMSE. The entire process you previously went through for one dataset split is now being repeated for multiple splits, determined by different random seeds (s).
What is s standing for?

s stands for the random seed value used by the np.random.seed(s) function. Setting a seed ensures that the random operations, like shuffling in this case, are reproducible. By iterating over different seed values, you're essentially creating different shuffles (or permutations) of your dataset. This allows you to evaluate the model's performance across various data splits, giving a more robust understanding of its generalization capabilities.
Think of s as a key to a particular arrangement of a deck of cards. Each key (s value) will shuffle the deck in a unique but reproducible way. By trying out different keys, you're seeing how your game (or model) performs with different card arrangements.
Additional Insight:

The list rmses is collecting the RMSE values for each shuffle (seed value). At the end of the loop, you'll have a list of RMSE values, one for each seed. This can be useful to compute statistics like the average RMSE or its standard deviation across different data splits.

In [20]:
# Validation RMSE obtained with each seed, in seed order.
rmses = []

# The split sizes depend only on len(df), not on the seed, so they are
# computed once here rather than on every iteration.
n = len(df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

for s in range(10):
    # A different seed produces a different shuffle, hence a different split.
    idx = np.arange(n)
    np.random.seed(s)
    np.random.shuffle(idx)

    df_shuffled = df.iloc[idx]

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test = df_shuffled.iloc[n_train+n_val:].copy()

    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    y_train_orig = df_train.median_house_value.values
    y_val_orig = df_val.median_house_value.values
    y_test_orig = df_test.median_house_value.values

    y_train = np.log1p(y_train_orig)
    y_val = np.log1p(y_val_orig)
    # The test targets are prepared for parity with the other split cells;
    # they are not used inside this loop.
    y_test = np.log1p(y_test_orig)

    # Remove the target from the feature frames before training.
    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']

    # Train with zero-imputation and score on the validation split.
    X_train = prepare_X(df_train, fillna_value=0)
    w_0, w = train_linear_regression(X_train, y_train)

    X_val = prepare_X(df_val, fillna_value=0)
    y_pred = w_0 + X_val.dot(w)

    result = rmse(y_val, y_pred)
    print(s, result)

    rmses.append(result)

0 0.33773871600616767
1 0.337799935365921
2 0.33842870067644554
3 0.33200494683066273
4 0.33944518625587883
5 0.3433819705275054
6 0.33853302117632905
7 0.34687476972950265
8 0.35127368659576985
9 0.3341558266506293


In [21]:
# Standard deviation of the validation RMSE across the seeds tried above.
np.array(rmses).std()

0.005465718180952141

In [22]:
# Rebuild the train / validation / test split, this time with seed 9.
n = len(df)

n_val = n_test = int(n * 0.2)
n_train = n - n_val - n_test

np.random.seed(9)
idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

val_end = n_train + n_val
df_train = df_shuffled.iloc[:n_train].copy().reset_index(drop=True)
df_val = df_shuffled.iloc[n_train:val_end].copy().reset_index(drop=True)
df_test = df_shuffled.iloc[val_end:].copy().reset_index(drop=True)

# Raw targets for each split, then their log1p transform.
y_train_orig = df_train['median_house_value'].values
y_val_orig = df_val['median_house_value'].values
y_test_orig = df_test['median_house_value'].values

y_train, y_val, y_test = (
    np.log1p(target) for target in (y_train_orig, y_val_orig, y_test_orig)
)

# Remove the target from the feature frames so it is not used as an input.
for frame in (df_train, df_val, df_test):
    del frame['median_house_value']

In [23]:
# Train on the training split with zero-imputation and report the RMSE on
# the test split.
X_train = prepare_X(df_train, fillna_value=0)
X_test = prepare_X(df_test, fillna_value=0)

w_0, w = train_linear_regression(X_train, y_train)
y_pred = X_test.dot(w) + w_0

result = rmse(y_test, y_pred)
print(result)

0.33488435337023326
