# Linear Regression<br>

The data pertains to the houses found in a given California district and some summary stats about them based on the 1990 census data.<br>

The columns/ features are as follows: longitude, latitude, housing median age, total_rooms, total_bedrooms, population, households, median_income, median house value, ocean_proximity.<br> The goal here is to predict the median house value based on the other features.<br>

This dataset is a modified version of the California Housing dataset available from:
Luís Torgo's page (University of Porto)

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import copy
import math
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from tensorboardX import SummaryWriter
import matplotlib.pyplot as plt

%matplotlib inline

## Data Loading and Preprocessing

In [18]:
class Preprocessing:
    
    #load the data from the file and split into train and test sets
    def fetch_train_test_data(self, path):
        
        raw_features_and_labels = pd.read_csv(path, sep = ",",)
        #handle categorical features before splitting the data set.
        features_labels = [ "longitude", "latitude", "housing_median_age", "total_rooms", "total_bedrooms", "population", 
                   "households", "median_income", "ocean_proximity"]
        target_label = ["median_house_value"]
        features = raw_features_and_labels[features_labels]
        labels = raw_features_and_labels[target_label]
        #Get categorical features' names
        list_of_categorical_feature_names = []

        for item in features.columns:
            if features.dtypes[item] == "object":
                list_of_categorical_feature_names.append(item)
        
        features_with_categorical_features_one_hot_encoded = pd.DataFrame(self.one_hot_encode_categorical_features(features, list_of_categorical_feature_names))
        #merge the features and labels before splitting
        features_and_labels = features_with_categorical_features_one_hot_encoded.join(labels)
        train_set, test_set = train_test_split(features_and_labels, test_size = 0.2, random_state = 42)    
        return train_set, test_set
      
    
    #Handle Categorical Features
    def one_hot_encode_categorical_features(self, features, list_of_column_names_with_categorical_data ):
        
        print("Before one-hot encoding, the shape of the features: ", features.shape)
        transformer = ColumnTransformer(transformers=[("OneHot",                    # Just a name
                                         OneHotEncoder(),                           # The transformer class
                                         list_of_column_names_with_categorical_data # The column(s) to be applied on.
                                             )
                                            ],
                                            remainder='passthrough' # donot apply anything to the remaining columns
                                            )
        features_with_categorical_features_one_hot_encoded = pd.DataFrame(transformer.fit_transform(features))
        #Note that we lose column names while using the column transformer. Also, the categorical features' new 
        #columns are added to the front of the data frame, instead of where the categorical feature was.
        print("After one-hot encoding, the shape of the features: ", features_with_categorical_features_one_hot_encoded.shape)
        return features_with_categorical_features_one_hot_encoded
    
    
    
    #preprocess training data        
    def preprocess_training_data(self, training_features_and_labels):
        
        num_features = training_features_and_labels.shape[1]       
        training_features = training_features_and_labels.iloc[:, 0:num_features-1]
        training_targets = training_features_and_labels.iloc[:,num_features-1]
        
        #training_features_and_labels.shape, training_features.shape, training_targets.shape)
        #(16512, 14)                         (16512, 13)              (16512,)
        
        #normalize the features
        scaler = MinMaxScaler()
        scaler.fit(training_features)
        scaled_features = pd.DataFrame(scaler.transform(training_features))
        #bias of all ones
        bias_feature = pd.DataFrame(np.ones(len(training_features)).reshape((len(training_features),1)))    
        print("No. of features before adding the bias feature: ", training_features.shape)   
        #stack the new bias feature to the left of the regular features using the axis argument
        features_with_bias = pd.concat([bias_feature, scaled_features], axis = 1)
        print("No. of features after adding the bias feature: ", features_with_bias.shape)
        

        return scaler, features_with_bias, training_targets
                

        
    
    #preprocess testing data
    def preprocess_test_data(self, scaler, testing_features_and_labels):
        
        num_features = testing_features_and_labels.shape[1]           
        testing_features = testing_features_and_labels.iloc[:, 0:num_features-1]
        testing_targets = testing_features_and_labels.iloc[:,num_features-1]
            
        #transform the features using the parameters from the training data
        scaled_features =  pd.DataFrame(scaler.transform(testing_features))
        #bias of all ones
        bias_feature = pd.DataFrame(np.ones(len(scaled_features)).reshape((len(scaled_features),1)))

        #stack the new bias feature to the left of the regular features using the axis argument
        features_with_bias = pd.concat([bias_feature, scaled_features], axis = 1)
        return features_with_bias, testing_targets

    

## Visualization Module

In [None]:
class Visualization:
    
    def __init__(self):
        pass
    
    def display_correlation_among_features(self, features):
        pass
        
    def display_distributions_of_features(self, features):
        pass
        
        
    
    

## Vectorized Gradient Descent Module


In [20]:
class Gradient_Descent:
    
    def __init__(self, learning_rate):
        self.learning_rate = learning_rate
        
    
    def update_parameters(self, Theta, X, Y):
        
        X_times_Theta = X.dot(Theta)
        X_time_Theta_minus_Y = X_times_Theta - Y
        X_transpose = X.T
        gradient_of_loss = X_transpose.dot(X_time_Theta_minus_Y)
        new_weights = Theta - self.learning_rate * gradient_of_loss
        return new_weights

## Regression Module

In [21]:
class Regression:
    
    def __init__(self, max_iters, features_train, targets_train, features_test, targets_test, scaler_used_for_features):
        self.writer = SummaryWriter()
        self.max_iters = max_iters
        self.features_train = features_train
        self.targets_train = targets_train
        self.features_test = features_test
        self.scaler_used_for_features = scaler_used_for_features
    
    def calculate_regression_loss(self, X, Theta, Y):
        
        X_times_Theta = X.dot(Theta)
        X_times_Theta_minus_Y = X_times_Theta - Y
        loss = (1/ (2 * len(Y))) * (X_times_Theta_minus_Y.T).dot(X_times_Theta_minus_Y)
        
        return loss
        
        
        
        
        
        
    

class Linear_Regression(Regression):
    
    def __init__(self):
        self.gradient_descent = Gradient_Descent()
                        
    def batch_gradient_descent_train(self):
        
        #Initialize random weights and biases
        weights = pd.DataFrame(np.random.randint(0,100,size=self.features_train.shape[1])).reshape(1, self.features_train.shape[1])
        
        epoch = 0
        
        while iteration < max_iters:
            print("Training Iteration No: ", iteration)
            
            loss_in_epoch = 0
            
            #Get a shuffled list of indices of training data. (Not required when using batch gradient descent though)
            shuffled_indices = np.random.permutation(len(self.features_train))
            
            weights = self.gradient_descent.update_parameters(weights, self.features_train, self.targets_train)
            
            training_loss_in_epoch = self.calculate_regression_loss(self.features_train, weights, self.targets_train)
            
            testing_loss_in_epoch = self.calculate_regression_loss(self.features_test, weights, self.targets_test)
            
            self.writer.addscalar("Linear Regression Batch GD Training Loss",training_loss_in_epoch, iter)
            self.writer.addscalar("Linear Regression Batch GD Testing Loss",testing_loss_in_epoch, iter)
            
            
            
            
            
            
            
            
            
            
            
            
            
                        
                        
    
    

## Training Module

In [19]:

raw_csv_path = r"C:\Users\Being_Aerys\PycharmProjects\Machine_Learning_Algorithms_Collection\Supervised_Methods\Linear_Regression\data\housing.csv"
preprocessing = Preprocessing()

train_set, test_set = preprocessing.fetch_train_test_data(raw_csv_path)

scaler, normalized_training_data, training_labels = preprocessing.preprocess_training_data(train_set)
#Lets summarize the information of the training data set.
normalized_training_data.head()

test_features, test_labels = preprocessing.preprocess_test_data(scaler, test_set)

max_iters = 25

linear_regression = Linear_Regression(self, max_iters, normalized_training_data, training_labels, test_features, targets_test, scaler_used_for_features)

linear_regression.batch_gradient_descent_train()

Before one-hot encoding, the shape of the features:  (20640, 9)
After one-hot encoding, the shape of the features:  (20640, 13)
No. of features before adding the bias feature:  (16512, 13)
No. of features after adding the bias feature:  (16512, 14)


It appears that one of the features "ocean_proiximity" is a categorical features. If a categorical feature is has two only two classes, they can be encoded as binary features. However, if there are more than 2 categories, they can be either label encoded or one-hot encoded. Label encoding means assigning an integer to each class. However, consider a case where the feature represents the color of the eyes of an individual. If the labelling is done as black eyes --> 1, brown eyes --> 1, blue eyes --> 2, green eyes --> 3, the algorithm that takes in this information will interpret as one color being "larger" than the other color, which is not what we intend to tell. Hence, in such cases, one-hot encoding comes to our rescue which does not present such bias to the algorithm.