In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split


In [2]:

class OurLinearRegression:
    """Ordinary least-squares linear regression with an intercept term.

    After ``fit`` the coefficients are stored in ``self.w``: the first entry
    is the intercept (it multiplies the column of ones added by
    ``_prepare_inputs``) and the remaining entries are the feature weights,
    in column order.
    """

    def _prepare_inputs(self, X):
        """Return ``X`` as a 2-D array with a leading column of ones.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix. Anything ``np.asarray`` accepts is allowed.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features + 1)

        Raises
        ------
        ValueError
            If ``X`` is not 2-dimensional.
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-dimensional (n_samples, n_features); got ndim={X.ndim}"
            )
        # Integer, boolean or object input is promoted to float64 so the
        # least-squares solver receives a numeric matrix; floating (and
        # complex) input keeps its own dtype.
        if not np.issubdtype(X.dtype, np.inexact):
            X = X.astype(np.float64)
        ones = np.ones((X.shape[0], 1), dtype=X.dtype)
        return np.concatenate((ones, X), axis=1)

    def fit(self, X, y):
        """Estimate the coefficients that minimise the squared error.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, n_targets)

        Returns
        -------
        OurLinearRegression
            ``self``, so calls can be chained.
        """
        design = self._prepare_inputs(X)
        # lstsq solves the same least-squares problem as the normal equations
        # (X^T X) w = X^T y, but it does not raise when X^T X is singular
        # (e.g. duplicated or collinear columns); in that case it returns the
        # minimum-norm solution. It also avoids forming X^T X explicitly.
        self.w, *_ = np.linalg.lstsq(design, np.asarray(y), rcond=None)
        return self

    def predict(self, X):
        """Return predictions for ``X`` using the coefficients from ``fit``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray
            The product of the intercept-augmented ``X`` and ``self.w``.
        """
        design = self._prepare_inputs(X)
        return np.matmul(design, self.w)


def Root_Mean_Squared_Logarithmic_Error(true, predicted):
    """Compute the root mean squared logarithmic error (RMSLE).

    RMSLE = sqrt(mean((log(1 + predicted) - log(1 + true)) ** 2))

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Predicted values, broadcastable against ``true``.

    Returns
    -------
    float
        The RMSLE. Values of ``true`` or ``predicted`` that are <= -1 make
        the logarithm undefined, so NumPy yields ``nan``/``inf`` for them
        (with a RuntimeWarning) and the result is not finite.
    """
    # asarray lets plain Python sequences be passed, not only arrays/Series.
    true = np.asarray(true)
    predicted = np.asarray(predicted)
    # log1p(x) evaluates log(1 + x) without the rounding that makes
    # ``x + 1`` collapse to exactly 1.0 for very small x.
    log_diff = np.log1p(predicted) - np.log1p(true)
    return np.sqrt(np.mean(np.square(log_diff)))

In [3]:
# Load the cleaned training data.
df = pd.read_csv("../Data/train_data_after_cleaning.csv")

# The target is trip_duration; every other column is used as a feature.
y = df.trip_duration
X = df.drop(["trip_duration"], axis=1)

# 80/20 train/validation split; random_state=0 keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# OurLinearRegression reads X.shape / X.dtype and uses NumPy routines, so
# hand it plain NumPy arrays rather than pandas objects.
X_train = np.array(X_train)
X_valid = np.array(X_valid)
y_train = np.array(y_train)
y_valid = np.array(y_valid)

# Fit once: fit() returns the fitted model, so no second fit call is needed.
our_model = OurLinearRegression().fit(X_train, y_train)

# NOTE(review): the value labelled "RMS" below is the square root of the
# RMSLE, which is itself already a root; the label and formula are kept
# unchanged so the printed output stays comparable with earlier runs.
y_train_predict = our_model.predict(X_train)
training_error = Root_Mean_Squared_Logarithmic_Error(y_train, y_train_predict)
print(f"Training Error: {training_error} (RMS: {training_error**0.5})")

# Same metric on the held-out validation split.
y_test_predict = our_model.predict(X_valid)
testing_error = Root_Mean_Squared_Logarithmic_Error(y_valid, y_test_predict)
print(f"Testing Error: {testing_error} (RMS: {testing_error**0.5})")

Training Error: 0.6810744349005731 (RMS: 0.8252723398373248)
Testing Error: 0.679421691103915 (RMS: 0.8242703992646557)


In [4]:
# Persist the fitted model to disk. pickle stores instances by reference to
# their class, so OurLinearRegression must be defined/importable wherever the
# file is loaded again.
filename = 'linear_regression_model.sav'
# The context manager guarantees the file is flushed and closed, even if
# pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(our_model, model_file)