In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score

In [12]:
import numpy as np
class Metrics:
    @staticmethod
    def  mean_absolute_error(y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        return float(np.mean(np.abs(y_true-predictions)))
    @staticmethod
    def mean_squared_error(y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        return float(np.mean((y_true-predictions)**2))
    @staticmethod
    def root_mean_squared_error(y_test, y_pred):
        return float(np.sqrt(Metrics.mean_squared_error(y_test, y_pred)))
    @staticmethod
    def mean_absolute_percentage_error(y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        return float(np.mean(np.abs((y_true-predictions)/y_true)))
    @staticmethod
    def r_2_score( y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        mean_value = np.mean(y_test)
        return float(1 - np.sum((y_true - predictions)**2)/ np.sum((y_true - mean_value)**2))
    

In [13]:
data= pd.read_csv("../data/trip_duration_task_m.csv")
data.drop(["Unnamed: 0"], axis=1, inplace=True)


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199494 entries, 0 to 199493
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   pickup_longitude   199494 non-null  float64
 1   pickup_latitude    199494 non-null  float64
 2   dropoff_longitude  199494 non-null  float64
 3   dropoff_latitude   199494 non-null  float64
 4   trip_duration      199494 non-null  int64  
dtypes: float64(4), int64(1)
memory usage: 7.6 MB


In [15]:
y = data['trip_duration']
X = data.drop(['trip_duration'], axis=1)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [17]:
standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train_std = pd.DataFrame(standardScaler.transform(X_train), columns=X.columns)
X_test_std = pd.DataFrame(standardScaler.transform(X_test), columns=X.columns)
X_train_std.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
count,159595.0,159595.0,159595.0,159595.0
mean,1.90492e-14,1.161132e-13,-1.252507e-13,1.103077e-13
std,1.000003,1.000003,1.000003,1.000003
min,-10.76074,-15.8094,-17.9068,-13.96128
25%,-0.467434,-0.4738595,-0.475552,-0.4850114
50%,-0.213793,0.1106436,-0.1699326,0.07938435
75%,0.1483306,0.6091119,0.2708647,0.5458616
max,87.57687,59.72262,91.95123,51.98126


In [18]:
np.set_printoptions(suppress=True)

In [19]:
import numpy as np
class MyRidge:
    def __init__( self, learning_rate, iterations, alpha ) :
        self.learning_rate = learning_rate        
        self.iterations = iterations        
        self.alpha = alpha

    def transform_(self, x):
        return np.concatenate((np.ones((len(x), 1)), x), axis = 1)

    def loss_func(self, x, y, w):
        return sum(y - np.dot(x, w)) ** 2  + self.alpha * (np.matmul(w, w))/X.shape[0]

    def fit(self, x, y):
        dist = np.inf
        eps = 1e-20
        X = self.transform_(x)

        w = np.zeros(X.shape[1])
        iter = 0

        while iter <= self.iterations:
            loss = self.loss_func(X, y, w)
            w = w - ((self.learning_rate * 2 * np.dot(X.T, np.dot(X, w) - y)) + ( 2 * self.alpha * w ))/X.shape[0]
            dist = np.abs(loss - self.loss_func(X, y, w))
            iter += 1
            
            if(dist <= eps):
                break
        print(iter)
        self.w = w
        return self
        

    def predict(self, x):
        return np.dot(self.transform_(x), self.w)

In [20]:
ridge = MyRidge(0.01, 1000, 1)
model = ridge.fit(X_train_std, y_train)
y_pred = ridge.predict(X_test_std)
print(f'MAE: {Metrics.mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {Metrics.mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {Metrics.root_mean_squared_error(y_test, y_pred)}')
print(f'MAPE: {Metrics.mean_absolute_percentage_error(y_test, y_pred)}')
print(f'R^2: {Metrics.r_2_score(y_test, y_pred):.7f}')

1001
MAE: 606.0067522761593
MSE: 103698706.08110169
RMSE: 10183.256162991369
MAPE: 1.6486488504626207
R^2: 0.0020357


In [22]:
X.columns

Index(['pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude'],
      dtype='object')

In [21]:
ridge.w

array([ 944.57170703,  195.79199074, -108.75391429,  124.48984299,
       -121.74716714])