In [20]:
import os
import time, datetime
from contextlib import contextmanager

import numpy as np
import pandas as pd

from sklearn.externals import joblib
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [21]:
@contextmanager
def measure_time(label):
    """
    Context manager to measure time of computation.

    Prints ``Duration of [<label>]: <H:MM:SS.ffffff>`` to stdout when the
    ``with`` block exits, whether it finishes normally or raises.

    Parameters
    ----------
    label : str
        Text inserted between the square brackets of the printed message.

    Yields
    ------
    None
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # (or become negative) if the system clock is adjusted mid-measurement.
    start = time.perf_counter()
    try:
        yield
    finally:
        # Report in `finally` so a failing block still shows how long it ran;
        # the exception itself keeps propagating to the caller.
        elapsed = time.perf_counter() - start
        print('Duration of [{}]: {}'.format(label, datetime.timedelta(seconds=elapsed)))

In [22]:
def load_data(path, to_split=True, test_size=0.2, random_state=None):
    """
    Load the csv file and return the model inputs and targets.

    Parameters
    ----------
    path : str or path-like
        CSV file with a header row; its first column is used as the index.
        It must contain the columns 'stop_lat', 'stop_long' and
        'stop_passengers'.
    to_split : bool, default True
        If True, return a random train/test partition; otherwise return the
        full ``(X, y)`` pair.
    test_size : float, default 0.2
        Forwarded to ``train_test_split`` when ``to_split`` is True.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Pass an int to get the same
        partition on every run; the default keeps the split unseeded.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` when ``to_split`` is True,
        else ``(X, y)``. ``X`` arrays are 2-D with one column per feature and
        ``y`` arrays are 1-D.
    """
    # Read the csv file
    df = pd.read_csv(path, header=0, index_col=0)

    # Get the output values. No squeeze(): a Series already yields a 1-D
    # array, and squeezing a one-row file would collapse it to a 0-d scalar.
    y = df['stop_passengers'].values

    # Get the input values. Kept 2-D even for a single row, which is the
    # layout scikit-learn estimators expect.
    features = ['stop_lat', 'stop_long']
    X = df[features].values

    # Only pay for (and require enough rows for) a split when one is requested.
    if not to_split:
        return X, y

    # Split train and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test


# Load the cleaned data: `path` points at the cleaned HSL CSV, relative to the
# notebook's working directory.
path = './data/cleaned_HSL_data.csv'
# The default call (to_split=True) returns the train/test partitions, kept
# here as notebook-level variables.
# NOTE(review): train() calls load_data() itself, so it draws its own split
# rather than reusing these four arrays.
X_train, X_test, y_train, y_test = load_data(path)

In [23]:
def train(path, to_split=True):
    """
    Train the model.

    Fits a ``LinearRegression`` on the data loaded from ``path``, pickles the
    fitted model to ``models/lin_reg.pkl`` and prints the mean squared error
    on the training set (and on the test set when ``to_split`` is True).

    Parameters
    ----------
    path : str or path-like
        CSV file passed straight through to ``load_data``.
    to_split : bool, default True
        If True, fit on the training partition and also report the test MSE;
        if False, fit on the whole dataset and report the training MSE only.

    Returns
    -------
    None
    """
    filename = "models/lin_reg.pkl"
    separator = "================================================================="

    # Load the training (and testing) set(s); the shape of the returned tuple
    # depends on to_split, so unpack after a single call.
    data = load_data(path, to_split=to_split)
    if to_split:
        X_train, X_test, y_train, y_test = data
    else:
        X_train, y_train = data

    # Time only the fit, so the reported duration is not inflated by disk I/O.
    with measure_time('Training...'):
        model = LinearRegression()
        model.fit(X_train, y_train)

    # Persist the fitted model; create the target directory first so a fresh
    # checkout without a models/ folder does not fail after training.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    joblib.dump(model, filename)

    y_pred = model.predict(X_train)
    print(separator)
    print("Linear Regression Training set MSE: {}".format(mean_squared_error(y_train, y_pred)))
    print(separator)

    if to_split:
        y_pred = model.predict(X_test)
        print("Linear Regression Test set MSE: {}".format(mean_squared_error(y_test, y_pred)))
        print(separator)
        
        
# Train our model on the cleaned dataset with the default train/test split.
# The call prints the fit duration and the train/test MSE, and pickles the
# fitted model to models/lin_reg.pkl.
train(path)

Duration of [Training...]: 0:00:00.001913
Linear Regression Training set MSE: 1648780.277135291
Linear Regression Test set MSE: 874116.5471934967


In [24]:
# Take the square root of the test-set MSE printed above to get an RMSE, i.e.
# an error on the same scale as the target values.
# NOTE(review): 874116 is hand-copied (and truncated) from the output of one
# particular run. load_data() does not seed its split by default, so this
# value goes stale on any re-run. TODO: derive it from the computed MSE.
import math
print(math.sqrt(874116))

934.9417094129452
