## Reading data

In [1]:
# Imports.
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# Read csv file into a dataframe.
df = pd.read_csv('00150001.rain.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,day,duration,hour,weekday,rain
0,0,1,105,6,1,0.0
1,1,1,92,6,1,0.0
2,2,1,93,6,1,0.0
3,3,1,94,6,1,0.0
4,4,1,148,6,1,0.0
5,5,1,96,6,1,0.0
6,6,1,108,6,1,0.0
7,7,1,107,7,1,0.0
8,8,1,107,7,1,0.0
9,9,1,103,7,1,0.0


In [3]:
df = df.drop('Unnamed: 0',1)

In [4]:
df.duration.mean()

99.937284631288762

In [5]:
# Print the feature types in our dataset.
df.dtypes

day           int64
duration      int64
hour          int64
weekday       int64
rain        float64
dtype: object

In [None]:
## Understanding data

In [6]:
# Look at correlations for all the continuous features.
df[['day', 'duration', 'hour', 'weekday','rain']].corr()

Unnamed: 0,day,duration,hour,weekday,rain
day,1.0,-0.065471,0.055742,-0.692202,0.028806
duration,-0.065471,1.0,-0.158696,0.109236,0.046049
hour,0.055742,-0.158696,1.0,-0.085978,-0.007791
weekday,-0.692202,0.109236,-0.085978,1.0,0.056339
rain,0.028806,0.046049,-0.007791,0.056339,1.0


In [7]:
# The correlation for a given pair of features
df[['rain', 'duration']].corr().as_matrix()[0,1]

0.046048552336026942

# Linear regression with package <span style="color:red">statsmodels</span>

## Simple linear regression (one descriptive feature)
### Training a model

## Multiple linear regression (using more than one feature)
### Training the model

In [8]:
#Import statsmodels package for training a linear regression model.
import statsmodels.formula.api as sm
# Train aka fit, a model using all continuous features.
lm = sm.ols(formula="duration ~  day + hour + weekday + rain", data=df).fit()

# Print the weights learned for each feature.
print(lm.params)

Intercept    101.880502
day            0.087984
hour          -0.378734
weekday        3.281907
rain           1.449389
dtype: float64


### Evaluating the model on training data (p-values, R-squared)

In [9]:
# Print the table with feature weights, statistical confidence (p-values, confidence intervals) 
# and goodness of fit metrics (e.g., R-squared, Log-likelihood, AIB, BIC).
# Only feature Size is found to be statistically significant (p-value is smaller than 0.05, p-value=0).
# If the 95% confidence interval includes zero, the p-value for that coefficient will be greater than 0.05.

# The most common metric to evaluate the overall fit of a linear model is the R-squared value. 
# R-squared is the reduction in error over the null model which simply predicts 
# the mean target feature value in the given dataset (e.g., in our dataset the average RentalPrice is 455.5, 
# the null model is RentalPrice = 455.5, which only has an intercept, all other weights are zero). 

# R-squared is also interpreted as the proportion of variance in the observed data 
# that is explained by the model. R-squared is between 0 and 1, and higher is better 
# because it means that more variance is explained by the model. 
# We can use R-squared to compare different models, e.g., the simple regression model with only feature Size has 
# R-squared of 0.943, while including the other continuous features leads to a model with higher R-squared of 0.955.
# This looks like a better model looking at the R-squared measure, but this could be due to 
# over-fitting the training data. We also need to make predictions on out-of-sample, also called test data,
# and then compare the results to make sure the model generalises to new data and does not only 
# capture insignificant details in the sample training data, a problem called over-fitting of the training data.
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:               duration   R-squared:                       0.036
Model:                            OLS   Adj. R-squared:                  0.035
Method:                 Least Squares   F-statistic:                     40.64
Date:                Sun, 25 Jun 2017   Prob (F-statistic):           1.74e-33
Time:                        20:06:20   Log-Likelihood:                -16789.
No. Observations:                4353   AIC:                         3.359e+04
Df Residuals:                    4348   BIC:                         3.362e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    101.8805      1.008    101.114      0.0

### Evaluating the model on training and test data
Split the dataset into a training set and a test set. Fit the model on the training set, evaluate both on the training and the test set. Compare the prediction error on both sets.

In [10]:
df.head(10)

Unnamed: 0,day,duration,hour,weekday,rain
0,1,105,6,1,0.0
1,1,92,6,1,0.0
2,1,93,6,1,0.0
3,1,94,6,1,0.0
4,1,148,6,1,0.0
5,1,96,6,1,0.0
6,1,108,6,1,0.0
7,1,107,7,1,0.0
8,1,107,7,1,0.0
9,1,103,7,1,0.0


In [11]:
# Take the first 7 rows as training data
df_train = df[:7]
df_train

Unnamed: 0,day,duration,hour,weekday,rain
0,1,105,6,1,0.0
1,1,92,6,1,0.0
2,1,93,6,1,0.0
3,1,94,6,1,0.0
4,1,148,6,1,0.0
5,1,96,6,1,0.0
6,1,108,6,1,0.0


In [12]:
# Take the last 3 rows as test data
df_test = df[7:]
df_test

Unnamed: 0,day,duration,hour,weekday,rain
7,1,107,7,1,0.0
8,1,107,7,1,0.0
9,1,103,7,1,0.0
10,1,106,7,1,0.0
11,1,117,7,1,0.0
12,1,112,7,1,0.0
13,1,115,8,1,0.0
14,1,108,8,1,0.0
15,1,110,8,1,0.0
16,1,109,8,1,0.0


## Training on df_train

In [14]:
# Train aka fit, a model using all continuous features.
lm = sm.ols(formula="duration ~  day + hour + weekday + rain", data=df_train).fit()

# Print the weights learned for each feature.
print(lm.params)

Intercept     2.695971
day           2.695971
hour         16.175824
weekday       2.695971
rain          0.000000
dtype: float64


In [15]:
# Print the detailed metrics for the trained model.
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:               duration   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                       inf
Date:                Sun, 25 Jun 2017   Prob (F-statistic):                nan
Time:                        20:07:25   Log-Likelihood:                -30.323
No. Observations:                   7   AIC:                             62.65
Df Residuals:                       6   BIC:                             62.59
Df Model:                           0                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.6960      0.193     13.988      0.0

  return np.sqrt(eigvals[0]/eigvals[-1])
  return self.ess/self.df_model
  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


## Testing on df_train

In [16]:
lm.predict(df_train)

0    105.142857
1    105.142857
2    105.142857
3    105.142857
4    105.142857
5    105.142857
6    105.142857
dtype: float64

In [41]:
predict_df_train = pd.DataFrame({'actual': df_train.RentalPrice, 'predicted': lm.predict(df_train)})
predict_df_train

Unnamed: 0,ActualPrice,PredictedPrice
0,320,330.884397
1,380,358.197918
2,400,410.41758
3,390,384.026604
4,385,396.060014
5,410,409.571539
6,480,475.841948


In [17]:
#Pair the actual and the predicted values
print("Actual - Predicted:\n", (df_train.duration - lm.predict(df_train)))
print("\n(Actual - Predicted) squared:\n", (df_train.duration - lm.predict(df_train))**2)

Actual - Predicted:
 0    -0.142857
1   -13.142857
2   -12.142857
3   -11.142857
4    42.857143
5    -9.142857
6     2.857143
dtype: float64

(Actual - Predicted) squared:
 0       0.020408
1     172.734694
2     147.448980
3     124.163265
4    1836.734694
5      83.591837
6       8.163265
dtype: float64


In [18]:
# Print the Mean Squared Error of the model on the training set
mse = ((df_train.duration - lm.predict(df_train))** 2).mean()
print("\nMean Squared Error:\n", mse)


Mean Squared Error:
 338.979591837


In [19]:
print("|Actual - Predicted|:\n", abs(df_train.duration - lm.predict(df_train)))

|Actual - Predicted|:
 0     0.142857
1    13.142857
2    12.142857
3    11.142857
4    42.857143
5     9.142857
6     2.857143
dtype: float64


In [20]:
# Print the Mean Absolute Error of the model on the training set
mae = abs(df_train.duration - lm.predict(df_train)).mean()
print("\nMean Absolute Error:\n", mae)


Mean Absolute Error:
 13.0612244898


## Testing on df_test

In [21]:
predict_df_test = pd.DataFrame({'actual': df_test.duration, 'predicted': lm.predict(df_test)})
predict_df_test

Unnamed: 0,actual,predicted
7,107,121.318681
8,107,121.318681
9,103,121.318681
10,106,121.318681
11,117,121.318681
12,112,121.318681
13,115,137.494505
14,108,137.494505
15,110,137.494505
16,109,137.494505


In [22]:
# Print the Mean Squared Error of the model on the test set
mse = ((df_test.duration - lm.predict(df_test))** 2).mean()
print("\nMean Squared Error:\n", mse)


Mean Squared Error:
 22141.4480997


In [23]:
# Print the Mean Absolute Error of the model on the test set
mae = abs(df_test.duration - lm.predict(df_test)).mean()
print("\nMean Absolute Error:\n", mae)


Mean Absolute Error:
 127.034058517
