In [21]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.linalg as sp_la 

For this homework we will work with the Seoul Bike Data, which comes from https://archive.ics.uci.edu/ml/datasets/Seoul+Bike+Sharing+Demand. 

In [22]:
# Scratch check of NumPy fancy indexing: a list of column indices picks
# those columns (here the first and third) for every row.
data = np.arange(1, 7).reshape(2, 3)
data[:, [0, 2]]

array([[1, 3],
       [4, 6]])

In [23]:
# Parse CSV columns 1 through 10 as floats, skipping the single header row.
data = np.genfromtxt(
    'data/SeoulBikeData.csv',
    delimiter=',',
    skip_header=1,
    dtype=float,
    encoding="utf-8",
    usecols=list(range(1, 11)),
)
# Human-readable label for each column of `data`, in the same order.
columns = [
    "Rented Bike Count",
    "Hour",
    "Temperature(C)",
    "Humidity(%)",
    "Wind speed (m/s)",
    "Visibility (10m)",
    "Dew point temperature(C)",
    "Solar Radiation (MJ/m2)",
    "Rainfall(mm)",
    "Snowfall (cm)",
]

How many data points are in the dataset?

In [24]:
# One row per data point, so the number of rows is the answer.
print(len(data))

8760


What are the min, max, mean and standard deviation for the dependent variable rented bike count?

In [25]:
# Column 0 holds the dependent variable (labelled "Rented Bike Count").
rented = data[:, 0]
print(rented.min(), rented.max(), rented.mean(), rented.std())

0.0 3556.0 704.6020547945205 644.9606517645436


Which two independent variables are least highly correlated with the dependent variable?

In [26]:
# Correlation coefficient of every column with column 0 (the dependent
# variable); the first printed row is column 0 against itself.
target = data[:, 0]
for idx, name in enumerate(columns):
    print(name, np.corrcoef(target, data[:, idx], rowvar=True)[0, 1])

Rented Bike Count 1.0
Hour 0.4102572913224859
Temperature(C) 0.538558153013979
Humidity(%) -0.19978016700089826
Wind speed (m/s) 0.12110844818838673
Visibility (10m) 0.19928029673135902
Dew point temperature(C) 0.3797881212449725
Solar Radiation (MJ/m2) 0.2618369855095913
Rainfall(mm) -0.12307395980285031
Snowfall (cm) -0.1418036499974599


If you fit a linear regression using only the independent variable most highly correlated with the dependent one, what are the slope and intercept you get?

In [27]:
def makePoly(x, polys):
    """Build a polynomial design matrix with a leading intercept column.

    Parameters
    ----------
    x : 2-D array
        One row per observation and one column per independent variable.
    polys : sequence of int
        ``polys[j]`` is the highest power of column ``j`` of ``x`` to include.
        May be empty, in which case the result is an intercept-only matrix.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(x.shape[0], sum(polys) + 1)``. Column 0 is all
        ones; it is followed by ``x[:, j]**1 .. x[:, j]**polys[j]`` for each
        ``j`` in order.
    """
    # The built-in sum keeps the column count an integer even when `polys` is
    # empty (np.sum([]) is the float 0.0, which np.zeros rejects as a shape).
    n_terms = int(sum(polys))
    A = np.zeros([x.shape[0], n_terms + 1])
    # Intercept column; written directly so it does not depend on `x` having
    # at least one column.
    A[:, 0] = 1.0
    k = 1
    for j, poly in enumerate(polys):
        for i in range(1, poly + 1):
            A[:, k] = np.squeeze(x[:, j] ** i)
            k += 1
    return A

def fit(data, independent, dependent, polys):
    """Fit a least-squares polynomial regression and return its coefficients.

    Parameters
    ----------
    data : 2-D array
        Table holding both the independent and dependent variables as columns.
    independent : sequence of int
        Column indices of ``data`` used as independent variables.
    dependent : int
        Column index of ``data`` used as the dependent variable.
    polys : sequence of int
        Highest polynomial power per independent variable, passed to
        ``makePoly``.

    Returns
    -------
    The first element returned by ``scipy.linalg.lstsq``: the fitted
    coefficients, intercept first (matching the column order of ``makePoly``).

    Side effect: prints the shape of the design matrix.
    """
    every_row = np.arange(data.shape[0])
    # Design matrix: intercept column plus the requested powers of each
    # selected column.
    design = makePoly(data[np.ix_(every_row, independent)], polys)
    print(design.shape)

    # lstsq also returns residues, rank and singular values; only the
    # coefficient vector is kept.
    solution = sp_la.lstsq(design, data[:, dependent])
    return solution[0]

def predict(data, independent, polys, c):
    """Evaluate a fitted polynomial regression on the rows of ``data``.

    Parameters
    ----------
    data : 2-D array
        Table whose columns include the independent variables.
    independent : sequence of int
        Column indices of ``data`` used as independent variables.
    polys : sequence of int
        Highest polynomial power per independent variable, passed to
        ``makePoly``.
    c : array
        Coefficients, ordered like the columns produced by ``makePoly``
        (intercept first).

    Returns
    -------
    The product of the design matrix with ``c``: one prediction per row.
    """
    every_row = np.arange(data.shape[0])
    features = data[np.ix_(every_row, independent)]
    # Same design-matrix layout as used when fitting, then apply the coefficients.
    return makePoly(features, polys).dot(c)

In [28]:
def rsquared(y, yhat):
    """Return the coefficient of determination R^2 of predictions ``yhat``.

    Computed as ``1 - SS_res / SS_tot`` where ``SS_res`` is the sum of squared
    differences between ``y`` and ``yhat`` and ``SS_tot`` is the sum of squared
    deviations of ``y`` from its mean.

    Parameters
    ----------
    y : array-like
        Observed values.
    yhat : array-like
        Predicted values; must contain as many elements as ``y``.

    Returns
    -------
    float, or ``0`` (after printing a message) when the inputs differ in size.
    If ``y`` is constant ``SS_tot`` is zero and the result is not finite.
    """
    # Flatten both inputs to 1-D floats. Without this, y of shape (n,) and
    # yhat of shape (n, 1) pass a len() check but broadcast to an (n, n)
    # matrix in the subtraction, silently producing a wrong score.
    y = np.asarray(y, dtype=float).ravel()
    yhat = np.asarray(yhat, dtype=float).ravel()
    if y.size != yhat.size:
        print("Need y and yhat to be the same length!")
        return 0
    ss_res = ((y - yhat) ** 2).sum()
    ss_tot = ((y - y.mean()) ** 2).sum()
    return 1 - ss_res / ss_tot

In [29]:
# Degree-1 fit of column 0 (rented bike count) on column 2 (temperature),
# the column with the largest correlation printed above.
c = fit(data=data, independent=[2], dependent=0, polys=[1])
print(c)

(8760, 2)
[329.95251395  29.08109899]


If you fit a linear regression using all the independent variables, what are the slopes (one coefficient per variable) and the intercept you get?

In [30]:
# Use every column except column 0 (the dependent variable), each as a
# degree-1 term.
ind = [*range(1, len(columns))]
c = fit(data, ind, 0, [1] * len(ind))
print(c)

(8760, 10)
[ 5.48853646e+02  2.73154522e+01  2.65792502e+01 -8.81147935e+00
  6.92209593e+00  2.12873303e-02  5.41311278e+00 -7.93427651e+01
 -5.88068970e+01  2.10769823e+01]


What combination of independent variables gives you a linear regression model that has the lowest R-squared ($R^2$) on the test data?

What do you conclude about bike shares in Seoul from your analysis of this dataset?

In [32]:
# The first 80% of the rows become the training set, the rest the test set.
split_at = int(len(data) / 10 * 8)
train, test = data[:split_at], data[split_at:]
print(train.shape, test.shape)
# Stack 100 copies of the training rows on top of each other
# (presumably to enlarge the problem for the timing cell below - confirm).
train = np.tile(train, (100, 1))
print(train.shape)

(7008, 10) (1752, 10)
(700800, 10)


In [33]:
%%time
c = fit(train, [2], 0, [1])
print(c)

(700800, 2)
[305.20804102  29.43473846]
CPU times: user 60.1 ms, sys: 54 ms, total: 114 ms
Wall time: 312 ms


Bad pipe message: %s [b'\x10*\x847W\xfe\xc2\x00\x0f\xeb\xb7\x16ml\xfaD\x1f\xe5\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0', b"\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x000\x00.\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04"]
Bad pipe message: %s [b'\x08\x06\x04\x01\x05\x01\x06', b'', b'\x03\x03']
Bad pipe message: %s [b'']
Bad pipe message: %s [b'', b'\x02']
Bad pipe message: %s [b'\x05\x02\x06']
Bad pipe message: %s [b'\x8c\xacn\x1c\xa4\xfd,_+\xf91X\xb3