In [105]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.dummy import DummyRegressor
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as scs
import statsmodels.api as sm
from statsmodels.formula.api import ols
%matplotlib inline

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
full_df = train_df.append(test_df)

In [4]:
full_df

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3.0,13.0,16.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8.0,32.0,40.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5.0,27.0,32.0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3.0,10.0,13.0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6488,2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
6489,2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
6490,2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,,,
6491,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,


In [5]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17379 entries, 0 to 6492
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    17379 non-null  object 
 1   season      17379 non-null  int64  
 2   holiday     17379 non-null  int64  
 3   workingday  17379 non-null  int64  
 4   weather     17379 non-null  int64  
 5   temp        17379 non-null  float64
 6   atemp       17379 non-null  float64
 7   humidity    17379 non-null  int64  
 8   windspeed   17379 non-null  float64
 9   casual      10886 non-null  float64
 10  registered  10886 non-null  float64
 11  count       10886 non-null  float64
dtypes: float64(6), int64(5), object(1)
memory usage: 1.7+ MB


In [6]:
full_df.set_index('datetime', inplace=True)
full_df.index = pd.to_datetime(full_df.index)
full_df_sum = full_df.resample('1D').sum()
full_df_mean = full_df.resample('1D').mean()

In [7]:
full_df_mean['count'] = full_df_sum['count']
full_df_mean['registered'] = full_df_sum['registered']
full_df_mean['casual'] = full_df_sum['casual']


In [8]:
full_df = full_df_mean
full_df.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01,1.0,0.0,0.0,1.583333,14.110833,18.18125,80.583333,10.749871,331.0,654.0,985.0
2011-01-02,1.0,0.0,0.0,1.956522,14.902609,17.686957,69.608696,16.652122,131.0,670.0,801.0
2011-01-03,1.0,0.0,1.0,1.0,8.050909,9.470227,43.727273,16.636709,120.0,1229.0,1349.0
2011-01-04,1.0,0.0,1.0,1.043478,8.2,10.606087,59.043478,10.739809,108.0,1454.0,1562.0
2011-01-05,1.0,0.0,1.0,1.0,9.305217,11.463478,43.695652,12.5223,82.0,1518.0,1600.0


In [9]:
corr_df = full_df.corr().abs().stack().reset_index().sort_values(0, ascending = False)

In [10]:
corr_df[(corr_df[0]>.75) & (corr_df[0]<1)]

Unnamed: 0,level_0,level_1,0
49,temp,atemp,0.991702
59,atemp,temp,0.991702
119,count,registered,0.981032
109,registered,count,0.981032
118,count,casual,0.797506
98,casual,count,0.797506


In [11]:
X = full_df.drop(['registered', 'casual', 'atemp', 'count'], axis = 1)
y = full_df['count']

In [12]:
mlr_model = ols(formula='y~X', data=full_df).fit()
mlr_model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.099
Model:,OLS,Adj. R-squared:,0.09
Method:,Least Squares,F-statistic:,11.3
Date:,"Tue, 28 Jul 2020",Prob (F-statistic):,1.23e-13
Time:,17:01:52,Log-Likelihood:,-6764.4
No. Observations:,731,AIC:,13540.0
Df Residuals:,723,BIC:,13580.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3109.6056,630.820,4.929,0.000,1871.148,4348.063
X[0],293.0552,92.555,3.166,0.002,111.346,474.764
X[1],-47.7723,581.563,-0.082,0.935,-1189.527,1093.982
X[2],-28.7111,210.048,-0.137,0.891,-441.087,383.665
X[3],-105.4266,306.618,-0.344,0.731,-707.395,496.542
X[4],74.8037,13.809,5.417,0.000,47.694,101.913
X[5],-29.8612,9.664,-3.090,0.002,-48.834,-10.888
X[6],-36.1202,19.807,-1.824,0.069,-75.006,2.766

0,1,2,3
Omnibus:,419.715,Durbin-Watson:,0.313
Prob(Omnibus):,0.0,Jarque-Bera (JB):,42.474
Skew:,-0.063,Prob(JB):,5.98e-10
Kurtosis:,1.826,Cond. No.,473.0


In [13]:
full_df = full_df[['season', 'holiday', 'workingday', 'weather', 'temp', 'humidity', 'windspeed', 'count']]
full_df

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,humidity,windspeed,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-01-01,1.0,0.0,0.0,1.583333,14.110833,80.583333,10.749871,985.0
2011-01-02,1.0,0.0,0.0,1.956522,14.902609,69.608696,16.652122,801.0
2011-01-03,1.0,0.0,1.0,1.000000,8.050909,43.727273,16.636709,1349.0
2011-01-04,1.0,0.0,1.0,1.043478,8.200000,59.043478,10.739809,1562.0
2011-01-05,1.0,0.0,1.0,1.000000,9.305217,43.695652,12.522300,1600.0
...,...,...,...,...,...,...,...,...
2012-12-27,1.0,0.0,1.0,1.666667,10.420833,65.291667,23.458933,0.0
2012-12-28,1.0,0.0,1.0,1.708333,10.386667,59.000000,10.416546,0.0
2012-12-29,1.0,0.0,0.0,2.041667,10.386667,75.291667,8.333683,0.0
2012-12-30,1.0,0.0,0.0,1.333333,10.489167,48.333333,23.500529,0.0


In [14]:
full_df['count'].isna().sum()

0

In [17]:
_train_df = full_df[full_df['count'] > 0]

In [18]:
_train_df.shape

(456, 8)

In [19]:
_test_df = full_df[full_df['count'] == 0]

In [20]:
_test_df.shape

(275, 8)

In [22]:
_test_df['count'] = np.where(_test_df['count'] == 0, np.nan, _test_df['count'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [27]:
_X = _train_df.drop(['count'], axis = 1)
_y = _train_df['count']

In [111]:
X_train, X_test, y_train, y_test = train_test_split(_X, _y, random_state=42, test_size=0.2)

In [115]:
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)
dummy_regr.predict(X_train)
dummy_regr.score(X_train, y_train)

0.0

In [114]:
dummy_regr.predict(X_test)
dummy_regr.score(X_test, y_test)

-0.0007967417736325366

In [28]:
poly_2 = PolynomialFeatures(degree=2, interaction_only = True, include_bias=False)
poly2_data = poly_2.fit_transform(_X)
poly2_columns = poly_2.get_feature_names(_X.columns)
df_poly2 = pd.DataFrame(poly2_data, columns = poly2_columns)
df_poly2

Unnamed: 0,season,holiday,workingday,weather,temp,humidity,windspeed,season holiday,season workingday,season weather,...,workingday weather,workingday temp,workingday humidity,workingday windspeed,weather temp,weather humidity,weather windspeed,temp humidity,temp windspeed,humidity windspeed
0,1.0,0.0,0.0,1.583333,14.110833,80.583333,10.749871,0.0,0.0,1.583333,...,0.000000,0.000000,0.000000,0.000000,22.342153,127.590278,17.020629,1137.097986,151.689636,866.260425
1,1.0,0.0,0.0,1.956522,14.902609,69.608696,16.652122,0.0,0.0,1.956522,...,0.000000,0.000000,0.000000,0.000000,29.157278,136.190926,32.580238,1037.351153,248.160054,1159.132474
2,1.0,0.0,1.0,1.000000,8.050909,43.727273,16.636709,0.0,1.0,1.000000,...,1.000000,8.050909,43.727273,16.636709,8.050909,43.727273,16.636709,352.044298,133.940632,727.477916
3,1.0,0.0,1.0,1.043478,8.200000,59.043478,10.739809,0.0,1.0,1.043478,...,1.043478,8.200000,59.043478,10.739809,8.556522,61.610586,11.206757,484.156522,88.066431,634.115661
4,1.0,0.0,1.0,1.000000,9.305217,43.695652,12.522300,0.0,1.0,1.000000,...,1.000000,9.305217,43.695652,12.522300,9.305217,43.695652,12.522300,406.597543,116.522724,547.170065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451,4.0,0.0,0.0,1.375000,13.290833,65.041667,7.125450,0.0,0.0,5.500000,...,0.000000,0.000000,0.000000,0.000000,18.274896,89.432292,9.797494,864.457951,94.703168,463.451144
452,4.0,0.0,0.0,2.041667,14.862500,83.875000,6.749692,0.0,0.0,8.166667,...,0.000000,0.000000,0.000000,0.000000,30.344271,171.244792,13.780620,1246.592187,100.317292,566.130389
453,4.0,0.0,1.0,2.125000,16.126667,90.708333,6.583308,0.0,4.0,8.500000,...,2.125000,16.126667,90.708333,6.583308,34.269167,192.755208,13.989530,1462.823056,106.166819,597.160927
454,4.0,0.0,1.0,1.208333,16.844167,66.625000,14.834079,0.0,4.0,4.833333,...,1.208333,16.844167,66.625000,14.834079,20.353368,80.505208,17.924512,1122.242604,249.867702,988.320524


In [29]:
_X = df_poly2.copy()

In [31]:
X_train, X_test, y_train, y_test = train_test_split(_X, _y, random_state=42, test_size=0.2)

In [73]:
selector = SelectKBest(f_regression, k=5)
selector.fit(X_train, y_train)

  corr /= X_norms
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


SelectKBest(k=5, score_func=<function f_regression at 0x7fe0479fee60>)

In [74]:
selected_columns = X_train.columns[selector.get_support()]

In [75]:
lm_kbest = LinearRegression()
lm_kbest = lm_kbest.fit(X_train[selected_columns], y_train)
y_train_kbest = lm_kbest.predict(X_train[selected_columns])
np.sqrt(metrics.mean_squared_error(y_train, y_train_kbest))

1286.7560441662888

In [76]:
y_kbest = lm_kbest.predict(X_test[selected_columns])
np.sqrt(metrics.mean_squared_error(y_test, y_kbest))

1153.5284484015015

### LassoCV

In [100]:
lassoCV_model = LassoCV()
lassoCV_model.fit(X_train, y_train)

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=None,
        selection='cyclic', tol=0.0001, verbose=False)

In [101]:
y_train_lassocv_pred = lassoCV_model.predict(X_train)
y_test_lassocv_pred = lassoCV_model.predict(X_test)

In [102]:
np.sqrt(metrics.mean_squared_error(y_train, y_train_lassocv_pred))

1340.5021901733237

In [103]:
np.sqrt(metrics.mean_squared_error(y_test, y_test_lassocv_pred))

1242.6436107565237

In [104]:
r2 = lassoCV_model.score(X_train, y_train)

In [94]:
r2

0.4885069849319463