In [4]:
import pandas as pd
from datetime import datetime

In [30]:
df = pd.read_csv("train.csv")

### Creating New Features

In [31]:
df["date"] = df.datetime.apply(lambda x : x.split()[0])
df["hour"] = df.datetime.apply(lambda x : x.split()[1].split(":")[0]).astype("int")
df["year"] = df.datetime.apply(lambda x : x.split()[0].split("-")[0])
df["weekday"] = df.date.apply(lambda dateString : datetime.strptime(dateString,"%Y-%m-%d").weekday())
df["month"] = df.date.apply(lambda dateString : datetime.strptime(dateString,"%Y-%m-%d").month)

In [32]:
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,date,hour,year,weekday,month
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011-01-01,0,2011,5,1
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011-01-01,1,2011,5,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011-01-01,2,2011,5,1
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011-01-01,3,2011,5,1
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011-01-01,4,2011,5,1


### Data Imputation

1. Get rid of columns with NA values
2. Impute using mean/median/mode depending on context
3. Build a simple prediction model to impute

In [33]:
from sklearn.ensemble import RandomForestRegressor

dfWind0 = df[df["windspeed"]==0]
dfWindNot0 = df[df["windspeed"]!=0]
rfModel_wind = RandomForestRegressor()
windColumns = ["season","weather","humidity","month","temp","year","atemp"]
rfModel_wind.fit(dfWindNot0[windColumns], dfWindNot0["windspeed"])

wind0Values = rfModel_wind.predict(X= dfWind0[windColumns])
dfWind0["windspeed"] = wind0Values
df = dfWindNot0.append(dfWind0)
df.reset_index(inplace=True)
df.drop('index',inplace=True,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [34]:
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,date,hour,year,weekday,month
0,2011-01-01 05:00:00,1,0,0,2,9.84,12.88,75,6.0032,0,1,1,2011-01-01,5,2011,5,1
1,2011-01-01 10:00:00,1,0,0,1,15.58,19.695,76,16.9979,12,24,36,2011-01-01,10,2011,5,1
2,2011-01-01 11:00:00,1,0,0,1,14.76,16.665,81,19.0012,26,30,56,2011-01-01,11,2011,5,1
3,2011-01-01 12:00:00,1,0,0,1,17.22,21.21,77,19.0012,29,55,84,2011-01-01,12,2011,5,1
4,2011-01-01 13:00:00,1,0,0,2,18.86,22.725,72,19.9995,47,47,94,2011-01-01,13,2011,5,1


### Dealing with non-numerical Data

In [35]:
categoricalFeatureNames = ["season","holiday","workingday","weather","weekday","month","year","hour"]
numericalFeatureNames = ["temp","humidity","windspeed","atemp"]
dropFeatures = ['casual',"count","datetime","date","registered"]
for var in categoricalFeatureNames:
    df[var] = df[var].astype("category")

## Model Training

### Splitting into training and test set

In [60]:
df = df[pd.notnull(df['count'])].sort_values(by=["datetime"])
dfTrain = df[:-2000]
dfTest = df[-2000:]
datetimecol = dfTest["datetime"]
yLabels = dfTrain["count"]
yLablesRegistered = dfTrain["registered"]
yLablesCasual = dfTrain["casual"]

dfTrain  = dfTrain.drop(dropFeatures,axis=1)
# dfTest  = dfTest.drop(dropFeatures,axis=1)

In [61]:
dfTrain.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,hour,year,weekday,month
9573,1,0,0,1,9.84,14.395,81,10.80174,0,2011,5,1
9574,1,0,0,1,9.02,13.635,80,11.30089,1,2011,5,1
9575,1,0,0,1,9.02,13.635,80,11.30089,2,2011,5,1
9576,1,0,0,1,9.84,14.395,75,6.635457,3,2011,5,1
9577,1,0,0,1,9.84,14.395,75,6.635457,4,2011,5,1


In [62]:
dfTest.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,date,hour,year,weekday,month
7841,2012-08-12 15:00:00,3,0,0,1,31.16,34.09,40,8.9981,235,342,577,2012-08-12,15,2012,6,8
7842,2012-08-12 16:00:00,3,0,0,1,31.16,34.09,40,7.0015,213,300,513,2012-08-12,16,2012,6,8
10618,2012-08-12 17:00:00,3,0,0,1,32.8,34.85,33,8.334353,186,319,505,2012-08-12,17,2012,6,8
7843,2012-08-12 18:00:00,3,0,0,1,31.98,34.09,35,7.0015,164,327,491,2012-08-12,18,2012,6,8
7844,2012-08-12 19:00:00,3,0,0,1,30.34,32.575,40,8.9981,148,317,465,2012-08-12,19,2012,6,8


In [64]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import warnings
import numpy as np
pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Initialize logistic regression model
lModel = LinearRegression()

# Train the model
# yLabelsLog = np.log1p(yLabels)
lModel.fit(X = dfTrain,y = yLabels)

# Make predictions
preds = lModel.predict(X= dfTest.drop(dropFeatures,axis=1))
# print ("RMSLE Value For Linear Regression: ",rmsle(np.exp(yLabelsLog),np.exp(preds),False))

In [65]:
preds

array([362.35959752, 368.41617625, 392.31253285, ..., 290.03758514,
       302.53846697, 298.49170528])

In [67]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(dfTest["count"], preds, multioutput='raw_values')

array([140.11214893])

In [71]:
dfTest['count'].iloc[8], preds[8]

(110, 373.6497889253369)

In [50]:
preds[1]

-53.92312549561029