<a href="https://colab.research.google.com/github/brontominds/bounce_demand/blob/v0.01/DemandPredictionPythonModel_Random_forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import seaborn as sns
from sklearn.utils import shuffle
import matplotlib.pyplot as plt  # For plotting graphs 
from datetime import datetime    # To access datetime 
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.callbacks import ModelCheckpoint,EarlyStopping
from keras.optimizers import Adam,SGD,RMSprop,Adagrad



#Add date time specific columns to the dataframe
def datetime(df):
  from datetime import datetime
  df['datetime'] = pd.to_datetime(df['datetime'])
  df['year'] = df['datetime'].dt.year
  df['month']=df['datetime'].dt.month
  df['day'] = df['datetime'].dt.day 
  df['hour'] = df['datetime'].dt.hour
  df['dayofweek'] = df['datetime'].dt.dayofweek
  return df

#Normalize a column of dataframe and return mean and sd
def normalize(df,feature_name):
  mean_value = df[feature_name].mean()
  std_value = df[feature_name].std()
  df[feature_name] = (df[feature_name] - mean_value) / std_value        
  return (df, mean_value, std_value)


#One hot encode data
def dummy_data(data, columns):
    for column in columns:
        data = pd.concat([data, pd.get_dummies(data[column], prefix=column)], axis=1)
        data = data.drop(column, axis=1)
    return data

#Drop Redundant Columns
def drop_redundantcolumns(data):
  return (data.drop(["casual","registered"], axis=1))  

#Create target variable and convert features dataframe to a numpy array
def create_targetvariable(data, fraction=0.2):
    data_y = data["count"]
    data_x = data.drop(["count"], axis=1)
    return data_x.values, data_y




Using TensorFlow backend.


In [0]:
#PLotting functions
%matplotlib inline
def plotOutliers(df):
  plt.figure(1)
  plt.subplot(121)
  sns.distplot(df); 
  plt.subplot(122)
  df.plot.box(figsize=(16,5)) 
  plt.show()
  
def plotScatter(df1, dfTarget):
  plt.scatter(df1,dfTarget,alpha=0.1,cmap='viridis')
  
def plotCorrelationHeatmap(df):
  matrix = df.corr()
  f, ax = plt.subplots(figsize=(30, 10))
  sns.heatmap(matrix, vmax=.8, square=True, cmap="Greens")
  
def plotCorrelationHeatMap1d(df):
  corrMat = df.corr()
  mask = np.array(corrMat)
  mask[np.tril_indices_from(mask)] = False
  fig, ax= plt.subplots(figsize=(60, 20))
  sns.heatmap(corrMat, mask=mask,vmax=1., square=True,annot=True)
 
  def plotCorrelationwithTarget(df):
    corr_matrix = df.corr()
    corr_target=corr_matrix["count"].sort_values(ascending=False)
    corr_target.plot.bar()
  

In [0]:
#Read Training Data
data=pd.read_csv("train.csv")
data_original=data

#Add new features of date time
data=datetime(data)
data=data.drop('datetime',axis=1)



In [0]:
#Plot graphs if required
#plotOutliers(data['atemp'])
#plotScatter(data['humidity'], data['count'])
#plotCorrelationHeatmap(data)
#plotCorrelationHeatMap1d(data)

In [0]:
#Transform Data
#Normalize weather data
data, mean_temp, sd_temp=normalize(data,["temp"])
data,mean_atemp,sd_atemp=normalize(data,["atemp"])
data,mean_humidity,sd_humidity=normalize(data,["humidity"])
data,mean_windspeed,sd_windspeed=normalize(data,["windspeed"])

#One hot-encode categorical data
data = dummy_data(data, ["season","weather","hour","dayofweek","month"]) #"day" not included because of incomplete data
#TODO: Check which all features require hot encoding, and which work better without hot encoding

In [0]:
#Shuffle data and split into training+validation, and testing
#Training and Validation data kept together as Keras will do the auto split
data=shuffle(data,random_state=1) #Seed=1 applied for ability to repeat same tests with parameters tuning

testDataSplit=0.2 #0.3 means 30% data will be taken out for test and remaining for training+validation
nrows = len(data)
training_validation_rows = int(nrows*(1-testDataSplit))
training_validation_data=data[0:training_validation_rows-1]
test_data=data[training_validation_rows:]
test_data=test_data.drop(['count'],1)

In [0]:
#Drop casual and registered users columns
training_validation_data = drop_redundantcolumns(training_validation_data)

#Separate target variable, and convert features data into a numpy array
data_x, data_y = create_targetvariable(training_validation_data)

In [0]:
#Generate Random Forest Model
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators = 100, random_state = 42)
regressor.fit(data_x, data_y)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [0]:
#Save the trained model to a pickle file
import pickle
from sklearn.externals import joblib

joblib.dump(regressor,'BounceDemandPrediction_Random_forest.pkl')



['BounceDemandPrediction_Random_forest.pkl']

In [0]:
import pandas as pd
lst=[mean_temp[0],mean_humidity[0],mean_windspeed[0],mean_atemp[0],sd_temp[0],sd_humidity[0],sd_windspeed[0],sd_atemp[0]]
 
df = pd.DataFrame([lst],columns =['mean_temp', 'mean_humidity','mean_windspeed','mean_atemp','sd_temp','sd_humidity','sd_windspeed','sd_atemp']) 
df.to_csv('data_mean_std.csv',index=False)

In [0]:
#Define transformation functions
def transform_temp(temp):
  return ((temp-mean_temp[0])/sd_temp[0])

def transform_atemp(atemp):
  return ((atemp-mean_atemp[0])/sd_atemp[0])

def transform_humidity(humidity):
  return ((humidity-mean_humidity[0])/sd_humidity[0])

def transform_windspeed(windspeed):
   return ((windspeed-mean_windspeed[0])/sd_windspeed[0])

In [0]:
#Evaluate on test data

#Drop casual and registered users columns
test_data = drop_redundantcolumns(test_data)



In [0]:
scores=regressor.predict(test_data)

In [0]:
test1_loss=scores[0]
test1_mae=scores[1]
test1_mse=scores[2]

test1_mae

#Evaluate on Test Data Complete

32.13

In [0]:
#Kaggle submission
test=pd.read_csv("test.csv")
test_original=test.copy()

In [0]:
#Transform Test Data
#Add new features of date time
test=datetime(test)
test=test.drop('datetime',axis=1)


In [0]:

#Normalize weather data
test["temp"] = transform_temp(test["temp"])
test["atemp"] = transform_atemp(test["atemp"])
test["humidity"] = transform_humidity(test["humidity"])
test["windspeed"] = transform_windspeed(test["windspeed"])



In [0]:

#test, mean_temp, sd_temp=normalize(test,["temp"])
#test,mean_atemp,sd_atemp=normalize(test,["atemp"])
#test,mean_humidity,sd_humidity=normalize(test,["humidity"])
#test,mean_windspeed,sd_windspeed=normalize(test,["windspeed"])


In [0]:

#One hot-encode categorical data
test = dummy_data(test, ["season","weather","hour","dayofweek","month"]) #"day" not included because of incomplete data


In [0]:
prediction=NN_model.predict(test)

In [0]:
evaluation=test_original['datetime'].to_frame()
evaluation["count"]=prediction
evaluation["count"] = evaluation["count"].astype(int)
evaluation[:10]

In [0]:
evaluation.to_csv("submission_anupam.csv",index=False)

In [0]:

#For debugging only
#print(mean_temp, sd_temp, mean_atemp, sd_atemp, mean_humidity, sd_humidity, mean_windspeed, sd_windspeed)
print(data_x[0:2])
#


In [0]:
#For debugging only
data.describe()