<a href="https://colab.research.google.com/github/bobovnii/progressive_growing_of_gans/blob/master/JDAtask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [81]:
import pandas as pd
import numpy as np
import seaborn as sns                       
import matplotlib.pyplot as plt             
%matplotlib inline     
sns.set(color_codes=True)
#import pickle
from sklearn import ensemble
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
import math
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib



In [0]:
def TestDataset(df_test):
  duplicate_rows_df = df_test[df_test.duplicated()]
  print("number of duplicate rows: ", duplicate_rows_df.shape,"if # of duplicate rows > 0, you may use df.drop_duplicates()")

  #test for nan values
  assert not df_test.isnull().values.any(), 'nan value is found in the input dataset, you may use df.dropna()' 

  #test for coloumns matching
  check = df_test.columns == ['instant', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt']
  assert not (False in check), 'columns names dont match'

  #test for non-numerical values
  assert not (False in df_test.applymap(np.isreal).all(1).values), 'non-numerical values were found, you may use df[~df.applymap(np.isreal).all(1)] to check them' 

  #check for consistency casual+gegistered == cnt
  assert not (False in (df.registered+df.casual == df.cnt).values), 'casual+gegistered != cnt'

  #check for consistency: holiday is alway not workig day
  assert not (False in (df.holiday + df.workingday < 2).values), 'holiday is workig day!!!'



In [0]:
def DataExploration(inputDataset, pathForPictures):
  print('running data exploration')
  df = pd.read_csv(inputDataset)
  df = df.drop(['dteday'], axis=1)

  TestDataset(df)
  DataSetFeatures = df.describe().loc[['min','max', 'mean','50%'],:]
  print("DataSet Features: ", DataSetFeatures)

  #check for anomalies
  #fig=sns.boxplot(x=df['cnt'])
  #fig.savefig('boxplotOfBikesNumber.png')

  df2= df.drop('instant',axis=1)
  fig=df2.groupby('yr').boxplot(fontsize=20,rot=90,figsize=(20,10),patch_artist=True)
  plt.savefig(pathForPictures+'boxplotAllForBothYears.png')
  plt.clf()


  Q1 = df.quantile(0.25)
  Q3 = df.quantile(0.75)
  IQR = Q3 - Q1

  #possible cleaning in case of many anomalies
  #df=df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)].shape

  print('source of many anomalies is windspeed, check boxplot')

  fig=sns.boxplot(x=df['windspeed'])
  plt.savefig(pathForPictures+'boxplotWindspeed.png')
  plt.clf()



  #variable correlations

  plt.figure(figsize=(15,10))
  c= df.corr()
  fig=sns.heatmap(c,cmap="BrBG",annot=True)
  plt.savefig(pathForPictures+'HeatMap.png')
  plt.clf()


  #plot hist and check balance
  plt.figure(figsize=(15,10))
  fig=df.hr.value_counts().nlargest(24).plot(kind='bar', figsize=(10,5))
  plt.title("Number of inputs by hours")
  plt.ylabel('Number of inputs')
  plt.xlabel('hour');
  plt.savefig(pathForPictures+'HoursHist.png')
  plt.clf()

  plt.figure(figsize=(15,10))
  fig=df.weathersit.value_counts().nlargest(24).plot(kind='bar', figsize=(10,5))
  plt.title("Number of inputs by weather")
  plt.ylabel('Number of inputs')
  plt.xlabel('weather');
  plt.savefig(pathForPictures+'WeatherHist.png')
  plt.clf()

  plt.figure(figsize=(15,10))
  df.yr.value_counts().nlargest(24).plot(kind='bar', figsize=(10,5))
  plt.title("year")
  plt.ylabel('Number of inputs')
  plt.xlabel('year');
  plt.savefig(pathForPictures+'YearHist.png')
  plt.clf()

  #scatter plots:

  df2a = df.drop(['cnt','casual','registered','instant'],axis=1)
  df2a.index = df['cnt']
  fig=df2a.plot(subplots=True, style='.', figsize=(10,40))
  plt.legend(loc='best')
  #plt.show()
  plt.savefig(pathForPictures+'AllScatterPlots.png')
  plt.clf()

  #better scatter plots for some importnat variables

  df.temp.head(5)
  t_min= -8
  t_max= +39
  coloumn=(df.temp*(t_max-t_min))+t_min

  fig, ax = plt.subplots(figsize=(10,6))
  ax.scatter(coloumn, df['cnt'])
  ax.set_xlabel('temperature')
  ax.set_ylabel('total bike number')
  plt.savefig(pathForPictures+'ScatterPlotTemperature.png')
  plt.clf()


  fig, ax = plt.subplots(figsize=(10,6))
  ax.scatter(df['weekday'], df['cnt'])
  ax.set_xlabel('weekday')
  plt.xticks((0,1,2,3,4,5,6), ('Sunday','Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'))
  ax.set_ylabel('total bike number')
  plt.savefig(pathForPictures+'ScatterPlotDays.png')
  plt.clf()



In [0]:
def LabelTransformation(y_in):
  y_out=1/(0.1+y_in/y_in.max())
  return y_out

In [0]:
def ReLU(x):
    return x * (x > 0)

In [0]:
def InverseLabelTransformation(y_in, maxValue):
  y_out=ReLU((1/y_in)-0.1)*maxValue
  return y_out

In [0]:
def TrainAndTest(inputDataset, path):
  print('running training and testing')
  df = pd.read_csv(inputDataset)
  df = df.drop(['dteday'], axis=1)

  TestDataset(df)
  #cleaning data, removing redundent info and separting the features from the labels
  feat = df.drop(columns=['cnt','instant','season','casual','registered','cnt','yr'],axis=1)
  label = df['cnt']

  #variable transformation
  sc_x = MinMaxScaler()
  label_norm = LabelTransformation(label)
  feat_norm = sc_x.fit_transform(feat)

  #test and training splitting
  X_train, X_test, y_train, y_test = train_test_split(feat_norm, label_norm, test_size=0.3)

  #training
  Boosting_Regression = ensemble.GradientBoostingRegressor(n_estimators = 600, max_depth = 5, min_samples_split = 2,
          learning_rate = 0.1, loss = 'ls')
  Boosting_Regression.fit(X_train, y_train)

  #testing
  print('score on test dataset: ',Boosting_Regression.score(X_test,y_test))
  print('score on train dataset: ',Boosting_Regression.score(X_train,y_train))


  # save the model to disk
  filename = path+'Boosting_Regression_model.sav'



  #joblib.dump(pipeline, 'transform_predict.joblib')


  pickle.dump(Boosting_Regression, open(filename, 'wb'))

  y_pred=Boosting_Regression.predict(X_test)
  y_pred_train=Boosting_Regression.predict(X_train)

  binwidth=50
  plt.hist(InverseLabelTransformation(y_pred_train,700),bins=range(0, label.max() + binwidth, binwidth), label="train",density='true')
  plt.hist(InverseLabelTransformation(y_pred,700), lw=4,color='r',histtype='step',bins=range(0, label.max() + binwidth, binwidth), label="test",density='true')
  plt.title("Test vs Train model prediction normilized distribution")
  plt.ylabel('a.u.')
  plt.xlabel('Number of bikes');
  plt.legend()

  plt.savefig(path+'TestVsTrainComparison.png')
  plt.clf()

  binwidth=50
  plt.hist(InverseLabelTransformation(y_test,700),bins=range(0, label.max() + binwidth, binwidth), label="real")
  plt.hist(InverseLabelTransformation(y_pred,700), lw=4,color='r',histtype='step',bins=range(0, label.max() + binwidth, binwidth), label="predicted")
  plt.title("Test number of rented bikes")
  plt.ylabel('count')
  plt.xlabel('Number of bikes');
  plt.legend()

  plt.savefig(path+'PredictedVsRealComparison.png')
  plt.clf()

  print('mean absolute error: ',metrics.mean_absolute_error(InverseLabelTransformation(y_test,700), InverseLabelTransformation(y_pred,700)))
  print('median absolute error: ',metrics.median_absolute_error(InverseLabelTransformation(y_test,700), InverseLabelTransformation(y_pred,700))) 	



  return Boosting_Regression






In [0]:
def getPrediction(model, df):
  df = df.drop(['dteday'], axis=1)

  TestDataset(df)
  #cleaning data, removing redundent info and separting the features from the labels
  feat = df.drop(columns=['cnt','instant','season','casual','registered','cnt','yr'],axis=1)

  #variable transformation
  sc_x = MinMaxScaler()
  feat_norm = sc_x.fit_transform(feat)

  #
  y_out=model.predict(feat_norm)
  print(y_out)
  y_out=InverseLabelTransformation(y_out,700)

  return y_out







In [0]:
#for running on google colab
from google.colab import drive
drive.mount('/content/drive')

In [112]:
inputDataset='hour.csv'
# Data axploration and control plots
DataExploration(inputDataset,'drive/My Drive/JDAtask/')
# Training, testing and saving model
TrainAndTest(inputDataset,'drive/My Drive/JDAtask/')

#loading model
filename='drive/My Drive/JDAtask/Boosting_Regression_model.sav'

loaded_model = pickle.load(open(filename, 'rb'))
dataset = pd.read_csv(inputDataset)

#getting prediction

getPrediction(loaded_model,dataset.loc[df.instant == 100])








running data exploration
number of duplicate rows:  (0, 16) if # of duplicate rows > 0, you may use df.drop_duplicates()
DataSet Features:        instant   season        yr  ...      casual  registered         cnt
min       1.0  1.00000  0.000000  ...    0.000000    0.000000    1.000000
max   17379.0  4.00000  1.000000  ...  367.000000  886.000000  977.000000
mean   8690.0  2.50164  0.502561  ...   35.676218  153.786869  189.463088
50%    8690.0  3.00000  1.000000  ...   17.000000  115.000000  142.000000

[4 rows x 16 columns]
source of many anomalies is windspeed, check boxplot
running training and testing
number of duplicate rows:  (0, 16) if # of duplicate rows > 0, you may use df.drop_duplicates()
score on test dataset:  0.9391107951228764
score on train dataset:  0.9673476830121847
mean absolute error:  30.40638556172801
median absolute error:  17.328310789626602
number of duplicate rows:  (0, 16) if # of duplicate rows > 0, you may use df.drop_duplicates()
[8.38101982]


array([13.52205523])

<Figure size 1440x720 with 0 Axes>

<Figure size 1080x720 with 0 Axes>

<Figure size 720x360 with 0 Axes>

<Figure size 720x360 with 0 Axes>

<Figure size 720x360 with 0 Axes>

<Figure size 720x2880 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>