# Okinawa visitor prediction using a regression model (Keras and TensorFlow)

### Import Libraries

In [None]:
! pip install seaborn tensorflow keras

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow

### Load the Dataset

In [None]:
# Location of the preprocessed staypoints CSV.
preprocessed_data='/mnt/lv/bidur/OkinawaVisitorPred/data/2019-04_2019-08_GyokuSendo_staypoints.csv'

# Read the CSV and convert its 'date' column to datetime in one chained step.
df = pd.read_csv(preprocessed_data).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
df.head()

### Explore Data

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().T

#### Check for and remove null values, if any

In [None]:
# Print a concise summary of df (index, column dtypes, non-null counts).
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Drop every row that contains at least one missing value, modifying df in place.
df.dropna(inplace=True)

### scatter plot

In [None]:
# Plot car_count against date, one point per row of df.
sns.scatterplot(data=df, x="date", y="car_count")

In [None]:
# Distribution of the target variable car_count.
# sns.distplot is deprecated in seaborn; a density-normalised histogram with a
# KDE overlay via histplot is its documented replacement.
plt.figure(figsize=(5,4))
sns.histplot(data=df, x='car_count', kde=True, stat='density')

### Check correlation among the variables

In [None]:
# Pairwise correlation between the numeric columns.
# df still contains the datetime 'date' column at this point, and pandas >= 2.0
# no longer excludes non-numeric columns by default, so request it explicitly.
df.corr(numeric_only=True)

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
# numeric_only=True keeps the datetime 'date' column out of the computation;
# pandas >= 2.0 no longer excludes non-numeric columns by default.
plt.figure(figsize=(5,4))
sns.heatmap(df.corr(numeric_only=True), annot=True)

#### Drop date as we are not using it for prediction

In [None]:
# 'date' is not used as a model input, so remove that column.
df = df.drop(columns=['date'])
df.columns

# Convert categorical/nominal data into dummy/indicator variables
Categorical variables must be recoded into a series of indicator variables before they can be entered into the regression model. Here the day of the week (Monday: 0 … Sunday: 6) and the month (April: 4 … August: 8) are such variables, so we encode each of them as dummy columns below.

In [None]:
# Target is the car count; the features are every remaining column.
y = df['car_count']
X = df.drop(columns=['car_count'])

In [None]:

# Replace the weekday codes 0-6 with day names, then one-hot encode them.
# The empty prefix and separator make each dummy column carry just the day name.
X['day'] = X['day'].map(dict(enumerate(
    ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
)))
X = pd.get_dummies(X, columns=['day'], prefix='', prefix_sep='')
X.head(7)

#### Now, apply for months

In [None]:
# Distinct month codes present in the data.
df['month'].unique()

In [None]:
# Replace the month codes 1-12 with month names, then one-hot encode them.
# The empty prefix and separator make each dummy column carry just the month name.
X['month'] = X['month'].map(dict(enumerate(
    ['Jan', 'Feb', 'March', 'April', 'May', 'June',
     'July', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec'],
    start=1,
)))
X = pd.get_dummies(X, columns=['month'], prefix='', prefix_sep='')
X.head(2)

# Scaling and Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

### Scaling

*The main idea behind normalization/standardization is always the same. Variables that are measured at different scales do not contribute equally to the model fitting & model learned function and might end up creating a bias. Thus, to deal with this potential problem feature-wise normalization such as MinMax Scaling is usually used prior to model fitting.* [Source:https://bit.ly/2KTulBB]

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Fit the min-max scaler on the training features only, then apply that same
# fitted scaling to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#  Model Creation

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import Dropout
import keras

# Two hidden ReLU layers of 64 units each, followed by a single output unit.
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=[X_test.shape[1]]))
# A Dropout(0.5) layer was tried here; removing it improved results in this example.
model.add(Dense(64, activation='relu'))
model.add(Dense(1))

rmsprop = RMSprop(0.001)

# Optimise mean squared error; also track MAE and MSE as metrics.
model.compile(
    optimizer=rmsprop,
    loss='mean_squared_error',
    metrics=['mean_absolute_error', 'mean_squared_error'],
)

### Training

In [None]:
# Train for 100 epochs with mini-batches of 16 samples.
# NOTE(review): the test split is passed as validation data and is also used
# for the final metrics below, so those metrics are not from a separate,
# untouched hold-out set — consider a dedicated validation split.
model.fit(x=X_train,y=y_train.values,
          validation_data=(X_test,y_test.values),
          batch_size=16,epochs=100) 

# Evaluation 

### Let's check the performance curves

In [None]:
# Tabulate the values recorded in model.history.history and plot every column.
losses = pd.DataFrame(model.history.history)
losses.plot()

### Predictions

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [None]:
# Run the trained model on the scaled test features.
predictions = model.predict(X_test)

In [None]:
# Mean absolute error of the predictions on the test set.
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)

In [None]:
# Mean squared error of the predictions on the test set.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)

In [None]:
from sklearn.metrics import mean_squared_error
# Root-mean-squared error on the test set. The `squared=False` flag of
# mean_squared_error is deprecated in recent scikit-learn releases, so take the
# square root of the MSE explicitly; this works on old and new versions alike.
rms = np.sqrt(mean_squared_error(y_test,predictions))
mae,mse,rms

In [None]:
# Display the model's predictions for the test set.
predictions

In [None]:
# Display the true car_count values of the test set for comparison.
y_test