In [1]:
#Define dependencies.
import pandas as pd
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
import scipy
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sn
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.cluster import KMeans

In [2]:
#Load the training and test sets from CSV files in the working directory.
#NOTE(review): the saved run stopped on 'test.csv' (FileNotFoundError) - make sure the file is present before re-running.
restaurant_train = pd.read_csv(filepath_or_buffer='train.csv')
restaurant_test = pd.read_csv(filepath_or_buffer='test.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'test.csv'

In [None]:
#List the training-set column names so they can be compared with the ones described on Kaggle.
restaurant_train.columns.to_numpy()

In [None]:
#Preview the first five rows of the training set.
restaurant_train.head(n=5)

In [None]:
#Keep only the city and revenue columns for the per-city revenue comparison.
rev_city = restaurant_train.loc[:, ['City', 'revenue']]

In [None]:
#Alphabetically sorted list of the distinct cities; it drives both the revenue totals and the plot labels.
distinct_cities = rev_city['City'].drop_duplicates()
city = distinct_cities.sort_values().tolist()

In [None]:
#Group the city/revenue rows by city so each city's rows can be fetched by name.
tcity = rev_city.groupby(by='City')

In [None]:
#Total revenue per city, in the same order as `city`.
#Note: this rebinds `rev_city` from the two-column DataFrame to a plain list of totals.
city_totals = []
for city_name in city:
    city_totals.append(sum(tcity.get_group(city_name)['revenue']))
rev_city = city_totals

In [None]:
#Horizontal bar chart of total revenue per city (one bar per entry in `city`).
plt.rcdefaults()
fig, ax = plt.subplots(figsize=(10, 6))
y_pos = np.arange(len(city))

ax.barh(y=y_pos, width=rev_city, color='r')

#Pin one tick per bar, then name each tick after its city.
ax.set_yticks(y_pos)
ax.set_yticklabels(city)
#Flip the axis so the first city in the list is drawn at the top.
ax.invert_yaxis()

ax.set_title('Which cities generates the most revenue?')
ax.set_xlabel('Revenue')
ax.set_ylabel('City')

plt.show()

In [None]:
#Keep only the restaurant type and revenue columns for the per-type comparison.
rev_type = restaurant_train.loc[:, ['Type', 'revenue']]

In [None]:
#Distinct 'Type' codes, in the order they first appear in the data.
rtype = list(rev_type['Type'].unique())

In [None]:
#Group the type/revenue rows by 'Type' so each type's rows can be fetched by code.
ttype = rev_type.groupby(by='Type')

In [None]:
#Total revenue per restaurant type, in the same order as `rtype`.
#Note: this rebinds `rev_type` from the two-column DataFrame to a plain list of totals.
type_totals = []
for type_code in rtype:
    type_totals.append(sum(ttype.get_group(type_code)['revenue']))
rev_type = type_totals

In [None]:
#Store the complete 'Type' names.
#Each code is translated individually so every label stays paired with the revenue
#total computed for that code. The totals follow the order in which the codes first
#appear in the data, which is not guaranteed to be IL, FC, DT.
#Codes without a known label (and labels that were already translated) pass through unchanged.
type_labels = {'IL': 'Inline', 'FC': 'Food Court', 'DT': 'Drive Thru', 'MB': 'Mobile'}
rtype = [type_labels.get(code, code) for code in rtype]

In [None]:
#Horizontal bar chart of total revenue per restaurant type (one bar per entry in `rtype`).
plt.rcdefaults()
fig, ax = plt.subplots(figsize=(9, 3))
y_pos = np.arange(len(rtype))

ax.barh(y=y_pos, width=rev_type, color='r')

#Pin one tick per bar, then name each tick after its restaurant type.
ax.set_yticks(y_pos)
ax.set_yticklabels(rtype)
#Flip the axis so the first type in the list is drawn at the top.
ax.invert_yaxis()

ax.set_title('Which restaurant type generates the most revenue?')
ax.set_xlabel('Revenue')
ax.set_ylabel('Restaurant Type')

plt.show()

In [None]:
#Use the training set for the opening-date exploration.
#`tot` is only another name for restaurant_train: nothing is concatenated here and no copy is made.
tot = restaurant_train

In [None]:
#Split every 'Open Date' once on '/' and keep its first part (with a trailing '/') and its third part.
#NOTE(review): this assumes dates are written month/day/year - confirm against the data.
month, year = [], []
for open_date in tot['Open Date']:
    date_parts = open_date.split('/')
    month.append(date_parts[0] + '/')
    year.append(date_parts[2])

In [None]:
#Build a one-column frame of 'month/year' strings, sorted as text, with a fresh 0..n-1 index.
month_year = [m + y for m, y in zip(month, year)]
odate = (
    pd.DataFrame({'date': month_year})
    .sort_values(by='date')
    .reset_index(drop=True)
)

In [None]:
#Parse 'date' as datetimes so the rows sort chronologically rather than as text,
#then turn it back into a string keeping only the first seven characters (the year-month prefix).
odate['date'] = pd.to_datetime(odate['date'])
odate = odate.sort_values(by='date')
odate = odate.reset_index(drop=True)
odate['date'] = odate['date'].astype(str).str[:7]

In [None]:
#Number of restaurants per date label, ordered by label so the counts line up with the sorted dates.
date_count = list(odate['date'].value_counts().sort_index())

In [None]:
#Keep one row per date label (matching the counts computed previously) and renumber the index.
odate = odate.drop_duplicates()
odate = odate.reset_index(drop=True)

In [None]:
#Attach the per-date counts as a new 'date_count' column.
odate = odate.assign(date_count=date_count)

In [None]:
#Display the trend in a barplot: one bar per date label, height = number of restaurants opened.
plt.figure(figsize=(23,8))
plt.bar(odate['date'],odate['date_count'],color='r')
plt.title('When were the restaurants mostly launched?')
plt.xlabel('Opening date')
plt.ylabel('Number of restaurants')
#rotation must be a number (or 'vertical'/'horizontal'); the string '45' is rejected by current Matplotlib.
plt.xticks(rotation=45)
plt.show()

#### Check whether revenue is normally distributed

In [None]:
#Probability plot of raw revenue against a normal distribution, drawn on the current pyplot figure.
scipy.stats.probplot(x=restaurant_train['revenue'], dist='norm', plot=plt)
plt.show()

In [None]:
#Same probability plot, this time for the natural log of revenue.
scipy.stats.probplot(x=np.log(restaurant_train['revenue']), dist='norm', plot=plt)
plt.show()

#### Data Preparation before using linear regression for training

In [None]:
#'City Group' is the first categorical column to encode numerically; show its distinct values.
restaurant_train.loc[:, 'City Group'].drop_duplicates()

In [None]:
#Encode 'City Group' as a dummy variable in place: 1 for 'Other', 0 for anything else.
#A value that is already 1 stays 1, so re-running this cell keeps the encoding
#instead of collapsing the whole column to 0.
restaurant_train['City Group'] = [
    1 if group in ('Other', 1) else 0
    for group in restaurant_train['City Group'].tolist()
]

In [None]:
#Replace the 'Type' codes with integers: FC -> 1, IL -> 2, DT -> 3.
#Any other value is left exactly as it is.
type_codes = {'FC': 1, 'IL': 2, 'DT': 3}
restaurant_train['Type'] = [type_codes.get(code, code) for code in restaurant_train['Type']]

In [None]:

#'Open Date' is a string, so split it on '/' into three new columns
#('month', 'day', 'year') and convert each of them to integers.
open_date_parts = restaurant_train['Open Date'].str.split('/', expand=True)
restaurant_train[['month', 'day', 'year']] = open_date_parts
for date_part in ('year', 'month', 'day'):
    restaurant_train[date_part] = [int(value) for value in restaurant_train[date_part]]


In [None]:
#Drop the first three columns by position; what is left is the dataset used for the PCA section and beyond.
#NOTE(review): positional and not idempotent - every re-run of this cell removes three more columns.
first_kept_column = 3
restaurant_train = restaurant_train.iloc[:, first_kept_column:]
restaurant_train.head(n=5)

In [None]:
#Work on a deep copy of the test set so the frame as loaded stays untouched.
res_test = restaurant_test.copy(deep=True)

In [None]:
#Shape the test set like the training set (it has no revenue column to handle):
#split 'Open Date' on '/' into 'month', 'day' and 'year' and convert each to integers.
open_date_parts = res_test['Open Date'].str.split('/', expand=True)
res_test[['month', 'day', 'year']] = open_date_parts
for date_part in ('year', 'month', 'day'):
    res_test[date_part] = [int(value) for value in res_test[date_part]]

In [None]:
#Replicate the 'City Group' dummy variable on the test set: 1 for 'Other', 0 for anything else.
#A value that is already 1 stays 1, so re-running this cell keeps the encoding
#instead of collapsing the whole column to 0.
res_test['City Group'] = [
    1 if group in ('Other', 1) else 0
    for group in res_test['City Group'].tolist()
]

In [None]:
#Replicate the 'Type' encoding on the test set: FC -> 1, IL -> 2, DT -> 3.
#Any other value (for example 'MB') is left exactly as it is.
type_codes = {'FC': 1, 'IL': 2, 'DT': 3}
res_test['Type'] = [type_codes.get(code, code) for code in res_test['Type']]

In [None]:
#The test set contains the mobile option ('MB'). Mark those entries as missing (NaN)
#so that they can be found and filled in during the next steps.
res_type = []
for type_value in res_test['Type']:
    if type_value == 'MB':
        res_type.append(np.nan)
    else:
        res_type.append(type_value)

In [None]:
#Put the cleaned values back into the 'Type' column of the main test dataframe.
res_test = res_test.assign(Type=res_type)

In [None]:
#Drop the first three columns by position to get the frame that is sent to the imputer, then preview it.
first_kept_column = 3
knni_test = res_test.iloc[:, first_kept_column:]
knni_test.head(n=5)

In [None]:
#Create the KNNImputer with its default neighbourhood size spelled out.
imputer = KNNImputer(n_neighbors=5)

In [None]:
#Fit the imputer on the test features.
imputer.fit(X=knni_test)

In [None]:
#Fill in the missing values and keep the imputed result.
knni_sol = imputer.transform(X=knni_test)

In [None]:
#Wrap the imputed data in a dataframe again, reusing the original column names.
imputed_columns = list(knni_test.columns)
knni_test = pd.DataFrame(knni_sol, columns=imputed_columns)

In [None]:
#With the mobile (MB) entries imputed, list the distinct 'Type' values
#to check whether they coincide with the ones in the training dataset.
knni_test.loc[:, 'Type'].drop_duplicates()

In [None]:
#Given that the imputed values are floats that can fall between the allowed codes,
#snap each one to the nearest code used in the training set (1, 2 or 3).
#Halves round up (1.5 -> 2, 2.5 -> 3) and the result is clipped to the 1..3 range,
#so no value is left sitting between two codes.
knni_test['Type'] = np.floor(knni_test['Type'] + 0.5).clip(lower=1, upper=3)

In [None]:
#Preview the first five rows of the imputed test features.
knni_test.head(n=5)

In [None]:
#Reload the untouched training CSV (restaurant_train has been reshaped by now).
res_train = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
#Number of distinct cities in the training set (a missing value, if any, counts once).
res_train['City'].nunique(dropna=False)

In [None]:
#Number of distinct cities in the test set (a missing value, if any, counts once).
res_test['City'].nunique(dropna=False)

In [None]:
#Keep an independent copy of the raw revenue to use as the untransformed response variable.
rtrain_y = restaurant_train['revenue'].copy(deep=True)
rtrain_y

In [None]:
#Take the logarithm of the 'revenue' column to generate better results.
#The log is computed from the raw copy in `rtrain_y` rather than from the column itself,
#so re-running this cell never takes the log of values that were already logged.
restaurant_train['revenue'] = np.log(rtrain_y)

In [None]:
#Response variable for the training set, taken from the current 'revenue' column.
rtrain_ylog = restaurant_train.loc[:, 'revenue']
rtrain_ylog

In [None]:
#Update the feature dataset by removing the response column.
#A new frame is bound instead of dropping in place, and a missing column is ignored,
#so re-running this cell does not raise a KeyError.
restaurant_train = restaurant_train.drop(columns=['revenue'], errors='ignore')

In [None]:
#Standardise the features with a StandardScaler fitted on the training features only.
scaler = StandardScaler()
rtrain_x = scaler.fit_transform(restaurant_train)
rtrain_x = pd.DataFrame(rtrain_x, columns = [i for i in restaurant_train.columns.values])

#Scale the test features with the statistics learned from the training set.
#Calling fit_transform here would refit the scaler on the test data and put the
#two sets on different scales, so only transform is used.
rtest_x = scaler.transform(knni_test)
rtest_x = pd.DataFrame(rtest_x, columns = [i for i in knni_test.columns.values])
rtest_x.head()

#### Analysis
#### Principal Component Analysis
Once the data is standardized, we can perform PCA to reduce the number of dimensions. Using the scikit-learn library for Python, the steps of this process are as follows:

In [None]:
#Instantiate the Principal Component Analysis (PCA) with 10 components as our target,
#one instance for the training features and one for the test features.
pca_train = PCA(n_components = 10)
pca_test = PCA(n_components = 10)

#Fit the feature data to the PCA.
pca_train.fit(rtrain_x)
pca_test.fit(rtest_x)

#Let's see how they are weighted: share of variance explained by each component.
#NOTE(review): the two fits produce different component axes; a model trained on
#pca_train components expects test data projected with pca_train as well - confirm intent.
print(pca_train.explained_variance_ratio_)
print(pca_test.explained_variance_ratio_)

In [None]:
#Project both feature sets onto the components learned from the TRAINING data.
#The regression is fitted on these training components, so the test set has to be
#expressed in the same component basis for its predictions to be meaningful.
pca_train_rcomp = pca_train.transform(rtrain_x)
pca_test_rcomp = pca_train.transform(rtest_x)

#Name the columns after however many components the projection produced.
pca_columns = ['PCA '+str(i) for i in range(pca_train_rcomp.shape[1])]

#Generate the dataframes with the main PCA components.
#Note: this rebinds pca_train / pca_test from fitted PCA objects to DataFrames,
#so the PCA fitting cell has to be run again before this cell can be re-run.
pca_train = pd.DataFrame(data=pca_train_rcomp, columns = pca_columns)
pca_test = pd.DataFrame(data=pca_test_rcomp, columns = pca_columns)
pca_test.head()

#### Linear Regression

Arriving at the final portion of the analysis, the revenue will be predicted using linear regression. Different combinations of the response variable and the available features (x) are compared below; the results speak for themselves and hint that the non-normalized, non-PCA method gives the best-performing $R^2$.

In [None]:
## Choice 1: PCA training set and log applied response variable.

In [None]:
#Fit on the PCA components of the training set with log revenue as the response,
#then predict for the PCA components of the test set (predictions are on the log scale).
lr_model = linear_model.LinearRegression().fit(pca_train, rtrain_ylog)
y_pred = lr_model.predict(pca_test)

print(y_pred)
#R^2 measured on the training data itself.
print(lr_model.score(pca_train, rtrain_ylog))

In [None]:
## Choice 2: PCA training set and normal response variable.

In [None]:
#Fit on the PCA components of the training set with raw revenue as the response,
#then predict for the PCA components of the test set.
lr_mod1 = linear_model.LinearRegression().fit(pca_train, rtrain_y)
y_pred1 = lr_mod1.predict(pca_test)

print(y_pred1)
#R^2 measured on the training data itself.
print(lr_mod1.score(pca_train, rtrain_y))

In [None]:
## Choice 3: Regular training set and normal response variable.

In [None]:
#Fit on the unscaled, non-PCA training features with raw revenue as the response,
#then predict for the imputed test features.
lr_mod2 = linear_model.LinearRegression().fit(restaurant_train, rtrain_y)
y_pred2 = lr_mod2.predict(knni_test)

print(y_pred2)
#R^2 measured on the training data itself.
print(lr_mod2.score(restaurant_train, rtrain_y))

In [None]:
#Show every predicted value for the test set as a plain list.
list(y_pred2)