# Final project for CMPE 343-Business Intelligence and Applied Analytics 

### This project uses the Kaggle competition "Restaurant Revenue Prediction": https://www.kaggle.com/c/restaurant-revenue-prediction 


-Agit Çelik


In [1]:
#Importing required libraries
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import numpy as np 
import pandas as pd 

# Load the competition's training set, print its column/dtype summary,
# and preview the first five rows.
TRAIN_CSV_PATH = 'restaurant-revenue-prediction/train.csv'
trainData = pd.read_csv(TRAIN_CSV_PATH)
trainData.info()

trainData.head()

PermissionError: [Errno 13] Permission denied: 'restaurant-revenue-prediction/train.csv'

## Preprocessing & Some Analysis

In [None]:
# Replace "Open Date" with "OpenDays": the number of whole days between each
# restaurant's opening date and the fixed reference date 2015-01-01.
trainData['Open Date'] = pd.to_datetime(trainData['Open Date'], format='%m/%d/%Y')

# A scalar Timestamp broadcasts over the whole column, so no helper frame of
# repeated dates is needed.
referenceDate = pd.Timestamp(year=2015, month=1, day=1)

# `.dt.days` returns the integer day count directly. The previous
# `.astype('timedelta64[D]')` conversion is rejected by pandas >= 2.0, which
# only supports second-or-finer timedelta resolutions.
trainData['OpenDays'] = (referenceDate - trainData['Open Date']).dt.days.astype(int)

# The raw date is no longer needed once the day count exists.
trainData = trainData.drop('Open Date', axis=1)

In [None]:
# Mean revenue per "City Group", shown as a bar chart.
group_revenue = trainData[["City Group", "revenue"]]
cityPerc = group_revenue.groupby("City Group", as_index=False).mean()

sns.barplot(data=cityPerc, x='City Group', y='revenue')

In [None]:
# Ten cities with the highest mean revenue, highest first.
city_revenue = trainData[["City", "revenue"]]
cityPerc = city_revenue.groupby("City", as_index=False).mean()

newDF = cityPerc.sort_values(by="revenue", ascending=False)
sns.barplot(data=newDF.iloc[:10], x='City', y='revenue')

In [None]:
# Ten cities with the lowest mean revenue, lowest first.
city_revenue = trainData[["City", "revenue"]]
cityPerc = city_revenue.groupby("City", as_index=False).mean()
newDF = cityPerc.sort_values(by="revenue", ascending=True)
sns.barplot(data=newDF.iloc[:10], x='City', y='revenue')

In [None]:
# Mean revenue per restaurant "Type", to see which type earns more.
type_revenue = trainData[["Type", "revenue"]]
cityPerc = type_revenue.groupby("Type", as_index=False).mean()
sns.barplot(data=cityPerc, x='Type', y='revenue')

In [None]:
# Mean "OpenDays" per restaurant "Type".
type_open_days = trainData[["Type", "OpenDays"]]
cityPerc = type_open_days.groupby("Type", as_index=False).mean()
sns.barplot(data=cityPerc, x='Type', y='OpenDays')

In [None]:
# Remove the Id and Type columns; they are treated as irrelevant for the
# predictions made below.
trainData = trainData.drop(columns=['Id', 'Type'])

In [None]:
# One-hot encode "City Group". An integer dtype is requested explicitly:
# pandas >= 2.0 otherwise produces bool columns, and mixing bool with numeric
# columns turns `trainData.values` into an object array further down.
citygroupDummy = pd.get_dummies(trainData['City Group'], dtype=int)
trainData = trainData.join(citygroupDummy)

# "City Group" is now represented by its dummy columns; "City" is dropped
# without being encoded.
trainData = trainData.drop(['City Group', 'City'], axis=1)

# Move the target to the last column: later cells slice features and target
# by column position.
tempRev = trainData['revenue']
trainData = trainData.drop('revenue', axis=1).join(tempRev)

In [None]:
# Preview the first 10 rows of the frame after the preprocessing cells above.
trainData.head(10)

# Train and Test Split for the Random-Forest Feature Ranking

In [None]:


# Build the feature matrix and target vector, then hold out 30% of the rows.
# train_test_split is already imported in the first cell.
#
# NOTE(review): the positional slice 1:40 leaves out column 0 of trainData.
# 'Id' was dropped in an earlier cell, so this presumably excludes a real
# feature column -- confirm. The slice is kept unchanged here because
# feat_labels in the feature-ranking cell uses the same 1:40 range and the two
# must stay aligned.
X = trainData.iloc[:, 1:40].values

# Select the target by name so it does not depend on the exact column count.
y = trainData['revenue'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# First 20 target values, in the original row order.
y[:20]

In [None]:
# First 20 target values of the training split, for comparison with y above.
y_train[:20]

# Ranking the features by random-forest importance

In [None]:
# Rank the features by random-forest importance, fitted on the training split.
# Labels use the same 1:40 column slice as the feature matrix X.
feat_labels = trainData.columns[1:40]

# The target is revenue, a regression target, so the importances come from a
# regressor (imported in the first cell). A classifier would treat every
# distinct revenue value as a separate class.
forest = RandomForestRegressor(n_estimators=500,
                               random_state=1)
forest.fit(X_train, y_train)

importances = forest.feature_importances_

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# One line per feature: rank, label padded to 30 characters, importance.
for rank, feature_idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30,
                            feat_labels[feature_idx],
                            importances[feature_idx]))
    
    


In [None]:
# All features: bar chart of importances, most important first.
n_features = X_train.shape[1]
bar_positions = range(n_features)

plt.title('Feature Importance')
plt.bar(bar_positions, importances[indices], align='center')

# Label each bar with its feature name, rotated to stay readable.
plt.xticks(bar_positions, feat_labels[indices], rotation=90)
plt.xlim([-1, n_features])
plt.tight_layout()
plt.show()


In [None]:
# First rows of trainData with the columns reordered by descending importance.
trainData[feat_labels[indices[0:39]]].head()

# Train and test sets built from the 19 most important features

In [None]:
# The model will predict revenue from the highest-ranked features only.
N_BEST_FEATURES = 19

# Keep the top-ranked columns as a DataFrame so column names survive the split.
bestDataFeaturesTrain = trainData[feat_labels[indices[0:N_BEST_FEATURES]]]

# Target selected by name so it does not depend on the exact column count.
y = trainData['revenue'].values

# Same test_size and random_state as the first split.
# train_test_split is already imported in the first cell.
X_trainForModel, X_testForModel, y_trainForModel, y_testForModel = train_test_split(
    bestDataFeaturesTrain, y,
    test_size=0.3,
    random_state=0,
)

X_trainForModel.shape, X_testForModel.shape, y_trainForModel.shape, y_testForModel.shape

# Standardize features by removing the mean and scaling to unit variance

In [None]:
from sklearn.preprocessing import StandardScaler

# Centering, unit-variance scaling and copying are all StandardScaler defaults.
# The scaler learns its statistics from the training split only and is then
# applied unchanged to both splits.
sc = StandardScaler()
sc.fit(X_trainForModel)
X_train_std = sc.transform(X_trainForModel)
X_test_std = sc.transform(X_testForModel)

In [None]:
# First standardized training row.
X_train_std[:1]

# Dimensionality reduction: PCA followed by RBF kernel PCA

In [None]:
from sklearn.decomposition import PCA,KernelPCA

# Project the standardized features onto 2 principal components. The PCA is
# fitted on the training split and then applied unchanged to the test split.
pca = PCA(n_components=2,svd_solver='full')
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
# Bare expression in the middle of the cell: evaluated but not displayed.
pca.explained_variance_ratio_

# RBF kernel PCA is applied to the 2-component PCA output, not to the
# standardized features themselves. Fitted on train, applied to test.
kpca = KernelPCA(kernel="rbf", gamma=1)
X_kpca = kpca.fit_transform(X_train_pca)
X_kpca_test = kpca.transform(X_test_pca)

# Cell output: first PCA-projected training row.
X_train_pca[:1]


In [None]:
# Side-by-side scatter of the first two components of the training data:
# plain PCA on the left, after the RBF kernel PCA on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
pca_axis, kpca_axis = ax

pca_axis.scatter(X_train_pca[:, 0], X_train_pca[:, 1], color='red', marker='o')
pca_axis.set_xlabel('Before RBF')

kpca_axis.scatter(X_kpca[:, 0], X_kpca[:, 1])
kpca_axis.set_yticks([])
kpca_axis.set_xlabel('After RBF')


In [None]:
# First PCA-projected test row.
X_test_pca[:1]

In [None]:
# First rows of the unscaled training features (selected columns only).
X_trainForModel.head()


In [None]:
# First standardized training row, for comparison with the unscaled preview.
X_train_std[:1]

In [None]:
# First rows of the unscaled test features.
X_testForModel.head()

In [None]:
# Same preview as the previous cell (duplicate display of the test features).
X_testForModel.head()


In [None]:
# Target values of the held-out split produced with the selected features.
y_testForModel

# RandomForestRegressor is used to predict "revenues"

In [None]:
# Full revenue column, kept for the inspection cell near the end.
yTrainTemp = trainData['revenue']

# Neither of these two imports is used by this cell; they are retained in case
# other cells rely on the names.
import numpy
from sklearn import linear_model

# Mean squared error is the forest's default split criterion, so it is not
# passed explicitly: the old spelling criterion='mse' was removed in
# scikit-learn 1.2. random_state makes the fitted forest reproducible.
cls = RandomForestRegressor(n_estimators=250, max_depth=30, random_state=0)

cls.fit(X_kpca, y_trainForModel)

# R^2 on the data the forest was fitted on.
scoreOfModel = cls.score(X_kpca, y_trainForModel)
# R^2 on the held-out split, which the forest has not seen.
testScoreOfModel = cls.score(X_kpca_test, y_testForModel)

print("Score is calculated as: ",scoreOfModel)
print("Held-out test score is calculated as: ", testScoreOfModel)

In [None]:
# Predict revenue for the held-out rows with the fitted regressor and show it.
pred = cls.predict(X_kpca_test)
pred

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a default random-forest classifier on the same kernel-PCA features.
# NOTE(review): y_trainForModel holds revenue values, so each distinct revenue
# presumably becomes its own class -- confirm that a classifier is intended.
rclf = RandomForestClassifier()
rclf.fit(X_kpca, y_trainForModel)

In [None]:
from sklearn.metrics import accuracy_score,recall_score,confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score

# Predictions of `rclf` for the held-out rows.
y_pred = rclf.predict(X_kpca_test)
print(len(X_kpca_test))

# accuracy_score only counts exact matches, which says nothing useful about
# predicted revenue amounts, so regression metrics are reported instead.
# They are computed against y_testForModel, the targets that belong to the
# split X_kpca_test was derived from.
rmse = np.sqrt(mean_squared_error(y_testForModel, y_pred))
print("RMSE:", rmse)
print("R^2:", r2_score(y_testForModel, y_pred))


In [None]:
# First 10 targets of the held-out part of the first split.
y_test[:10]

## Effect of the number of estimators on the score

In [None]:
from sklearn.base import clone

# 10, 20, ..., 250. np.arange excludes its stop value, so the stop must be 251
# for 250 itself to be included.
estimators = np.arange(10, 251, 10)
scores = []       # R^2 on the training split
test_scores = []  # R^2 on the held-out split
for n in estimators:
    # Fit a fresh copy each time so the already-fitted `cls` is left untouched.
    # The sweep uses the same kernel-PCA features that `cls` was trained on.
    model = clone(cls).set_params(n_estimators=int(n))
    model.fit(X_kpca, y_trainForModel)
    scores.append(model.score(X_kpca, y_trainForModel))
    test_scores.append(model.score(X_kpca_test, y_testForModel))
plt.title("Effect of n_estimators")
plt.xlabel("n_estimator")
plt.ylabel("score")
plt.plot(estimators, scores, label="train")
plt.plot(estimators, test_scores, label="test")
plt.legend()

# Plot of actual revenues (orange line) and predicted revenues (blue line)

In [None]:

# Pair every prediction with the matching actual revenue. plt.plot draws one
# line per tuple position: predictions first, actual values second.
r = list(zip(pred, y_testForModel))

plt.plot(r)


In [None]:
# First 20 predicted revenues for the held-out rows.
pred[:20]

In [None]:
# First 20 values of the full revenue column (original row order, not the split).
yTrainTemp.head(20)

In [None]:
# For every held-out row print the (actual, predicted) pair followed by the
# signed relative error, (actual - predicted) / actual.
for actual_and_pred in zip(y_testForModel, pred):
    actual, predicted = actual_and_pred
    print(actual_and_pred, (actual - predicted) / actual)
