# Build Prediction Model

In [2]:
#import required modules
import pandas as pd
import numpy as np
import os

In [3]:
#set data path
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed')
train_data_path = os.path.join(processed_data_path, 'train_processed.csv')
test_data_path = os.path.join(processed_data_path, 'test_processed.csv')

In [4]:
#read files
df_train = pd.read_csv(train_data_path, index_col='outlet_no')
df_test = pd.read_csv(test_data_path, index_col='outlet_no')

In [5]:
df_train.shape

(344, 92)

In [6]:
df_test.shape

(255, 91)

In [7]:
df_test.head()

Unnamed: 0_level_0,business_type,zip,avg_age,blue_collar,white_collar,female,male,total_household_size,total_household_income,latitude,...,CSR_G,CSR_H,store_location_AT-WORK,store_location_EATING & DRINKING,store_location_EDUCATIONAL,store_location_ENTERTAINMENT/RECREATION/LEISURE,store_location_GROCERY SHOPPING,store_location_OTHER SHOPPING & SERVICES,store_location_THIRD PARTY (NON-CONSUMER),store_location_TRAVEL/TRANSPORTATION/HOSPITALITY
outlet_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,1,77057,33.96,22.42,44.74,50.76,49.24,1.58,74121000,29.74985,...,0,0,1,0,0,0,0,0,0,0
1002,1,57702,37.34,42.41,34.52,50.06,49.94,1.99,45196000,44.079441,...,0,0,1,0,0,0,0,0,0,0
1003,1,72830,36.06,54.47,33.87,50.46,49.54,2.58,47034000,35.482303,...,0,0,1,0,0,0,0,0,0,0
1004,1,57702,37.34,42.41,34.52,50.06,49.94,1.99,45196000,44.082562,...,0,0,0,0,0,0,0,0,1,0
1005,1,72830,33.64,50.7,34.93,51.3,48.7,2.56,45592000,35.472011,...,0,0,0,0,0,1,0,0,0,0


In [8]:
df_train.head()

Unnamed: 0_level_0,total_sales,business_type,zip,avg_age,blue_collar,white_collar,female,male,total_household_size,total_household_income,...,CSR_G,CSR_H,store_location_AT-WORK,store_location_EATING & DRINKING,store_location_EDUCATIONAL,store_location_ENTERTAINMENT/RECREATION/LEISURE,store_location_GROCERY SHOPPING,store_location_OTHER SHOPPING & SERVICES,store_location_THIRD PARTY (NON-CONSUMER),store_location_TRAVEL/TRANSPORTATION/HOSPITALITY
outlet_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
247,4070,1,45804,44.01,43.68,35.41,53.37,46.63,2.24,63860000,...,0,0,0,0,0,0,0,0,0,1
253,4323,4,95687,38.36,41.31,26.48,54.0,46.0,2.54,92764000,...,0,0,1,0,0,0,0,0,0,0
265,2163,4,45219,28.81,17.22,43.06,45.56,54.44,1.9,51027000,...,0,0,0,0,1,0,0,0,0,0
267,528,1,42141,42.89,38.49,44.0,52.41,47.59,2.24,47119000,...,0,0,1,0,0,0,0,0,0,0
276,528,1,42101,38.82,60.82,26.24,54.18,45.82,1.93,47941000,...,1,0,0,1,0,0,0,0,0,0


## split training data for cross validation 

In [9]:
#convert input and output features
X = df_train.loc[:,'business_type':].as_matrix().astype('float')
y = df_train['total_sales'].ravel()

In [10]:
print(X.shape)
print(y.shape)

(344, 91)
(344,)


In [11]:
#split data into 80/20 using train_test_split function
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [12]:
print("train: {0}, {1}".format(X_train.shape, y_train.shape))
print("test: {0}, {1}".format(X_test.shape, y_test.shape))

train: (275, 91), (275,)
test: (69, 91), (69,)


In [13]:
#chack for average survival 
print("Average total_sales for train: {0:.3f}".format(np.mean(y_train)))
print("Average total_sales for test: {0:.3f}".format(np.mean(y_test)))

Average total_sales for train: 18915.025
Average total_sales for test: 21268.652


In [14]:
def get_submission_file(model, filename):
    #test data 
    test_X = df_test.as_matrix().astype('float')
    #prediction on test data 
    predictions = model.predict(test_X)
    #predicted data frames
    df_submission = pd.DataFrame({'outlet_no': df_test.index, 'total_sales': predictions})
    #set external file path
    submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
    submission_file_path = os.path.join(submission_data_path, filename)
    #create csv file for baseline model prediction
    df_submission.to_csv(submission_file_path, index=False)

### develop a base line model

In [15]:
from sklearn.dummy import DummyClassifier

#first create a dummy model
dummy_model = DummyClassifier(strategy='most_frequent', random_state=0)

#train a model
dummy_model.fit(X_train, y_train)

print("score of the baseline model: {0:.3f}".format(dummy_model.score(X_test, y_test)))

#imports performance matrices
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, precision_recall_curve

#predicted model
dummy_predicted_model = dummy_model.predict(X_test)

#accuracy 
print("Accuracy of Baseline model : {0:.3f}".format(accuracy_score(y_test, dummy_predicted_model)))

#confusion metices
print("Confusion Metrices of Baseline model : \n {0}".format(confusion_matrix(y_test, dummy_predicted_model)))

#precision and recall of baseline model
print("precision: {0}".format(precision_score(y_test, dummy_predicted_model, average='micro')))
print("recall: {0}".format(recall_score(y_test, dummy_predicted_model, average='micro')))

score of the baseline model: 0.072
Accuracy of Baseline model : 0.072
Confusion Metrices of Baseline model : 
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
precision: 0.07246376811594203
recall: 0.07246376811594203


In [16]:
get_submission_file(dummy_model, 'dummy_model_prediction.csv')

## Logistic Regression

In [99]:
#import logistic regression
from sklearn.linear_model import LogisticRegression
logisticRg_model = LogisticRegression(random_state = 0)
logisticRg_model.fit(X_train, y_train)


print("score of the Logistic Regression model: {0:.3f}".format(logisticRg_model.score(X_test, y_test)))

score of the Logistic Regression model: 0.043


In [100]:
logistic_predicted_model = logisticRg_model.predict(X_test)

In [101]:
#confusion metices
print("Confusion Metrices of Logistic Regression model : \n {0}".format(confusion_matrix(y_test, logistic_predicted_model)))
#confusion metices
print("Confusion Metrices of Logistic Regression model : \n {0}".format(confusion_matrix(y_test, logistic_predicted_model)))
#precision and recall of baseline model
print("precision: {0}".format(precision_score(y_test, logistic_predicted_model, average='micro')))
print("recall: {0}".format(recall_score(y_test, logistic_predicted_model, average='micro')))

Confusion Metrices of Logistic Regression model : 
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Confusion Metrices of Logistic Regression model : 
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
precision: 0.043478260869565216
recall: 0.043478260869565216


In [102]:
get_submission_file(logisticRg_model, 'logistic_regression_prediction.csv')