## Importing Stuff

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

## Loading the datasets

In [38]:
data_raw = pd.read_csv("datasets/TrainData.csv", index_col="property_id")
data_validate = pd.read_csv("datasets/TestData.csv", index_col="property_id")
data_sample = pd.read_csv("datasets/Test - Sample Solution.csv", index_col="property_id")
data_dict = pd.read_csv("datasets/Data Dictionary.csv")

## Exploring the dataset

In [39]:
data_raw.sample(5)

Unnamed: 0_level_0,property_name,city,tier_city,hotel_star_rating,min_price,max_price,avg_price,rating_location,rating_hospitality,rating_facilities,rating_cleanliness,rating_value_for_money,rating_food,overall_review_count,overall_review_score,site_review_count,site_review_rating,value_segment
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ZRX12212,MARINA DESERT SAFARI,Ahmedabad,A,3,2576,3416,2996.0,2.8,4.0,1.9,1.8,4.6,3.1,1256,2.68,51,3.0,Standard
ZRX06352,Aura Hotel Malad,Kolkata,A1,3,2940,3760,3350.0,4.3,3.4,1.7,2.4,4.7,3.6,1063,2.6,115,2.7,Normal
ZRX05006,Hotel Narula,Jammu,B2,2,1114,1590,1352.0,3.6,2.2,3.0,4.6,1.7,3.6,1503,2.42,587,1.2,Normal
ZRX11525,FabHotel Legacy Brigade Road,Varanasi,B1,3,2267,2962,2614.5,1.2,2.1,1.2,3.8,4.8,3.5,1468,2.69,383,2.2,Super Value Plus
ZRX00386,Hotel Majha,Surat,A,1,1276,1647,1461.5,2.1,4.6,1.7,4.3,2.8,3.1,1361,2.56,59,3.7,Normal


In [40]:
pd.set_option('display.max_colwidth', -1)
data_dict

Unnamed: 0,Column Header,Description
0,property_id,Unique Identifier for each hotel [Character]
1,property_name,Name of the hotel [Character]
2,city,City in which the hotel is present [Character]
3,tier_city,What tier does the city belong to? [Categorical] Note: A1 > A > B1 > B2
4,hotel_star_rating,The star rating of each hotel [Categorical] Note: 0 star-5 star
5,min_price,Minimum tariff for the hotel [Numeric]
6,max_price,Maximum tariff for the hotel [Numeric]
7,avg_price,Average tariff for the hotel [Numeric]
8,rating_location,Average rating (out of 5) for the location of the hotel [Numeric]
9,rating_hospitality,Average rating (out of 5) for the hospitality at the hotel [Numeric]


Right off the bat, value_segment is going to be our labels AKA what our algorithm will predict. 

Then we have some identification details namely the property_id and the property_name.

Since we have the tier of the city, the name of the city itself will not be needed. While the name of the hotel might affect a person's rating, we're gonna have to ignore it because I don't have enough knowledge. :P

The remaining data is either categorical and thus can be converted to a numeric data or it is already numeric.

Since the average price is already calculated, we shall ignore the minimum and maximum prices.

In [41]:
data_raw.isnull().sum()

property_name             0
city                      0
tier_city                 0
hotel_star_rating         0
min_price                 0
max_price                 0
avg_price                 0
rating_location           0
rating_hospitality        0
rating_facilities         0
rating_cleanliness        0
rating_value_for_money    0
rating_food               0
overall_review_count      0
overall_review_score      0
site_review_count         0
site_review_rating        0
value_segment             0
dtype: int64

There are no null values in this dataset. 

In [42]:
data_raw.describe(include='all')

Unnamed: 0,property_name,city,tier_city,hotel_star_rating,min_price,max_price,avg_price,rating_location,rating_hospitality,rating_facilities,rating_cleanliness,rating_value_for_money,rating_food,overall_review_count,overall_review_score,site_review_count,site_review_rating,value_segment
count,14991,14991,14991,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991
unique,11184,76,4,,,,,,,,,,,,,,,5
top,Hotel President,Gwalior,B2,,,,,,,,,,,,,,,Normal
freq,18,230,6898,,,,,,,,,,,,,,,3804
mean,,,,1.824094,1572.163565,2045.603095,1808.88333,3.008532,2.987092,3.000067,3.025115,2.995477,3.010706,1250.195784,3.001412,316.157961,2.990127,
std,,,,1.090903,976.743578,1405.212498,1181.577229,1.156683,1.153252,1.150769,1.158748,1.159337,1.159539,322.617578,0.398412,204.836797,1.159166,
min,,,,0.0,430.0,611.0,520.5,1.0,1.0,1.0,1.0,1.0,1.0,186.0,1.44,0.0,1.0,
25%,,,,1.0,931.0,1190.0,1054.0,2.0,2.0,2.0,2.0,2.0,2.0,1028.0,2.74,144.0,2.0,
50%,,,,1.0,1323.0,1644.0,1485.0,3.0,3.0,3.0,3.0,3.0,3.0,1250.0,3.0,295.0,3.0,
75%,,,,3.0,1863.0,2280.5,2053.0,4.0,4.0,4.0,4.0,4.0,4.0,1469.0,3.26,460.0,4.0,


In [43]:
data_raw['value_segment'].value_counts()

Normal              3804
Value               2993
Standard            2993
Value Plus          2986
Super Value Plus    2215
Name: value_segment, dtype: int64

## Cleaning the data

Since there are no null values, there is no need of imputation. Instead we can simply get rid of the columns we don't need, namely property_name and city and then convert the categorical values into numerical values.

In [44]:
# For easier cleaning
data_cleaner = [data_raw, data_validate]

In [45]:
for dataset in data_cleaner:
    dataset['tier_city'].loc[dataset['tier_city'] == 'B2'] = 0
    dataset['tier_city'].loc[dataset['tier_city'] == 'B1'] = 1
    dataset['tier_city'].loc[dataset['tier_city'] == 'A'] = 2
    dataset['tier_city'].loc[dataset['tier_city'] == 'A1'] = 3
    dataset.drop(['property_name', 'city', 'min_price', 'max_price'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [46]:
data_cleaner[0].sample(5)

Unnamed: 0_level_0,tier_city,hotel_star_rating,avg_price,rating_location,rating_hospitality,rating_facilities,rating_cleanliness,rating_value_for_money,rating_food,overall_review_count,overall_review_score,site_review_count,site_review_rating,value_segment
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ZRX13388,0,3,1904.5,2.4,3.7,4.1,3.4,1.9,2.0,821,2.72,130,4.2,Normal
ZRX05160,2,3,3272.0,1.4,4.5,4.5,4.5,4.8,2.5,913,3.28,397,1.0,Super Value Plus
ZRX12169,0,1,1029.0,3.7,2.3,4.8,1.6,2.3,1.3,1466,3.35,698,3.6,Super Value Plus
ZRX16938,2,1,1446.0,3.0,1.9,4.7,1.7,5.0,4.4,1197,3.14,190,3.7,Standard
ZRX02613,1,1,1096.5,1.1,3.6,4.8,4.0,2.6,1.2,957,3.16,265,1.8,Normal


Average price has a lot of varation. Same for the overall review counts and the site review counts. We shall divide them into blocks instead.

In [47]:
for dataset in data_cleaner:
    dataset['avg_price'] = pd.qcut(dataset['avg_price'], 5, [0, 1, 2, 3, 4])
    dataset['overall_review_count'] = pd.cut(dataset['overall_review_count'], 5, labels=[0, 1, 2, 3, 4])
    dataset['site_review_count'] = pd.cut(dataset['site_review_count'], 7, labels=[0, 1, 2, 3, 4, 5 ,6])

In [48]:
data_cleaner[0].sample(5)

Unnamed: 0_level_0,tier_city,hotel_star_rating,avg_price,rating_location,rating_hospitality,rating_facilities,rating_cleanliness,rating_value_for_money,rating_food,overall_review_count,overall_review_score,site_review_count,site_review_rating,value_segment
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ZRX00303,2,3,4,2.7,1.2,4.3,4.4,3.8,3.2,2,2.27,1,2.8,Standard
ZRX11148,0,1,1,2.1,3.0,3.4,2.8,3.1,4.3,3,2.33,0,4.1,Value Plus
ZRX04888,3,4,4,4.3,2.9,4.8,1.1,3.7,4.1,3,2.96,0,4.7,Normal
ZRX13364,0,1,0,2.7,3.4,1.6,3.4,3.5,4.2,3,3.42,1,5.0,Value Plus
ZRX15662,3,1,3,5.0,2.3,2.5,4.1,1.1,3.0,2,3.28,1,4.8,Value Plus


## Splitting data into features and labels and then into training and testing set

In [49]:
data_clean, data_validate = data_cleaner
data_labels = data_clean['value_segment']
data_features = data_clean.drop('value_segment', axis=1)

In [50]:
features_train, features_test, labels_train, labels_test = train_test_split(data_features, data_labels,
                                                                            test_size=0.2, random_state=42)

## Gaussian Naive-Bayes

In [51]:
nb_classifier = GaussianNB()

In [52]:
t0 = time()
nb_classifier.fit(features_train, labels_train)
print("Training Time: ", time()-t0, "s.", sep='')

Training Time: 0.028064727783203125s.


In [53]:
t1 = time()
nb_prediction = nb_classifier.predict(features_test)
print("Testing Time: ", time()-t1, "s.", sep='')

Testing Time: 0.0030410289764404297s.


In [54]:
print("Accuracy: ", accuracy_score(labels_test, nb_prediction), ".", sep='')

Accuracy: 0.306102034011.


In [55]:
data_validate.drop('value_segment', axis=1, inplace=True)

In [56]:
t1 = time()
nb_prediction = nb_classifier.predict(data_validate)
print("Testing Time: ", time()-t1, "s.", sep='')

Testing Time: 0.006987571716308594s.


In [57]:
data_sample['value_segment'] = nb_prediction
data_sample.to_csv("solutions/nb-solution.csv")
data_sample['value_segment'].value_counts()

Normal              2733
Value Plus          1130
Super Value Plus    825 
Value               208 
Standard            102 
Name: value_segment, dtype: int64

## Support Vector Machine

In [31]:
svm_classifier = svm.SVC(kernel='rbf', C=10000, gamma=10.0)

In [32]:
t0 = time()
svm_classifier.fit(features_train, labels_train)
print("Training Time: ", time()-t0, "s.", sep='')

Training Time: 10.123085975646973s.


In [33]:
t1 = time()
svm_prediction = svm_classifier.predict(features_test)
print("Testing Time: ", time()-t1, "s.", sep='')

Testing Time: 0.9673519134521484s.


In [34]:
print("Accuracy: ", accuracy_score(labels_test, svm_prediction), ".", sep='')

Accuracy: 0.261753917973.


## Decision Tree

In [59]:
dt_classifier = tree.DecisionTreeClassifier(criterion='entropy', min_samples_split=400)

In [60]:
t0 = time()
dt_classifier.fit(features_train, labels_train)
print("Training Time: ", time()-t0, "s.", sep='')

Training Time: 0.057875633239746094s.


In [61]:
t1 = time()
dt_prediction = dt_classifier.predict(features_test)
print("Testing Time: ", time()-t1, "s.", sep='')

Testing Time: 0.0035316944122314453s.


In [62]:
print("Accuracy: ", accuracy_score(labels_test, dt_prediction), ".", sep='')

Accuracy: 0.293097699233.


In [63]:
t1 = time()
dt_prediction = dt_classifier.predict(data_validate)
print("Testing Time: ", time()-t1, "s.", sep='')

Testing Time: 0.0063037872314453125s.


In [66]:
data_sample['value_segment'] = dt_prediction
data_sample.to_csv("solutions/dt-solution.csv")
data_sample['value_segment'].value_counts()

Normal              2287
Super Value Plus    997 
Value Plus          750 
Standard            522 
Value               442 
Name: value_segment, dtype: int64