## Importing Stuff

In [29]:
import numpy as np
import pandas as pd
from time import time
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

## Loading the datasets

In [30]:
data_raw = pd.read_csv("datasets/TrainData.csv", index_col="property_id")
data_validate = pd.read_csv("datasets/TestData.csv", index_col="property_id")
data_sample = pd.read_csv("datasets/Test - Sample Solution.csv", index_col="property_id")
data_dict = pd.read_csv("datasets/Data Dictionary.csv")

## Exploring the dataset

In [31]:
data_raw.sample(5)

Unnamed: 0_level_0,property_name,city,tier_city,hotel_star_rating,min_price,max_price,avg_price,rating_location,rating_hospitality,rating_facilities,rating_cleanliness,rating_value_for_money,rating_food,overall_review_count,overall_review_score,site_review_count,site_review_rating,value_segment
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ZRX11622,Elparadiso,Thiruvananthapuram,B1,1,859,1257,1058.0,4.3,2.9,3.1,3.8,3.1,5.0,837,2.53,157,1.4,Normal
ZRX03143,Hotel Ramakrishna,Mysore,B2,1,858,1013,935.5,2.9,4.8,1.8,2.4,3.1,2.7,1297,2.47,164,4.7,Normal
ZRX12866,Hotel Grand Liz,Tiruchirapalli,B2,1,816,1142,979.0,1.2,3.6,1.3,4.7,1.7,3.9,1236,2.6,133,1.4,Standard
ZRX07171,Explore Himalaya Resort,Bhavnagar,B2,1,980,1162,1071.0,4.2,4.1,1.6,4.3,1.3,2.7,787,2.08,250,2.1,Value Plus
ZRX16651,LA O LA BOUTIQUE HOTEL,Jodhpur,B2,1,790,1048,919.0,4.7,2.5,3.1,1.3,2.9,4.4,1642,3.0,365,1.5,Normal


In [32]:
pd.set_option('display.max_colwidth', -1)
data_dict

Unnamed: 0,Column Header,Description
0,property_id,Unique Identifier for each hotel [Character]
1,property_name,Name of the hotel [Character]
2,city,City in which the hotel is present [Character]
3,tier_city,What tier does the city belong to? [Categorical] Note: A1 > A > B1 > B2
4,hotel_star_rating,The star rating of each hotel [Categorical] Note: 0 star-5 star
5,min_price,Minimum tariff for the hotel [Numeric]
6,max_price,Maximum tariff for the hotel [Numeric]
7,avg_price,Average tariff for the hotel [Numeric]
8,rating_location,Average rating (out of 5) for the location of the hotel [Numeric]
9,rating_hospitality,Average rating (out of 5) for the hospitality at the hotel [Numeric]


Right off the bat, value_segment is going to be our labels AKA what our model will predict. 

Then we have some identification details namely the property_id and the property_name.

Since we have the tier of the city, the name of the city itself will not be needed. Same for the name of the hotel.

The remaining data is either categorical and thus can be converted to a numeric data or it is already numeric.

Since the average price is already calculated, we shall ignore the minimum and maximum prices.

In [33]:
data_raw.isnull().sum()

property_name             0
city                      0
tier_city                 0
hotel_star_rating         0
min_price                 0
max_price                 0
avg_price                 0
rating_location           0
rating_hospitality        0
rating_facilities         0
rating_cleanliness        0
rating_value_for_money    0
rating_food               0
overall_review_count      0
overall_review_score      0
site_review_count         0
site_review_rating        0
value_segment             0
dtype: int64

There are no null values in this dataset. 

In [34]:
data_raw.describe(include='all')

Unnamed: 0,property_name,city,tier_city,hotel_star_rating,min_price,max_price,avg_price,rating_location,rating_hospitality,rating_facilities,rating_cleanliness,rating_value_for_money,rating_food,overall_review_count,overall_review_score,site_review_count,site_review_rating,value_segment
count,14991,14991,14991,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991.0,14991
unique,11184,76,4,,,,,,,,,,,,,,,5
top,Hotel President,Gwalior,B2,,,,,,,,,,,,,,,Normal
freq,18,230,6898,,,,,,,,,,,,,,,3804
mean,,,,1.824094,1572.163565,2045.603095,1808.88333,3.008532,2.987092,3.000067,3.025115,2.995477,3.010706,1250.195784,3.001412,316.157961,2.990127,
std,,,,1.090903,976.743578,1405.212498,1181.577229,1.156683,1.153252,1.150769,1.158748,1.159337,1.159539,322.617578,0.398412,204.836797,1.159166,
min,,,,0.0,430.0,611.0,520.5,1.0,1.0,1.0,1.0,1.0,1.0,186.0,1.44,0.0,1.0,
25%,,,,1.0,931.0,1190.0,1054.0,2.0,2.0,2.0,2.0,2.0,2.0,1028.0,2.74,144.0,2.0,
50%,,,,1.0,1323.0,1644.0,1485.0,3.0,3.0,3.0,3.0,3.0,3.0,1250.0,3.0,295.0,3.0,
75%,,,,3.0,1863.0,2280.5,2053.0,4.0,4.0,4.0,4.0,4.0,4.0,1469.0,3.26,460.0,4.0,


In [35]:
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14991 entries, ZRX00001 to ZRX19989
Data columns (total 18 columns):
property_name             14991 non-null object
city                      14991 non-null object
tier_city                 14991 non-null object
hotel_star_rating         14991 non-null int64
min_price                 14991 non-null int64
max_price                 14991 non-null int64
avg_price                 14991 non-null float64
rating_location           14991 non-null float64
rating_hospitality        14991 non-null float64
rating_facilities         14991 non-null float64
rating_cleanliness        14991 non-null float64
rating_value_for_money    14991 non-null float64
rating_food               14991 non-null float64
overall_review_count      14991 non-null int64
overall_review_score      14991 non-null float64
site_review_count         14991 non-null int64
site_review_rating        14991 non-null float64
value_segment             14991 non-null object
dtypes: float64(9)

In [36]:
data_raw['value_segment'].value_counts()

Normal              3804
Value               2993
Standard            2993
Value Plus          2986
Super Value Plus    2215
Name: value_segment, dtype: int64

## Cleaning the data

Since there are no null values, there is no need of imputation. Instead we can simply get rid of the columns we don't need, namely property_name, city, min and max price. We will also conver the A1>A>B1>B2 scale to 3>2>1>0 for the city tier.

In [37]:
# For easier cleaning
data_cleaner = [data_raw, data_validate]

In [38]:
for dataset in data_cleaner:
    dataset['tier_city'].replace('B2', 0, inplace=True)
    dataset['tier_city'].replace('B1', 1, inplace=True)
    dataset['tier_city'].replace('A', 2, inplace=True)
    dataset['tier_city'].replace('A1', 3, inplace=True)
    dataset.drop(['property_name', 'city', 'min_price', 'max_price'], axis=1, inplace=True)

In [39]:
data_cleaner[0].head()

Unnamed: 0_level_0,tier_city,hotel_star_rating,avg_price,rating_location,rating_hospitality,rating_facilities,rating_cleanliness,rating_value_for_money,rating_food,overall_review_count,overall_review_score,site_review_count,site_review_rating,value_segment
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ZRX00001,0,3,2074.5,2.2,2.1,4.9,5.0,1.6,1.4,1254,3.0,304,3.2,Value
ZRX00004,0,1,1008.5,3.0,2.4,2.0,3.4,1.7,4.9,1414,2.42,167,4.8,Value
ZRX00005,0,3,1981.0,1.9,3.1,5.0,2.7,4.5,3.1,1704,3.04,507,2.3,Standard
ZRX00006,0,3,1944.5,4.3,2.1,2.6,3.2,4.9,4.9,1031,2.97,4,4.8,Standard
ZRX00007,0,3,2005.5,4.1,3.2,3.5,4.4,4.0,2.9,1533,3.08,527,3.5,Value


## Splitting data into features and labels and then into training and testing set

In [40]:
data_clean, data_validate = data_cleaner
data_labels = data_clean['value_segment']
data_features = data_clean.drop('value_segment', axis=1)

In [41]:
features_train, features_test, labels_train, labels_test = train_test_split(data_features, data_labels,
                                                                            test_size=0.2, random_state=42)

In [42]:
data_validate.drop('value_segment', axis=1, inplace=True)

## Gaussian Naive-Bayes

In [43]:
nb_classifier = GaussianNB()

In [44]:
t0 = time()
nb_classifier.fit(features_train, labels_train)
print("Training Time: ", time()-t0, "s.", sep='')

Training Time: 0.012069225311279297s.


In [45]:
t1 = time()
nb_prediction = nb_classifier.predict(features_test)
print("Testing Time: ", time()-t1, "s.", sep='')

Testing Time: 0.0020062923431396484s.


In [46]:
print("Accuracy: ", accuracy_score(labels_test, nb_prediction), ".", sep='')

Accuracy: 0.3081027009.


In [47]:
t1 = time()
nb_prediction = nb_classifier.predict(data_validate)
print("Testing Time: ", time()-t1, "s.", sep='')

Testing Time: 0.0029795169830322266s.


In [48]:
data_sample['value_segment'] = nb_prediction
data_sample.to_csv("solutions/nb-solution.csv")
data_sample['value_segment'].value_counts()

Normal              2608
Value Plus          1176
Super Value Plus    800 
Standard            225 
Value               189 
Name: value_segment, dtype: int64

## Decision Tree

In [49]:
dt_classifier = tree.DecisionTreeClassifier(min_samples_split=50)

In [50]:
t0 = time()
dt_classifier.fit(features_train, labels_train)
print("Training Time: ", time()-t0, "s.", sep='')

Training Time: 0.16903972625732422s.


In [51]:
t1 = time()
dt_prediction = dt_classifier.predict(features_test)
print("Testing Time: ", time()-t1, "s.", sep='')

Testing Time: 0.0029227733612060547s.


In [52]:
print("Accuracy: ", accuracy_score(labels_test, dt_prediction), ".", sep='')

Accuracy: 0.565521840614.


In [53]:
t1 = time()
dt_prediction = dt_classifier.predict(data_validate)
print("Testing Time: ", time()-t1, "s.", sep='')

Testing Time: 0.0020058155059814453s.


In [54]:
data_sample['value_segment'] = dt_prediction
data_sample.to_csv("solutions/dt-solution.csv")
data_sample['value_segment'].value_counts()

Normal              1157
Value Plus          1042
Value               1023
Standard            979 
Super Value Plus    797 
Name: value_segment, dtype: int64