In [22]:
# Feature Importance
from sklearn import datasets
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.feature_selection import RFE
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import warnings

# Ignore all the warnings
warnings.filterwarnings("ignore")

In [23]:
# Load the hotel bookings dataset.
# The path is a raw string: "\T" and "\h" are not valid escape sequences, so
# the un-prefixed literal only worked by accident and triggers an
# invalid-escape warning on current Python versions.
# NOTE(review): absolute local path; consider a configurable DATA_DIR.
hotel = pd.read_csv(r'D:\Training\hotel_bookings.csv')
print("Whole data")
hotel.head()

Whole data


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [24]:
print("Subset the data")
# Keep the target plus the categorical predictors studied below. Columns are
# selected by name (they are positions 0, 1, 12, 13, 14, 15 of the raw frame)
# so the subset does not silently change if the CSV column order does.
subset_columns = [
    'hotel',
    'is_canceled',
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
]
# .copy() makes `temp` an independent frame, so the in-place encoding done
# later writes to `temp` itself rather than to a view/copy of `hotel`
# (avoids SettingWithCopyWarning and its undefined write-through behavior).
temp = hotel[subset_columns].copy()
temp.head(5)

Subset the data


Unnamed: 0,hotel,is_canceled,meal,country,market_segment,distribution_channel
0,Resort Hotel,0,BB,PRT,Direct,Direct
1,Resort Hotel,0,BB,PRT,Direct,Direct
2,Resort Hotel,0,BB,GBR,Direct,Direct
3,Resort Hotel,0,BB,GBR,Corporate,Corporate
4,Resort Hotel,0,BB,GBR,Online TA,TA/TO


In [25]:
# Label-encode each categorical column of `temp` in place.
# One fitted encoder is kept per column in `encoders`, so integer codes can be
# mapped back later with encoders[col].inverse_transform(...). After the loop
# `label_encoder` is the encoder fitted on the last column (market_segment).
#
# astype(str) turns missing values into the literal string 'nan', which the
# encoder then treats as an ordinary class of its own.
#
# NOTE: this cell is not idempotent. Re-running it on an already-encoded
# `temp` re-encodes the integer codes as strings (sorted lexicographically),
# which changes the codes. Rebuild `temp` before re-running.
encoders = {}
for col in ['hotel', 'meal', 'country', 'distribution_channel', 'market_segment']:
    label_encoder = LabelEncoder()
    temp[col] = label_encoder.fit_transform(temp[col].astype(str))
    encoders[col] = label_encoder

In [26]:
# Preview the first five rows of the encoded subset
temp.head(5)

Unnamed: 0,hotel,is_canceled,meal,country,market_segment,distribution_channel
0,1,0,0,135,3,1
1,1,0,0,135,3,1
2,1,0,0,59,3,1
3,1,0,0,59,2,0
4,1,0,0,59,6,3


In [27]:
# Number of distinct integer codes in the encoded country column
temp["country"].nunique()

178

In [30]:
# Raw (un-encoded) country values, for comparison with the encoded column
hotel["country"].head(5)

0    PRT
1    PRT
2    GBR
3    GBR
4    GBR
Name: country, dtype: object

In [31]:
# Distinct raw country values; missing values are excluded from the count
# (dropna=True is the default, spelled out here for clarity)
hotel["country"].nunique(dropna=True)

177

In [32]:
# Count of missing country values in the raw data
hotel["country"].isna().sum()

488

In [28]:
# Number of distinct values in the target column
temp["is_canceled"].nunique()

2

## Input Features & Output Feature

In [33]:
# Separate the target column from the predictor columns
y = temp['is_canceled']
X = temp.drop(columns=['is_canceled'])

In [34]:
# Show a heading followed by the first five rows of the predictors
print("Input Features", X.head(5), sep="\n")

Input Features
   hotel  meal  country  market_segment  distribution_channel
0      1     0      135               3                     1
1      1     0      135               3                     1
2      1     0       59               3                     1
3      1     0       59               2                     0
4      1     0       59               6                     3


In [35]:
# Show a heading followed by the first five target values
print("Output Label", y.head(5), sep="\n")

Output Label
0    0
1    0
2    0
3    0
4    0
Name: is_canceled, dtype: int64


In [36]:
%%time
# Linear SVM used as the scoring estimator for recursive feature elimination.
# dual=False solves the primal problem, which scikit-learn recommends when
# n_samples > n_features (here roughly 119k rows vs 5 columns). The primal
# solver is also deterministic, whereas the dual solver shuffles the data
# randomly and no random_state was set.
svm = LinearSVC(dual=False)
# RFE repeatedly fits the estimator and discards the feature with the smallest
# coefficient magnitude until n_features_to_select remain.
# NOTE(review): the inputs are unscaled label codes with very different ranges
# (e.g. country spans 0-177, hotel 0-1), so coefficient magnitudes are not
# directly comparable across columns; consider scaling or one-hot encoding
# before trusting the ranking.
rfe = RFE(estimator=svm, n_features_to_select=3)
rfe = rfe.fit(X, y)

Wall time: 1min 16s


In [37]:
# Print a summary of the fitted selector
print("RFE Summary:")
# Boolean mask over the columns of X: True where the feature was kept
print("Support", rfe.support_)
# Rank per column: 1 means selected, larger numbers were eliminated earlier
print("Ranking", rfe.ranking_)
# n_features_ is the count actually selected by the fitted selector, not
# just the value that was requested in the constructor
print("Number of selected Feature: ", rfe.n_features_)
# RFE itself exposes no feature-importance attribute; the estimator refitted
# on the selected features is available as rfe.estimator_ (see its coef_).

RFE Summary:
Support [ True False False  True  True]
Ranking [1 2 3 1 1]
Number of selected Feature:  3


In [38]:
# Report, for each input column, whether RFE kept it and the rank it received
print("Status of Variable")
row_template = 'Column: %d, Column Name: %s, Selected: %s, Rank: %.3f'
for position, (col_name, kept, rank) in enumerate(
        zip(X.columns, rfe.support_, rfe.ranking_)):
    print(row_template % (position, col_name, kept, rank))

Status of Variable
Column: 0, Column Name: hotel, Selected: True, Rank: 1.000
Column: 1, Column Name: meal, Selected: False, Rank: 2.000
Column: 2, Column Name: country, Selected: False, Rank: 3.000
Column: 3, Column Name: market_segment, Selected: True, Rank: 1.000
Column: 4, Column Name: distribution_channel, Selected: True, Rank: 1.000


In [39]:
# Number of bookings per hotel type in the raw data
hotel["hotel"].value_counts()

City Hotel      79330
Resort Hotel    40060
Name: hotel, dtype: int64