 # 1.. lets read data..

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
df=pd.read_csv(r'hotel_bookings.csv')

In [None]:
type(df)

In [None]:
df.head(3)

# 2.. lets perform data cleaning..

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df.drop(['agent','company'],axis=1,inplace=True)

In [None]:
df['country'].value_counts().index[0]

In [None]:
# Impute missing country codes with the most frequent country.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the selection df['country'] operates on an intermediate object, which is
# deprecated and leaves df unchanged under pandas Copy-on-Write.
df['country']=df['country'].fillna(df['country'].value_counts().index[0])

In [None]:
df.fillna(0,inplace=True)

In [None]:
df.isnull().sum()

In [None]:
### The data looks dirty: adults, children and babies cannot all be zero at the same time,

### because a booking with zero guests is not possible.

In [None]:
### Visualise Entire Dataframe where adult,children & babies are 0

filter1=(df['children']==0) & (df['adults']==0) & (df['babies']==0)

In [None]:
df[filter1]

In [None]:
# Keep only bookings with at least one guest. Take an explicit copy so that
# `data` is an independent frame rather than a selection derived from `df`.
data=df[~filter1].copy()

In [None]:
data.shape

In [None]:
df.shape

In [None]:
df[df['children']==0]

# 3.. Where do the guests come from ?

In [None]:
## Lets perform Spatial Analysis

In [None]:
data['is_canceled'].unique()

In [None]:
# Share of non-cancelled bookings per country. normalize=True divides by the
# actual number of rows, instead of a hard-coded total that silently goes
# stale whenever the filtering above changes.
data[data['is_canceled']==0]['country'].value_counts(normalize=True)

In [None]:
len(data[data['is_canceled']==0])

In [None]:
country_wise_data=data[data['is_canceled']==0]['country'].value_counts().reset_index()
country_wise_data.columns=['country','no_of_guests']
country_wise_data

In [None]:
##!pip install plotly

In [None]:
##!pip install chart_studio

In [None]:
import plotly
import chart_studio.plotly as py
from plotly.offline import download_plotlyjs ,init_notebook_mode ,plot ,iplot
init_notebook_mode(connected=True)

In [None]:
import plotly.express as px

In [None]:
map_guest=px.choropleth(country_wise_data,
             locations=country_wise_data['country'],
             color=country_wise_data['no_of_guests'],
              hover_name=country_wise_data['country'],
              title='home country of guests'
             )

In [None]:
map_guest.show()

# 4.. How much do guests pay for a room per night ?

In [None]:
# Non-cancelled bookings only. Copy explicitly because a later cell adds the
# 'weekend_or_weekday' column to data2; assigning into a boolean-mask
# selection would otherwise trigger SettingWithCopyWarning.
data2=data[data['is_canceled']==0].copy()

In [None]:
data2.columns

In [None]:
# seaborn boxplot:

plt.figure(figsize=(12,8))
sns.boxplot(x='reserved_room_type',y='adr' ,hue='hotel',data=data2)

plt.title('Price of room types per night and person')
plt.xlabel('room types')
plt.ylabel('price( EUR)')

# 5.. Which are the most busy month ?

In [None]:
data['hotel'].unique()

In [None]:
# Split the non-cancelled bookings by hotel type.
# The resort label is 'Resort Hotel'; filtering on any other string yields an
# empty frame and an empty resort series in the monthly comparison below.
data_resort=data[(data['hotel']=='Resort Hotel') & (data['is_canceled']==0)]
data_city = data[(data['hotel']=='City Hotel') & (data['is_canceled']==0)]

In [None]:
data_resort.head(3)

In [None]:
rush_resort=data_resort['arrival_date_month'].value_counts().reset_index()
rush_resort.columns=['month','no_of_guests']
rush_resort

In [None]:
rush_city=data_city['arrival_date_month'].value_counts().reset_index()
rush_city.columns=['month','no_of_guests']
rush_city

In [None]:
final_rush=rush_resort.merge(rush_city,on='month')

In [None]:
final_rush.columns=['month','no_of_guests_in_resort','no_of_guests_city']

In [None]:
final_rush

In [None]:
!pip install sorted-months-weekdays

## Dependency package needs to be installed
!pip install sort_dataframeby_monthorweek

In [None]:
import sort_dataframeby_monthorweek as sd

In [None]:
final_rush=sd.Sort_Dataframeby_Month(final_rush,'month')

In [None]:
final_rush.columns

In [None]:
px.line(data_frame=final_rush,x='month',y=['no_of_guests_in_resort', 'no_of_guests_city'])

# 6.. which month has highest adr ?

In [None]:
data=sd.Sort_Dataframeby_Month(data,'arrival_date_month')

In [None]:
sns.barplot(x='arrival_date_month',y='adr',data=data ,hue='is_canceled')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
plt.figure(figsize=(12,8))
sns.boxplot(x='arrival_date_month',y='adr',data=data ,hue='is_canceled')
plt.xticks(rotation='vertical')


plt.ylim(0,800)
plt.show()

# 7.. Lets analyse whether bookings were made only for weekdays or for weekends or for both ??

In [None]:
data.columns

In [None]:
### Lets create a relationship table..
pd.crosstab(index=data['stays_in_weekend_nights'],columns=data['stays_in_week_nights'])

In [None]:
## lets define our own function :

def week_function(row):
    """Label a booking by which part of the week the stay covers.

    `row` is any mapping-like object (for example a DataFrame row) exposing
    'stays_in_weekend_nights' and 'stays_in_week_nights'.

    Returns one of 'stay_just_weekend', 'stay_just_weekdays',
    'stay_both_weekdays_weekends' or 'undefined_data' (no nights at all, or
    values that match none of the three cases).
    """
    weekend_nights=row['stays_in_weekend_nights']
    week_nights=row['stays_in_week_nights']

    if weekend_nights>0:
        # At least one weekend night: decide by the weekday count.
        if week_nights==0:
            return 'stay_just_weekend'
        if week_nights>0:
            return 'stay_both_weekdays_weekends'
        return 'undefined_data'

    # No positive weekend count from here on.
    if weekend_nights==0 and week_nights>0:
        return 'stay_just_weekdays'

    return 'undefined_data'

In [None]:
data2['weekend_or_weekday']=data2.apply(week_function,axis=1)

In [None]:
data2.head(2)

In [None]:
data2['weekend_or_weekday'].value_counts()

In [None]:
type(sd)

In [None]:
data2=sd.Sort_Dataframeby_Month(data2,'arrival_date_month')

In [None]:
data2.groupby(['arrival_date_month','weekend_or_weekday']).size()

In [None]:
group_data=data2.groupby(['arrival_date_month','weekend_or_weekday']).size().unstack().reset_index()

In [None]:
sorted_data=sd.Sort_Dataframeby_Month(group_data,'arrival_date_month')

In [None]:
sorted_data.set_index('arrival_date_month',inplace=True)

In [None]:
sorted_data

In [None]:
sorted_data.plot(kind='bar',stacked=True,figsize=(15,10))

# 8.. How to create some more features ..

In [None]:
data2.columns

In [None]:
def family(row):
    """Return 1 when the booking looks like a family, otherwise 0.

    A booking counts as a family when it has at least one adult and at least
    one child or baby. `row` is any mapping-like object (for example a
    DataFrame row) exposing 'adults', 'children' and 'babies'.
    """
    has_adult=row['adults']>0
    has_minor=row['children']>0 or row['babies']>0
    # Logical `and` rather than bitwise `&`: these are scalar truth values,
    # and `&` only worked here because both operands happened to be booleans.
    if has_adult and has_minor:
        return 1
    return 0

In [None]:
data['is_family']=data.apply(family,axis=1)

In [None]:
data['total_customer'] = data['adults'] + data['babies'] + data['children']

In [None]:
data['total_nights']=data['stays_in_week_nights'] + data['stays_in_weekend_nights']

In [None]:
data.head(3)

In [None]:
data.columns

In [None]:
data['deposit_type'].unique()

In [None]:
# Lookup used to turn 'deposit_type' into a 0/1 flag: only 'Non Refund' maps to 1.
# NOTE(review): 'Refundable' maps to 0, the same as 'No Deposit', although the
# derived column is called 'deposit_given' - confirm this is intended.
dict1={'No Deposit':0, 'Non Refund':1, 'Refundable': 0}

In [None]:
data['deposit_given']=data['deposit_type'].map(dict1)

In [None]:
data.columns

In [None]:
data.drop(columns=['adults', 'children', 'babies', 'deposit_type'],axis=1,inplace=True)

In [None]:
data.columns

# 9.. how to apply Feature encoding on data 

In [None]:
data.head(6)

In [None]:
data.dtypes

In [None]:
data.columns

In [None]:
cate_features=[col for col in data.columns if data[col].dtype=='object']

In [None]:
num_features=[col for col in data.columns if data[col].dtype!='object']

In [None]:
num_features

In [None]:
cate_features

In [None]:
# Categorical columns only. Copy explicitly: later cells add a 'cancellation'
# column and overwrite every column of data_cat in place, which on a plain
# column selection raises SettingWithCopyWarning.
data_cat=data[cate_features].copy()

In [None]:
data.groupby(['hotel'])['is_canceled'].mean().to_dict()

In [None]:
import warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
data_cat['cancellation']=data['is_canceled']

In [None]:
data_cat.head()

In [None]:
cols=data_cat.columns

In [None]:
cols=cols[0:-1]

In [None]:
cols

In [None]:
### Mean (target) encoding: replace every category level by the mean of
### 'cancellation' observed for that level.
### NOTE(review): the means are computed over all rows of data_cat, i.e. before
### the train/test split done later in the notebook, so the encoded features
### carry target information from rows that end up in the test set.

for col in cols:
    dict2=data_cat.groupby([col])['cancellation'].mean().to_dict()
    data_cat[col]=data_cat[col].map(dict2)

In [None]:
data_cat.head(3)

# 10.. Handle Outliers 

In [None]:
data[num_features]

In [None]:
dataframe=pd.concat([data_cat,data[num_features]],axis=1)

In [None]:
dataframe.columns

In [None]:
dataframe.drop(['cancellation'],axis=1,inplace=True)

In [None]:
dataframe.head(3)

In [None]:
sns.distplot(dataframe['lead_time'])

In [None]:
def handle_outlier(col):
    """Compress column `col` of the global `dataframe` with ln(1 + x).

    The column is overwritten in place with np.log1p of its values; nothing
    is returned.
    """
    log_scaled=np.log1p(dataframe[col])
    dataframe[col]=log_scaled

In [None]:
handle_outlier('lead_time')

In [None]:
sns.distplot(dataframe['lead_time'])

In [None]:
## adr

In [None]:
sns.distplot(dataframe['adr'])

In [None]:
dataframe[dataframe['adr']<0]

In [None]:
handle_outlier('adr')

In [None]:
dataframe['adr'].isnull().sum()

In [None]:
### Why are there missing values now, when we have already dealt with them above?
### Because 'adr' contains a negative value (-6.38): ln(1+x) is undefined for x < -1, so np.log1p returns NaN for it.
## The logarithm is only defined for positive arguments.

In [None]:
sns.distplot(dataframe['adr'].dropna())

# 11.. Select important Features using Co-relation & univariate analysis..

In [None]:
sns.FacetGrid(data,hue='is_canceled',xlim=(0,500)).map(sns.kdeplot,'lead_time',shade=True).add_legend()

In [None]:
corr=dataframe.corr()

In [None]:
corr

In [None]:
corr['is_canceled'].sort_values(ascending=False)

In [None]:
corr['is_canceled'].sort_values(ascending=False).index

In [None]:
features_to_drop=['reservation_status', 'reservation_status_date','arrival_date_year',
       'arrival_date_week_number', 'stays_in_weekend_nights',
       'arrival_date_day_of_month']

In [None]:
dataframe.drop(features_to_drop,axis=1,inplace=True)

In [None]:
dataframe.shape

# 12.. How to find Important features for model building..

In [None]:
dataframe.head(2)

In [None]:
dataframe.isnull().sum()

In [None]:
dataframe.dropna(inplace=True)

In [None]:
## separate dependent & independent features

In [None]:
x=dataframe.drop('is_canceled',axis=1)

In [None]:
y=dataframe['is_canceled']

In [None]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [None]:
##Lasso(alpha=0.005)
# select a suitable alpha (equivalent of penalty).
# The bigger the alpha the less features that will be selected.


In [None]:
feature_sel_model=SelectFromModel(Lasso(alpha=0.005))

In [None]:
feature_sel_model.fit(x,y)

In [None]:
feature_sel_model.get_support()

In [None]:
cols=x.columns

In [None]:
cols

In [None]:
# collect the names of the features the Lasso-based selector kept

selected_feature=cols[feature_sel_model.get_support()]

In [None]:
selected_feature

In [None]:
x=x[selected_feature]

In [None]:
y

# 13.. Lets build ML model..

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for testing. A fixed random_state makes the split,
# and therefore every score reported below, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split( x, y, test_size=0.25, random_state=42)

In [None]:
X_train.shape

In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
logreg=LogisticRegression()

In [None]:
logreg.fit(X_train,y_train)

In [None]:
pred=logreg.predict(X_test)

In [None]:
pred

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(y_test,pred)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(y_test,pred)

# 14.. How to cross-validate model..

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
score=cross_val_score(logreg,x,y,cv=10)

In [None]:
score

In [None]:
score.mean()

# 15.. playing with multiple algos..

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier



In [None]:
models=[]

models.append(('LogisticRegression',LogisticRegression()))
models.append(('Naive_bayes',GaussianNB()))
models.append(('Random Forest',RandomForestClassifier()))
models.append(('Decision_tree',DecisionTreeClassifier()))
models.append(('KNN',KNeighborsClassifier()))

In [None]:
# Fit every candidate model on the same split and report its confusion
# matrix and accuracy on the held-out test set.
# The metrics are imported once here instead of on every loop iteration.
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

for name,model in models:
    print(name)
    model.fit(X_train,y_train)

    predictions=model.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred). With the arguments swapped
    # the confusion matrix is transposed (rows become predictions), which is
    # inconsistent with the logistic-regression cell earlier in the notebook.
    cm=confusion_matrix(y_test,predictions)
    print(cm)

    acc=accuracy_score(y_test,predictions)
    print(acc)
    print('\n')