In [None]:
import pandas as pd
import numpy as np
import datetime
from datetime import date, timedelta
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
idx=pd.IndexSlice
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
plt.style.use('ggplot')

In [None]:
#Data Cleaning 

In [None]:
data = pd.read_csv('booking_hotel.csv', encoding='latin1')

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data.shape

In [None]:
data.isna().sum()

In [None]:
data.loc[data.isna().any(axis=1)]

In [None]:
data.duplicated().sum()

In [None]:
data.drop_duplicates(inplace=True)

In [None]:
data.duplicated(subset='Hotel Name').sum()

In [None]:
data.drop_duplicates(subset='Hotel Name', inplace=True)

In [None]:
data.dropna(inplace=True)

In [None]:
data.reset_index(drop=True, inplace=True)

In [None]:
data.head()

In [None]:
data.columns

In [None]:
data['Rating'] = data['Rating'].astype('float')

In [None]:
data['Number of   ']=data['Number of   '].str.replace(',', '')

In [None]:
data['Number of   ']=data['Number of   '].str.replace('review', '')

In [None]:
data['Number of   '] = data['Number of   '].astype('float')

In [None]:
data['Room    Score'] = data['Room    Score'].astype('float')

In [None]:
# Strip formatting characters from the price strings so the column can be
# cast to float in the next cell.
price_col = 'Room Price (in BDT or any other currency)'

data[price_col] = (
    data[price_col]
    # '\xa0' (non-raw) is the non-breaking space character itself; with
    # regex=False it is removed regardless of the pandas version's default
    # for the `regex` argument.
    .str.replace('\xa0', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace('  ', '', regex=False)
)

In [None]:
data['Room Price (in BDT or any other currency)'] = data['Room Price (in BDT or any other currency)'].astype(float)

In [None]:
data.rename(columns={
    'Number of   ':'Number of Reviews',
    'Room    Score':'Room Score',
    'Room Price (in BDT or any other currency)':'Room Price'
}, inplace=True)

In [None]:
#Converting BDT to USD
# NOTE: this cell rescales 'Room Price' in place, so running it twice applies
# the rate twice.
bdt_to_usd_rate = 0.0092
data['Room Price'] = (data['Room Price'] * bdt_to_usd_rate).round()

In [None]:
data['Location'] = data.Location.str.split(',').str.get(-1)

In [None]:
data['Location'] = data.Location.str.strip()

In [None]:
#Data analysis

In [None]:
#Top 20 location where hotels are located
# Name both columns explicitly: the column names produced by a bare
# value_counts().reset_index() differ between pandas 1.x and 2.x.
df = (
    data['Location']
    .value_counts()
    .nlargest(20)
    .rename_axis('Location')
    .reset_index(name='Hotel Count')
)
plt.figure(figsize=(12, 6))
sns.barplot(data=df, y='Location', x='Hotel Count')
plt.title('Top 20 Location where Hotels are Located in Data', fontsize=12);

In [None]:
data = data.loc[data['Review Score']!='Review score'].reset_index(drop=True)

In [None]:
data['Review Score'].unique()

In [None]:
#Hotel Review Score Value_counts
# Name both columns explicitly: the column names produced by a bare
# value_counts().reset_index() differ between pandas 1.x and 2.x.
df = (
    data['Review Score']
    .value_counts()
    .rename_axis('Review Score')
    .reset_index(name='Count')
)

fig, ax = plt.subplots()

# Pull out only the first (largest) slice. The list is sized from the data
# because pie() rejects an `explode` whose length differs from `x`.
explode = [0.08 if position == 0 else 0 for position in range(len(df))]

ax.pie(
    x=df['Count'],
    startangle=90,
    labels=df['Review Score'],
    autopct="%.0f%%",
    shadow=True,
    explode=explode,
)
plt.title('Hotel Review Score Value Counts', fontsize=12);

In [None]:
#Histograms for different rating
# One histogram per non-object (i.e. non-text) column.
numeric_columns = [name for name in data.columns if data[name].dtype != 'O']
for name in numeric_columns:
    sns.histplot(data[name])
    plt.show()

In [None]:
# Categorize Bed Type to Bed Count

In [None]:
data['Bed Type'] = data['Bed Type'].str.strip()

In [None]:
data['Bed Type'].unique()

In [None]:
# Bed-type labels treated as a single bed: they start with '1' but contain
# neither '11 beds' nor '1 double'.
bed_type = data['Bed Type']
is_single = (
    bed_type.str.startswith('1')
    & ~bed_type.str.contains('11 beds')
    & ~bed_type.str.contains('1 double')
)
one_bed = data.loc[is_single]['Bed Type'].unique()

one_bed

In [None]:
def bed(data, feature):
    """Map every bed-type description in ``data[feature]`` to a bed-count label.

    The single-bed rule is evaluated inside the function (a string that starts
    with '1' and contains neither '11 beds' nor '1 double'), so the result no
    longer depends on a module-level ``one_bed`` array having been computed in
    an earlier cell.

    Parameters
    ----------
    data : mapping or DataFrame
        Anything where ``data[feature]`` yields an iterable of descriptions.
    feature : hashable
        Key / column name holding the bed-type descriptions.

    Returns
    -------
    list of str
        One label per input value: '1 bed', '2 beds' ... '8 beds', '11 beds',
        or 'unknown' when no rule matches.
    """
    # Prefixes tried, in this order, after the single-bed and '1 double'
    # rules have been ruled out.
    prefix_labels = (
        ('2', '2 beds'),
        ('3', '3 beds'),
        ('4', '4 beds'),
        ('5', '5 beds'),
        ('6', '6 beds'),
        ('7', '7 beds'),
        ('8', '8 beds'),
        ('11', '11 beds'),
    )

    def _label(value):
        text = str(value)
        # Only genuine strings can be single-bed descriptions; other values
        # (e.g. NaN) fall through to the prefix rules on their str() form.
        is_single = (
            isinstance(value, str)
            and text.startswith('1')
            and '11 beds' not in text
            and '1 double' not in text
        )
        if is_single:
            return '1 bed'
        # A description starting with '1 double' is labelled '2 beds'.
        if text.startswith('1 double'):
            return '2 beds'
        for prefix, label in prefix_labels:
            if text.startswith(prefix):
                return label
        return 'unknown'

    return [_label(value) for value in data[feature]]

In [None]:
data['bed'] = bed(data, 'Bed Type')

In [None]:
data.rename(columns={'bed':'Bed Count'}, inplace=True)

In [None]:
# Name both columns explicitly: the column names produced by a bare
# value_counts().reset_index() differ between pandas 1.x and 2.x.
df = (
    data['Bed Count']
    .value_counts()
    .rename_axis('Bed Count')
    .reset_index(name='Hotel Count')
)
plt.figure(figsize=(12, 4))
sns.barplot(data=df, x='Hotel Count', y='Bed Count')
plt.title('Bed Count Distribution in Data', fontsize=12);

In [None]:
#Rating Analysis

In [None]:
sns.histplot(data['Rating'])

In [None]:
for x in ['Location', 'Review Score', 'Bed Count']:
    print('==============')
    print(data.groupby(x)['Rating'].mean().sort_values(ascending=False))
    print('')

In [None]:
for x in ['Review Score', 'Bed Count']:
    df = data.groupby(x)['Rating'].mean().sort_values(ascending=False).reset_index()
    plt.figure(figsize=(18, 4))
    sns.lineplot(data=df, x=x, y='Rating', marker='o')
    plt.title(f'Avg Rating for Different {x}', fontsize=12)

In [None]:
#Percentage of hotels in terms of rating

def rating(data, feature):
    """Bucket every value of ``data[feature]`` into a coarse rating band.

    Bands are '9-10' (both ends inclusive), '8-9', '7-8', '5-7' and '3-5'
    (lower end inclusive, upper end exclusive). Every other value, including
    values above 10 and NaN, is labelled '0-3'.

    Returns a list with one label per input value.
    """
    # Lower bounds from the best band down; the first one a score reaches wins.
    lower_bounds = (
        (9, '9-10'),
        (8, '8-9'),
        (7, '7-8'),
        (5, '5-7'),
        (3, '3-5'),
    )

    def _band(score):
        # NaN and anything above 10 fail this comparison and use the fallback.
        if score <= 10:
            for floor, label in lower_bounds:
                if floor <= score:
                    return label
        return '0-3'

    return [_band(score) for score in data[feature]]


In [None]:
data['Rating Index'] = rating(data, 'Rating')

data['Rating Index'].value_counts()

In [None]:
# Name both columns explicitly: the column names produced by a bare
# value_counts().reset_index() differ between pandas 1.x and 2.x.
df = (
    data['Rating Index']
    .value_counts()
    .rename_axis('Rating Index')
    .reset_index(name='Count')
)

fig, ax = plt.subplots()

# Pull out only the first (largest) slice. The list is sized from the data
# because pie() rejects an `explode` whose length differs from `x`.
explode = [0.08 if position == 0 else 0 for position in range(len(df))]

ax.pie(
    x=df['Count'],
    startangle=90,
    labels=df['Rating Index'],
    autopct="%.0f%%",
    shadow=True,
    explode=explode,
)

plt.title('Hotel Rating Index Distribution', fontsize=12);

#Majority of hotels (70%) have a rating between 8-9

In [None]:
df = data.groupby('Location')['Rating'].mean().sort_values(ascending=False).reset_index()

def avg_rating(data, feature):
    """Bucket every value of ``data[feature]`` into a half-point score band.

    Bands are '9.5-10' (both ends inclusive), then '9.0-9.5', '8.5-9.0',
    '8.0-8.5' and '7.5-8.0' (lower end inclusive, upper end exclusive).

    NOTE: the final branch is a catch-all. Every value that matches no band
    above (anything below 7.5, anything above 10, NaN) is labelled '7.0-7.5'.

    Returns a list with one label per input value.
    """
    blanks = []
    for x in data[feature]:
        if 9.5 <= x <= 10:
            blanks.append('9.5-10')
        elif 9 <= x < 9.5:
            blanks.append('9.0-9.5')
        # Upper bound is 9 to match the label; the previous `< 9.5` only gave
        # the right result because the branch above had already caught 9-9.5.
        elif 8.5 <= x < 9:
            blanks.append('8.5-9.0')
        elif 8 <= x < 8.5:
            blanks.append('8.0-8.5')
        elif 7.5 <= x < 8:
            blanks.append('7.5-8.0')
        else:
            blanks.append('7.0-7.5')
    return blanks

df['Rating Index'] = avg_rating(df, 'Rating')

In [None]:
df

In [None]:
# Count locations per rating band. The counts get their own name so that `df`
# (the per-location table built earlier) is not overwritten and this cell can
# be re-run. Both columns are named explicitly because a bare
# value_counts().reset_index() names them differently in pandas 1.x and 2.x.
rating_index_counts = (
    df['Rating Index']
    .value_counts()
    .rename_axis('Rating Index')
    .reset_index(name='Count')
)
fig, ax = plt.subplots(figsize=(7,7))

# Pull out only the first (largest) slice. The list is sized from the data
# because pie() rejects an `explode` whose length differs from `x`.
explode = [0.08 if position == 0 else 0 for position in range(len(rating_index_counts))]

ax.pie(
    x=rating_index_counts['Count'],
    labels=rating_index_counts['Rating Index'],
    autopct="%.0f%%",
    shadow=True,
    explode=explode,
)

plt.title('Location of Avg Rating Index Distribution', fontsize=12);

#41% of locations in the dataset have an avg rating between 8.0-8.5

In [None]:
#Top 20 locations with highest avg hotel rating
df=data.groupby('Location')['Rating'].mean().sort_values(ascending=False)[:20].reset_index()
plt.figure(figsize=(16, 5))
sns.barplot(data=df, x='Rating', y='Location')
plt.axvline(data.groupby('Location')['Rating'].mean().mean(), ls='--', color='black', alpha=0.3)
plt.legend(['Avg Location Hotel Rating'])
plt.title('Top 20 Location with Highest Avg Hotel Rating');

In [None]:
#Number of Reviews analysis

In [None]:
sns.histplot(data['Number of Reviews'])

In [None]:
#Top 20 Hotels with highest Number of Comments
df=data.loc[:, ["Hotel Name", "Number of Reviews"]].sort_values('Number of Reviews',ascending=False)[:20]
plt.figure(figsize=(16, 8))
sns.barplot(data=df, x='Number of Reviews', y='Hotel Name')
plt.axvline(data['Number of Reviews'].mean(), ls='--', color='black', alpha=0.3)
plt.legend(['Avg Number of Reviews'])
plt.title('Top 20 Hotels with Most Number of Reviews');

In [None]:
for x in ['Location', 'Review Score', 'Bed Count']:
    print('==============')
    print(data.groupby(x)['Number of Reviews'].mean().sort_values(ascending=False))
    print('')

In [None]:
#Number of Reviews Index Distribution in data

In [None]:
def reviews(data, feature):
    """Bucket every value of ``data[feature]`` into a review-count band.

    Bands are '0-100' (both ends inclusive), then '100-1000', '1000-5000' and
    '5000-10000' (upper end exclusive). Every other value, including negative
    numbers and NaN, is labelled '10000+'.

    Returns a list with one label per input value.
    """
    def _band(count):
        # Negative numbers and NaN both fail `count >= 0`.
        if not count >= 0:
            return '10000+'
        if count <= 100:
            return '0-100'
        if count < 1000:
            return '100-1000'
        if count < 5000:
            return '1000-5000'
        if count < 10000:
            return '5000-10000'
        return '10000+'

    return [_band(count) for count in data[feature]]

In [None]:
data['Reviews Index'] = reviews(data, 'Number of Reviews')
data['Reviews Index'].value_counts()

In [None]:
# Name both columns explicitly: the column names produced by a bare
# value_counts().reset_index() differ between pandas 1.x and 2.x.
df = (
    data['Reviews Index']
    .value_counts()
    .rename_axis('Reviews Index')
    .reset_index(name='Count')
)
fig, ax = plt.subplots(figsize=(6,6))

# Pull out only the first (largest) slice. The list is sized from the data
# because pie() rejects an `explode` whose length differs from `x`.
explode = [0.06 if position == 0 else 0 for position in range(len(df))]

ax.pie(
    x=df['Count'],
    labels=df['Reviews Index'],
    autopct="%.0f%%",
    shadow=True,
    explode=explode,
)

plt.title('Number of Reviews Index Distribution in Data', fontsize=12);

In [None]:
df=data.groupby('Location')['Number of Reviews'].mean().sort_values(ascending=False).reset_index()

df['Number of Reviews Index'] = reviews(df, 'Number of Reviews')

df

In [None]:
# Count locations per review-count band. The counts get their own name so
# that `df` (the per-location table built in the previous cell) is not
# overwritten and this cell can be re-run. Both columns are named explicitly
# because a bare value_counts().reset_index() names them differently in
# pandas 1.x and 2.x.
reviews_index_counts = (
    df['Number of Reviews Index']
    .value_counts()
    .rename_axis('Number of Reviews Index')
    .reset_index(name='Count')
)
fig, ax = plt.subplots(figsize=(6,6))

# Pull out only the first (largest) slice. The list is sized from the data
# because pie() rejects an `explode` whose length differs from `x`.
explode = [0.08 if position == 0 else 0 for position in range(len(reviews_index_counts))]

ax.pie(
    x=reviews_index_counts['Count'],
    labels=reviews_index_counts['Number of Reviews Index'],
    autopct="%.0f%%",
    shadow=True,
    explode=explode,
)

plt.title('Location of Avg Number of Reviews Index Distribution', fontsize=12);

In [None]:
for x in ['Review Score', 'Bed Count']:
    df = data.groupby(x)['Number of Reviews'].mean().sort_values(ascending=False).reset_index()
    plt.figure(figsize=(18, 4))
    sns.lineplot(data=df, x=x, y='Number of Reviews', marker='o')
    plt.title(f'Avg Number of Reviews for Different {x}', fontsize=12)

In [None]:
# Room Score Analysis 

In [None]:
sns.histplot(data['Room Score']);

In [None]:
# Average Room Score per group. This section analyses 'Room Score'; the loop
# previously aggregated 'Rating', repeating the output of the Rating section.
for x in ['Location', 'Review Score', 'Bed Count']:
    print('==============')
    print(data.groupby(x)['Room Score'].mean().sort_values(ascending=False))
    print('')

In [None]:
data['Room Score Index'] = avg_rating(data, 'Room Score')
data['Room Score Index'].value_counts()

In [None]:
# Name both columns explicitly: the column names produced by a bare
# value_counts().reset_index() differ between pandas 1.x and 2.x.
df = (
    data['Room Score Index']
    .value_counts()
    .rename_axis('Room Score Index')
    .reset_index(name='Count')
)

fig, ax = plt.subplots()

# Pull out only the first (largest) slice. The list is sized from the data
# because pie() rejects an `explode` whose length differs from `x`.
explode = [0.08 if position == 0 else 0 for position in range(len(df))]

ax.pie(
    x=df['Count'],
    startangle=90,
    labels=df['Room Score Index'],
    autopct="%.0f%%",
    shadow=True,
    explode=explode,
)

plt.title('Hotel Room Score Index Distribution', fontsize=12);

In [None]:
df = data.groupby('Location')['Room Score'].mean().reset_index()
df['Room Score Index'] = avg_rating(df, 'Room Score')
df

In [None]:
# Count locations per room-score band. The counts get their own name so that
# `df` (the per-location table built in the previous cell) is not overwritten
# and this cell can be re-run. Both columns are named explicitly because a
# bare value_counts().reset_index() names them differently in pandas 1.x
# and 2.x.
room_score_index_counts = (
    df['Room Score Index']
    .value_counts()
    .rename_axis('Room Score Index')
    .reset_index(name='Count')
)

fig, ax = plt.subplots()

# Pull out only the first (largest) slice. The list is sized from the data
# because pie() rejects an `explode` whose length differs from `x`.
explode = [0.08 if position == 0 else 0 for position in range(len(room_score_index_counts))]

ax.pie(
    x=room_score_index_counts['Count'],
    startangle=90,
    labels=room_score_index_counts['Room Score Index'],
    autopct="%.0f%%",
    shadow=True,
    explode=explode,
)

plt.title('Location of Avg Room Score Index Distribution', fontsize=12);

#45% of locations have an avg room score between 8.5 - 9.5

In [None]:
for x in ['Review Score', 'Bed Count']:
    df = data.groupby(x)['Room Score'].mean().sort_values(ascending=False).reset_index()
    plt.figure(figsize=(18, 4))
    sns.lineplot(data=df, x=x, y='Room Score', marker='o')
    plt.title(f'Avg Room Score for Different {x}', fontsize=12)

In [None]:
#Room Price analysis

In [None]:
sns.histplot(data['Room Price'])

In [None]:
for x in ['Location', 'Review Score', 'Bed Count']:
    print('==============')
    print(data.groupby(x)['Room Price'].mean().sort_values(ascending=False))
    print('')

In [None]:
#Top 20 Most Expensive Hotels 
df=data.loc[:, ["Hotel Name", "Room Price"]].sort_values('Room Price',ascending=False)[:20]
plt.figure(figsize=(16, 8))
sns.barplot(data=df, x='Room Price', y='Hotel Name')
plt.axvline(data['Room Price'].mean(), ls='--', color='black', alpha=0.3)
plt.legend(['Avg Room Price'])
plt.title('Top 20 Most Expensive Hotels');

In [None]:
for x in ['Location', 'Review Score', 'Bed Count']:
    print('==============')
    print(data.groupby(x)['Room Price'].mean().sort_values(ascending=False))
    print('')

In [None]:
data['Room Price'].max()

In [None]:
data['Room Price'].min()

In [None]:
def price(data, feature):
    """Bucket every value of ``data[feature]`` into a price band.

    '0-100' includes both ends. The remaining bands ('100-1000' up to
    '30000-40000') exclude their upper end. Every other value, including
    negative numbers and NaN, is labelled '40000+'.

    Returns a list with one label per input value.
    """
    # (exclusive upper bound, label) pairs for amounts above 100, ascending.
    upper_bounds = (
        (1000, '100-1000'),
        (5000, '1000-5000'),
        (10000, '5000-10000'),
        (20000, '10000-20000'),
        (30000, '20000-30000'),
        (40000, '30000-40000'),
    )

    labels = []
    for amount in data[feature]:
        if 0 <= amount <= 100:
            labels.append('0-100')
            continue
        # Negative numbers and NaN fail `amount > 100` and keep the fallback.
        label = '40000+'
        if amount > 100:
            for bound, name in upper_bounds:
                if amount < bound:
                    label = name
                    break
        labels.append(label)
    return labels

In [None]:
data['Price Index'] = price(data, 'Room Price')
data['Price Index'].value_counts()

In [None]:
# Name both columns explicitly: the column names produced by a bare
# value_counts().reset_index() differ between pandas 1.x and 2.x.
df = (
    data['Price Index']
    .value_counts()
    .rename_axis('Price Index')
    .reset_index(name='Hotel Count')
)
plt.figure(figsize=(16, 8))

sns.barplot(data=df, x='Price Index', y='Hotel Count')

plt.title('Price Index Distribution in Data', fontsize=12);

In [None]:
# Bar charts are used both above and below because some categories,
# such as '0-100', are too small to be shown in a pie chart

In [None]:
df = data.groupby('Location')['Room Price'].mean().reset_index()
df['Room Price Index'] = price(df, 'Room Price')
df

In [None]:
df['Room Price Index'].value_counts(normalize=True)

In [None]:
# Count locations per price band. The counts get their own name so that `df`
# (the per-location table built earlier) is not overwritten and this cell can
# be re-run. Both columns are named explicitly because a bare
# value_counts().reset_index() names them differently in pandas 1.x and 2.x.
room_price_index_counts = (
    df['Room Price Index']
    .value_counts()
    .rename_axis('Room Price Index')
    .reset_index(name='Location Count')
)
plt.figure(figsize=(16, 8))

sns.barplot(data=room_price_index_counts, x='Room Price Index', y='Location Count')
plt.title('Location of Avg Room Price Index Distribution', fontsize=12);

In [None]:
for x in ['Review Score', 'Bed Count']:
    df = data.groupby(x)['Room Price'].mean().sort_values(ascending=False).reset_index()
    plt.figure(figsize=(18, 4))
    sns.lineplot(data=df, x=x, y='Room Price', marker='o')
    plt.title(f'Avg Room Price for Different {x}', fontsize=12)

In [None]:
# Correlation 

In [None]:
# Correlate numeric columns only; selecting them explicitly keeps the text
# columns out of corr() regardless of the pandas version's numeric_only default.
data.select_dtypes(include='number').corr()

In [None]:
sns.pairplot(data)

In [None]:
# Correlate numeric columns only; selecting them explicitly keeps the text
# columns out of corr() regardless of the pandas version's numeric_only default.
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True);

In [None]:
data