# <div style="text-align: center; background-color: #595964; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">📊EDA |Hotel Price Data | Visualization </div>

<h3 style="text-align: left;background-color: #00BFFF; font-family:Times New Roman; color: white; padding: 14px; line-height: 1; border-radius:10px"> About Dataset📁</h3>

<h4>The Hotel Price dataset contains <mark>10 columns</mark>:</h4>


* <b> <mark>1. Hotel Name</mark></b>
* <b> <mark>2. Rating</mark></b>
* <b> <mark>3.  Rating Description</mark></b>
* <b> <mark>4. Reviews</mark></b>
* <b> <mark>5. Star rating</mark></b>
* <b> <mark>6. Location</mark></b>
* <b> <mark>7. Nearest Landmark</mark></b> 
* <b> <mark>8. Distance to Landmark</mark></b>
* <b> <mark>9. Price </mark></b>
* <b> <mark>10. Tax</mark></b>

<h4>

<a id="1"></a>
# <div style="text-align: center; background-color: #00BFFF; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">1. Import Necessary Libraries</div>

In [None]:
!pip install ydata-profiling

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import missingno as mno
import plotly.offline as pyo 
import plotly.figure_factory as ff
import plotly.io as pio
from wordcloud import WordCloud
color_pal = sns.color_palette()
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever one is installed.
for _style_name in ('seaborn-v0_8-dark-palette', 'seaborn-dark-palette'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.style.use('dark_background')

import nltk
import re

import warnings
warnings.filterwarnings('ignore')
sns.set_theme(style='darkgrid', palette='colorblind')
from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder()

#Model
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
# Load the Bangalore hotel listing CSV into `df`.
# NOTE(review): the path is a Kaggle input mount; adjust it when running elsewhere.
file_path = '/kaggle/input/hotel-price-data-of-cities-in-india-makemytrip/bangalore.csv'
df = pd.read_csv(file_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Keep the column index in `cols` and display it.
cols = df.columns
cols

In [None]:
# (rows, columns) of the raw data.
df.shape

In [None]:
# Data type of every column as read by pandas.
df.dtypes

In [None]:
# Summary statistics of the numeric columns, transposed to one row per column.
df.describe().T

In [None]:
# Summary of the object (string) columns, one row per column.
df.describe(include = 'object').T

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Number of distinct values in each column.
unique_values = df.nunique()
unique_values

# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">3. Null values</div>

In [None]:
# Count of missing values per column.
df.isna().sum()

In [None]:
# Bar chart of the per-column missing-value counts.
df.isna().sum().plot.bar()

In [None]:
# Bar plot of the missing-value indicator per column. With wide-form boolean
# data, barplot's default estimator (the mean) gives the fraction of missing
# values in each column.
plt.figure(figsize=(20,4))
# Pass the frame by keyword: older seaborn releases read a first positional
# argument as `x`, not `data`.
sns.barplot(data=df.isna())
plt.ylabel('Fraction missing')
plt.show()

In [None]:
def _to_float(column):
    """Return `column` as float, stripping thousands separators from strings.

    Columns that are already numeric are only cast, so re-running this cell
    does not fail on the `.str` accessor.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column.astype(float)
    return column.str.replace(',', '', regex=False).astype(float)


# Remove commas from 'Price' and 'Tax' and convert both to float
df['Price'] = _to_float(df['Price'])
df['Tax'] = _to_float(df['Tax'])

In [None]:
# Imputation is written as assignment (df[col] = df[col].fillna(...)) rather
# than `df[col].fillna(..., inplace=True)`: the latter is chained assignment
# and does not update `df` when pandas Copy-on-Write is active.

# Fill missing values in the 'Rating' column with the mode
df['Rating'] = df['Rating'].fillna(df['Rating'].mode()[0])

# Fill missing values in the 'Rating Description' column with 'Unknown'
df['Rating Description'] = df['Rating Description'].fillna('Unknown')

# Fill missing values in the 'Reviews' column with the mode
df['Reviews'] = df['Reviews'].fillna(df['Reviews'].mode()[0])

# Fill missing values in the 'Star Rating' column with the median
median_Star_Rating = df['Star Rating'].median()
df['Star Rating'] = df['Star Rating'].fillna(median_Star_Rating)

# Fill missing values in the 'Tax' column with the mean
mean_Tax = df['Tax'].mean()
df['Tax'] = df['Tax'].fillna(mean_Tax)

# Drop the landmark columns; errors='ignore' keeps the cell re-runnable once
# they are already gone
df = df.drop(columns=['Nearest Landmark', 'Distance to Landmark'], errors='ignore')

# Clean the 'Hotel Name' values by removing commas, forward slashes, hyphens,
# periods, and backslashes. The vectorised .str method leaves missing names
# as NaN instead of raising like re.sub would.
df['Hotel Name'] = df['Hotel Name'].str.replace(r'[,/.\\-]', '', regex=True)

# Display the shape of the DataFrame after changes
print(df.shape)


# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">4. Duplicate rows</div>

In [None]:
# Rows that repeat an earlier row exactly (the first occurrence is not flagged)
duplicate_rows = df.loc[df.duplicated()]
num_duplicates = len(duplicate_rows)

# Report the count, then show the duplicated rows themselves
print(f"Number of duplicate rows: {num_duplicates}")
duplicate_rows

# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">5. Feature engineering</div>

In [None]:
# Replace spaces in column names with underscores; names absent from the
# frame are simply ignored by rename
df = df.rename(columns={
    'Rating Description': 'Rating_Description',
    'Star Rating': 'Star_Rating',
    'Nearest Landmark': 'Nearest_Landmark',
    'Distance to Landmark': 'Distance_to_Landmark',
})
df.head()

In [None]:
# Fold 'Tax' into 'Price' and drop the now-redundant column. The guard makes
# the cell safe to re-run: without it a second run raises KeyError on 'Tax'.
if 'Tax' in df.columns:
    df['Price'] = df['Price'] + df['Tax']
    df = df.drop(columns=['Tax'])

# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">6. Data visualisation</div>

In [None]:
# Import the ProfileReport class from the ydata_profiling library
# (installed by the pip cell at the top of the notebook)
from ydata_profiling import ProfileReport

# Build a profiling report object for the cleaned DataFrame 'df';
# it is rendered in the next cell
profile = ProfileReport(df)

In [None]:
# Render the profiling report inline. (A bare `profile` expression only
# displays when it is the last line of a cell, so it was dead code here.)
profile.to_notebook_iframe()

In [None]:
# Total price (sum over all listings) for every location
price_by_location = df.groupby('Location')['Price'].sum().reset_index()

# Line plot of the summed price per location
fig = px.line(
    price_by_location,  # one row per location
    x='Location',
    y='Price',
    title='Total Price by Location',
    height=800
)

# Display the plot
fig.show()

In [None]:
# Count how many hotels fall into each rating description
Description_grouped = df['Rating_Description'].value_counts()

# Preferred display order. Categories are taken from the data itself so each
# label is always paired with its own count: preferred ones first, any other
# category that occurs afterwards.
R_Description = ['Very Good', 'Excellent', 'Good' ,'Unknown']
ordered_labels = (
    [label for label in R_Description if label in Description_grouped.index]
    + [label for label in Description_grouped.index if label not in R_Description]
)
Description_grouped = Description_grouped.reindex(ordered_labels)

# Bar plot: one bar per rating description, coloured by description
fig = px.bar(
    x=Description_grouped.index,  # rating descriptions
    y=Description_grouped.values,  # number of hotels per description
    color=Description_grouped.index,
    color_discrete_sequence=px.colors.sequential.PuBuGn,
    template='plotly_dark',
    text=Description_grouped.values  # show the counts on the bars
)

# Update the layout and appearance of the plot
fig.update_layout(
    title_text='Rating  Type Distribution',
    height=650,
    xaxis_title="Rating Description",
    yaxis_title="Count",
    font=dict(size=17, family="Franklin Gothic"),
)

# Adjust the width of the bars
fig.update_traces(width=0.5)

# Display the plot
fig.show()


In [None]:
# Bars of Price per Star_Rating, coloured by Star_Rating
fig = px.bar(df, x="Star_Rating", y="Price", color="Star_Rating").update_layout(
    title_text='Price Distribution with Star_Rating',
    height=650,
    xaxis_title="Star_Rating",
    yaxis_title="Price",
    font=dict(size=17, family="Franklin Gothic"),
)

fig.show()


In [None]:
# Bars of Price per Rating, coloured by Star_Rating
fig = px.bar(df, x="Rating", y="Price", color="Star_Rating").update_layout(
    title_text='Price Distribution with Rating',
    height=650,
    xaxis_title="Rating",
    yaxis_title="Price",
    font=dict(size=17, family="Franklin Gothic"),
)

fig.show()


In [None]:
# Select the 10 highest prices; the Series index holds the original row labels
top10rice = df['Price'].sort_values(ascending=False).head(10)

# Use the row labels as strings so Plotly treats them as categories (evenly
# spaced bars, discrete colours) rather than as a numeric axis / colour scale
row_labels = top10rice.index.astype(str)

# Bar plot: one bar per listing, height = price
fig = px.bar(
    x=row_labels,
    y=top10rice.values,
    color=row_labels,
    color_discrete_sequence=px.colors.sequential.PuBuGn,
    text=top10rice.values,  # show the price on each bar
    template='plotly_dark'
)

# Axis titles now describe what is actually plotted
fig.update_layout(
    title_text='Top 10 Prices',
    height=700,
    xaxis_title="Row index",
    yaxis_title="Price",
    font=dict(size=17, family="Franklin Gothic")
)

# Display the plot
fig.show()


In [None]:
# The ten most frequent locations and how many listings each has
top10_Location = df['Location'].value_counts()[:10]

# One bar per location, coloured by location, with the count printed on it
fig = px.bar(
    x=top10_Location.index,
    y=top10_Location.values,
    text=top10_Location.values,
    color=top10_Location.index,
    color_discrete_sequence=px.colors.sequential.PuBuGn,
    template='plotly_dark',
    title='Top 10 Location',
).update_layout(
    title_text='Location Distribution',  # replaces the title set above
    height=700,
    xaxis_title="Location",
    yaxis_title="Count",
    font=dict(size=17, family="Franklin Gothic"),
)

fig.show()


In [None]:
# Notched box plot of Price for each Star_Rating, coloured by Star_Rating
fig = px.box(
    df, x="Star_Rating", y="Price", color="Star_Rating", notched=True
).update_layout(
    title_text='Price Box Distribution',
    height=500,
    xaxis_title="Star_Rating",
    yaxis_title="Price",
    font=dict(size=17, family="Franklin Gothic"),
)

fig.show()


In [None]:
# x-axis: "Hotel Name", y-axis: "Price", color-coded by "Star_Rating"
fig = px.scatter(df, x="Hotel Name", y="Price", color="Star_Rating")

# Update the layout of the plot
fig.update_layout(
    title_text='Price by Hotel',  # the old title had a copy-pasted "Job" in it
    height=1200,
    xaxis_title="Hotel Name",
    yaxis_title="Price",
    font=dict(size=17, family="Franklin Gothic")
)

# Display the plot
fig.show()


In [None]:
# x-axis: "Hotel Name", y-axis: "Reviews", color-coded by "Star_Rating"
fig = px.scatter(df, x="Hotel Name", y="Reviews", color="Star_Rating")

# Update the layout of the plot
fig.update_layout(
    title_text='Reviews by Hotel',  # the old title had a copy-pasted "Job" in it
    height=1200,
    xaxis_title="Hotel Name",
    yaxis_title="Reviews",
    font=dict(size=17, family="Franklin Gothic")
)

# Display the plot
fig.show()


In [None]:
# Histogram of Price split by Star_Rating; every column is shown on hover
fig = px.histogram(
    df, x='Price', color="Star_Rating", hover_data=df.columns
).update_layout(
    title_text='Price Histogram Distribution',
    height=600,
    xaxis_title="Price",
    font=dict(size=17, family="Franklin Gothic"),
)

fig.show()


In [None]:
# Mean price per hotel name, highest first
average_Price = (
    df.groupby('Hotel Name')['Price']
    .mean()
    .reset_index()
    .sort_values(by='Price', ascending=False)
)

# The ten hotel names with the highest mean price
top_10_Price = average_Price.head(10)

# Single bar trace: hotel name on x, mean price on y
fig = go.Figure(data=[
    go.Bar(x=top_10_Price['Hotel Name'], y=top_10_Price['Price'])
])
fig.update_layout(
    title='Top 10 Hotel by Price',
    xaxis_title='Hotel Name',
    yaxis_title='Price',
    template='plotly_white',   # light theme ...
    font=dict(color='black'),  # ... with black text
    height=800,
)

# Enable Plotly's notebook mode, then render the figure inline
pyo.init_notebook_mode(connected=True)
pyo.iplot(fig)


In [None]:
# Mean review count per hotel name, highest first
average_Reviews = (
    df.groupby('Hotel Name')['Reviews']
    .mean()
    .reset_index()
    .sort_values(by='Reviews', ascending=False)
)

# The ten hotel names with the highest mean review count
top_10_Reviews = average_Reviews.head(10)

# Single bar trace: hotel name on x, mean reviews on y
fig = go.Figure(data=[
    go.Bar(x=top_10_Reviews['Hotel Name'], y=top_10_Reviews['Reviews'])
])
fig.update_layout(
    title='Top 10 Hotel by Reviews',
    xaxis_title='Hotel Name',
    yaxis_title='Reviews',
    template='plotly_white',   # light theme ...
    font=dict(color='black'),  # ... with black text
    height=900,
)

# Enable Plotly's notebook mode, then render the figure inline
pyo.init_notebook_mode(connected=True)
pyo.iplot(fig)

In [None]:
# Price for every hotel name; bars are coloured by Price
fig = px.bar(df, x="Hotel Name", y="Price", color="Price").update_layout(
    title_text='Hotel Name Distribution with Price ',
    height=1200,
    xaxis_title="Hotel Name",
    yaxis_title="Price",
    font=dict(size=17, family="Franklin Gothic"),
)

fig.show()

In [None]:
# Rating for every hotel name; bars are coloured by Price
fig = px.bar(df, x="Hotel Name", y="Rating", color="Price").update_layout(
    title_text='Hotel Name Distribution with Rating ',
    height=1200,
    xaxis_title="Hotel Name",
    yaxis_title="Rating",
    font=dict(size=17, family="Franklin Gothic"),
)

fig.show()

In [None]:
# Single-level sunburst: share of listings per Rating_Description
fig2 = px.sunburst(
    df,
    path=['Rating_Description'],
    color_discrete_sequence=px.colors.qualitative.Set3,
).update_layout(title_text='Distribution of Rating_Description', height=500)
fig2.show()

In [None]:
# Single-level sunburst: share of listings per Rating value
fig2 = px.sunburst(
    df,
    path=['Rating'],
    color_discrete_sequence=px.colors.qualitative.Set3,
).update_layout(title_text='Distribution of Rating', height=500)
fig2.show()

In [None]:
# Pairwise relationship grid produced by seaborn for `df`.
sns.pairplot(df)

In [None]:
def Freq_df(cleanwordlist):
    """Count how often each term occurs and return the counts as a DataFrame.

    Parameters
    ----------
    cleanwordlist : iterable of hashable
        The terms to count (e.g. a list of strings).

    Returns
    -------
    pandas.DataFrame
        Columns 'Term' and 'Frequency', sorted by 'Frequency' in descending
        order with a fresh 0..n-1 index. An empty input gives an empty frame
        with the same two columns instead of raising.
    """
    # Local import: a plain Counter is enough for a frequency count, and it
    # keeps this helper independent of nltk.
    from collections import Counter

    counts = Counter(cleanwordlist)
    df_freq = pd.DataFrame(
        {'Term': list(counts.keys()), 'Frequency': list(counts.values())}
    ).astype({'Frequency': 'int64'})
    df_freq = df_freq.sort_values(by=['Frequency'], ascending=False)
    return df_freq.reset_index(drop=True)

def Word_Cloud(data, color_background, colormap, title):
    """Draw a word cloud for a term -> frequency mapping and show it.

    Parameters
    ----------
    data : mapping
        Passed to ``WordCloud.generate_from_frequencies``.
    color_background : str
        Background colour of the cloud image.
    colormap : str
        Colormap name used to colour the words.
    title : str
        Title drawn above the image.
    """
    cloud_options = dict(
        width=1200,
        height=600,
        max_words=50,
        max_font_size=100,
        random_state=88,  # fixed seed so the layout is repeatable
        colormap=colormap,
        background_color=color_background,
    )

    plt.figure(figsize=(20, 15))
    cloud = WordCloud(**cloud_options).generate_from_frequencies(data)

    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title, fontsize=20)
    plt.axis('off')  # the image needs no axes
    plt.show()

In [None]:
# Frequency of every location, as a {term: frequency} mapping for the cloud
freq_df = Freq_df(df['Location'].values.tolist())
data = freq_df.set_index('Term').to_dict()['Frequency']

# The cloud is built from 'Location', so the title says so
Word_Cloud(data ,'black','RdBu', 'WordCloud of Location')

In [None]:
# --- Box plot of Price ---
fig1 = px.box(
    y=df['Price'], template='plotly_dark', title='Price (BoxPlot)'
).update_layout(font=dict(size=17, family="Franklin Gothic"))
fig1.show()

# --- Distribution (KDE) plot of Price ---
# create_distplot expects a list of samples and a matching list of labels
hist_data = [df['Price']]
group_labels = ['Price']

fig2 = ff.create_distplot(hist_data, group_labels, show_hist=False).update_layout(
    template='plotly_dark',
    title='Price (DistPlot)',
    font=dict(size=17, family="Franklin Gothic"),
)
fig2.show()


# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">7. Categorical</div>

In [None]:
# Select columns with object (categorical) data types.
# NOTE: despite its name, `num_cols` holds the categorical column names.
num_cols = df.select_dtypes(include='object').columns.tolist()

# Initialize the LabelEncoder
le = LabelEncoder()

# Replace each categorical column with integer codes. The same encoder is
# refit for every column, so afterwards `le` only holds the mapping of the
# last column processed.
for x in num_cols:  
    df[x] = le.fit_transform(df[x])

# Every object column has now been replaced by its integer codes
df.head()

# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">8. Correlation Matrix</div>

In [None]:
# Pairwise correlation between all (now numeric) columns
Corr_Matrix = df.corr()

# Annotated heatmap on an explicit 10x10 axes, colours centred on zero
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 10))
sns.heatmap(Corr_Matrix, annot=True, cmap='coolwarm', center=0, ax=heatmap_ax)
plt.show()

In [None]:
# Price's correlation with itself is always 1, so exclude it from the ranking
print('Top 5 Most Positively Correlated to Price')
Corr_Matrix['Price'].drop('Price').sort_values(ascending=False).head(5)

In [None]:
# The five smallest correlation coefficients with Price (ascending sort)
print('Top 5 Most Negatively Correlated to Price ')
Corr_Matrix['Price'].sort_values(ascending=True).head(5)

# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">9. Splitting the Dataset

</div>


In [None]:
# Target is Price; features are all remaining columns except Rating_Description
y = df['Price']
X = df.drop(columns=['Price','Rating_Description'])

# 80/20 split with a fixed seed so the partition is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Display the shapes of the resulting datasets
for split_name, split_part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split_part.shape)

# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">10. Model Building and Analysis

</div>

In [None]:
# Candidate regressors, all seeded for repeatable results
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}
best_model = None
# R2 can be negative, so start below every possible score; starting at 0
# would leave `best_model` as None if no model beats a constant predictor.
best_r2 = float('-inf')

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model on the held-out set
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Actual vs. predicted price for a quick visual check
    submit = pd.DataFrame()
    submit['Actual_price'] = y_test
    submit['Predict_price'] = y_pred
    submit = submit.reset_index()

    # Remember the class name of the best-scoring model so far
    if r2 > best_r2:
        best_r2 = r2
        best_model = model.__class__.__name__

    print(f'{model_name}:')
    print(f'R2 Score: {r2:.2f}')
    print(f'Mean Absolute Error (MAE): {mae:.2f}')
    print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
    print(submit.head(5))

    print('----------------------------------------')
print(f"The best performing model is: {best_model} with R2 score: {best_r2:.2f}")


# <div style="text-align: center; background-color: #6495ED; font-family:Times New Roman; color: white; padding: 14px; line-height: 1;border-radius:20px">11. Feature Importances

</div>


In [None]:
# `model` is only the last estimator fitted by the training loop. Look up the
# estimator whose class name was recorded in `best_model`, and fall back to
# `model` if none matches.
best_estimator = next(
    (m for m in models.values() if m.__class__.__name__ == best_model),
    model,
)
importances = best_estimator.feature_importances_

feature_names = X.columns

# Pair each feature with its importance and rank from most to least important
feature_importance_dict = dict(zip(feature_names, importances))

sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance:.2f}")

# Horizontal bars: zip(*pairs) unpacks into (names, importances)
plt.figure(figsize=(12, 7))
plt.barh(*zip(*sorted_feature_importance), alpha=0.9, color='#6495ED')
plt.title(f'Feature Importance ({type(best_estimator).__name__})', fontsize=15)
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

In [None]:
# Use the estimator recorded as best by the training loop, not `model`
# (the last one fitted); fall back to `model` if no class name matches.
best_estimator = next(
    (m for m in models.values() if m.__class__.__name__ == best_model),
    model,
)
y_pred = best_estimator.predict(X_test)

# Residuals: actual minus predicted price
residuals = y_test - y_pred

# Residuals against predictions; the dashed line marks zero error
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_pred, y=residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()