# List of Hotels in Ireland 


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import re

In [2]:
# Load the scraped listings from booking.csv into the working DataFrame.
data = pd.read_csv(filepath_or_buffer="booking.csv", encoding='UTF-8')

In [3]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,hotel name,city,certification,score,review rate,reviews,room type,occupancy,Free cancellation,pay at the property,No prepayment needed,rooms left,nights/adults,price,location rate,bb included
0,"Radisson Blu Hotel, Athlone",Athlone,Sustainability certification,7.9,Good,"6,092 reviews",Standard Room,Beds: 1 double or 2 twins,Free cancellation,– pay at the property,No prepayment needed,Only 5 rooms left at this price on our site,"1 night, 2 adults",€ 259,,
1,Sheraton Athlone Hotel,Athlone,Sustainability certification,8.7,Excellent,"5,418 reviews",Classic King Room,1 king bed,Free cancellation,,,Only 5 rooms left at this price on our site,"1 night, 2 adults",€ 275,Location 9.4,
2,Clayton Hotel Dublin Airport,Cloghran,Sustainability certification,8.3,Very Good,"23,489 reviews",Deluxe Double & Single Room,"2 beds (1 twin, 1 full)",Free cancellation,,,,"1 night, 2 adults",€ 259,,
3,Lawlors Hotel,Naas,,8.8,Excellent,"2,379 reviews",Twin Room,2 full beds,Free cancellation,– pay at the property,No prepayment needed,Only 2 rooms left at this price on our site,"1 night, 2 adults",€ 329,Location 9.4,
4,The Glendalough Hotel,Laragh,,7.1,Good,989 reviews,Double or Twin Room,Beds: 1 double or 2 twins,,,,Only 4 rooms left at this price on our site,"1 night, 2 adults",€ 220,Location 9.5,Breakfast included


In [4]:
# Summarise column dtypes and non-null counts of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501 entries, 0 to 500
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   hotel name            501 non-null    object 
 1   city                  501 non-null    object 
 2   certification         136 non-null    object 
 3   score                 500 non-null    float64
 4   review rate           500 non-null    object 
 5   reviews               500 non-null    object 
 6   room type             501 non-null    object 
 7   occupancy             501 non-null    object 
 8   Free cancellation     296 non-null    object 
 9   pay at the property   180 non-null    object 
 10  No prepayment needed  180 non-null    object 
 11  rooms left            337 non-null    object 
 12  nights/adults         501 non-null    object 
 13  price                 501 non-null    object 
 14  location rate         158 non-null    object 
 15  bb included           1

### Insights from .info()

- Many features contain null values.
- Change the data type of the columns 'reviews', 'rooms left' and 'price'.
- Drop the columns 'certification', 'nights/adults' and 'pay at the property'.
- 'Free cancellation','No prepayment needed', 'bb included' are binary features.


### Step-by-step list

- Checking for duplicate rows.
- Checking for entirely null rows or rows that don't have enough information.
- Dropping columns that won't be used.
- Converting 'reviews' into numeric.
- Cleaning 'location rate' feature.
- Removing € sign from 'price' and converting into float.
- Cleaning categorical features. Filling null values with information.
- Cleaning 'rooms left' and converting into numeric value.
- Saving the cleaned file.

### Checking for duplicates

In [5]:
# Keep the repeated rows for inspection. duplicated() has to be *called* to get
# the boolean mask; with the default keep='first' the first occurrence of each
# row is not flagged.
duplicates = data[data.duplicated()]

# Remove the repeated rows, keeping the first occurrence of each.
data.drop_duplicates(inplace=True)

### Checking for entire rows with null values

In [6]:
# Rows in which every single column is missing, kept for inspection.
fully_empty_mask = data.isna().all(axis='columns')
all_null_rows = data.loc[fully_empty_mask]

# Discard those fully empty rows from the working frame.
data.dropna(how='all', inplace=True)

### Dropping columns that won't be used.

In [7]:
# Show the value distribution of the two columns considered for removal.
for column in ('certification', 'nights/adults'):
    print("\n")
    print(data[column].value_counts())



certification
Sustainability certification    91
•  •  •  •                       7
•  •  •                          5
Managed by a private host        4
•  •                             3
•  •  •  •  •                    2
Name: count, dtype: int64


nights/adults
1 night, 2 adults    451
Name: count, dtype: int64


In [8]:
# Remove the columns that will not be used, in a single call.
unused_columns = ['certification', 'nights/adults', 'pay at the property']
data.drop(columns=unused_columns, inplace=True)

### Converting 'reviews' into numeric.

In [9]:
# Select the rows that have no review count at all and display them.
missing_reviews_mask = data['reviews'].isnull()
rows_with_nan_reviews = data.loc[missing_reviews_mask]
rows_with_nan_reviews

Unnamed: 0,hotel name,city,score,review rate,reviews,room type,occupancy,Free cancellation,No prepayment needed,rooms left,price,location rate,bb included
492,The Marcy Boutique Accommodation,Drogheda,,,,Double Room,1 full bed,,,Only 5 rooms left at this price on our site,€ 179,,


In [10]:
# Drop rows without a review count. Filtering on the missing value itself
# (instead of a hard-coded hotel name) also covers any other listing with the
# same gap and never removes a listing that does have reviews.
data.dropna(subset=['reviews'], inplace=True)

In [11]:
# Remove the trailing "reviews" label together with the whitespace around it.
# The optional "s" also handles a singular "1 review", which a literal
# replacement of 'reviews' would leave in place and break the integer cast.
data['reviews'] = data['reviews'].str.replace(r'\s*reviews?\s*$', '', regex=True)

In [12]:
# Drop the thousands separators, trim surrounding whitespace and cast to integer.
review_counts = data['reviews'].str.replace(',', '')
data['reviews'] = review_counts.str.strip().astype(int)

In [13]:
# Preview the first five rows after converting 'reviews'.
data.head(n=5)

Unnamed: 0,hotel name,city,score,review rate,reviews,room type,occupancy,Free cancellation,No prepayment needed,rooms left,price,location rate,bb included
0,"Radisson Blu Hotel, Athlone",Athlone,7.9,Good,6092,Standard Room,Beds: 1 double or 2 twins,Free cancellation,No prepayment needed,Only 5 rooms left at this price on our site,€ 259,,
1,Sheraton Athlone Hotel,Athlone,8.7,Excellent,5418,Classic King Room,1 king bed,Free cancellation,,Only 5 rooms left at this price on our site,€ 275,Location 9.4,
2,Clayton Hotel Dublin Airport,Cloghran,8.3,Very Good,23489,Deluxe Double & Single Room,"2 beds (1 twin, 1 full)",Free cancellation,,,€ 259,,
3,Lawlors Hotel,Naas,8.8,Excellent,2379,Twin Room,2 full beds,Free cancellation,No prepayment needed,Only 2 rooms left at this price on our site,€ 329,Location 9.4,
4,The Glendalough Hotel,Laragh,7.1,Good,989,Double or Twin Room,Beds: 1 double or 2 twins,,,Only 4 rooms left at this price on our site,€ 220,Location 9.5,Breakfast included


### Cleaning 'location rate' feature

In [14]:
# Show how often each location rating occurs, preceded by blank lines.
location_rate_counts = data['location rate'].value_counts()
print("\n", location_rate_counts, sep="\n")



location rate
Location 9.6    35
Location 9.4    34
Location 9.3    24
Location 9.5    21
Location 9.7    18
Location 9.8    10
Name: count, dtype: int64


In [15]:
# Remove the "Location" label and trim the whitespace it leaves behind, so the
# column holds '9.4' rather than ' 9.4'.
data['location rate'] = data['location rate'].str.replace('Location', '').str.strip()

In [16]:
# Replace missing location ratings with an explicit placeholder label.
location_rate = data['location rate']
data['location rate'] = location_rate.mask(location_rate.isna(), 'No rated')

### Removing € sign from 'price' and converting into float.

In [17]:
# Strip the currency sign, non-breaking spaces and thousands separators one
# after another, then cast the remaining text to float.
price_text = data['price'].str.replace('€', '')
price_text = price_text.str.replace('\xa0', '')
price_text = price_text.str.replace(',', '')
data['price'] = price_text.astype(float)

In [18]:
# Re-check dtypes and non-null counts after the numeric conversions.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 450 entries, 0 to 500
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   hotel name            450 non-null    object 
 1   city                  450 non-null    object 
 2   score                 450 non-null    float64
 3   review rate           450 non-null    object 
 4   reviews               450 non-null    int32  
 5   room type             450 non-null    object 
 6   occupancy             450 non-null    object 
 7   Free cancellation     266 non-null    object 
 8   No prepayment needed  162 non-null    object 
 9   rooms left            308 non-null    object 
 10  price                 450 non-null    float64
 11  location rate         450 non-null    object 
 12  bb included           161 non-null    object 
dtypes: float64(2), int32(1), object(10)
memory usage: 47.5+ KB


### Cleaning categorical features. Filling null values with information.

In [19]:
# For each categorical column print its value counts followed by its unique
# values; blocks after the first are separated by blank lines.
categorical_columns = [
    'review rate',
    'Free cancellation',
    'No prepayment needed',
    'bb included',
]
for position, column in enumerate(categorical_columns):
    if position > 0:
        print("\n")
    print(data[column].value_counts())
    print(data[column].unique())


review rate
Very Good       193
Excellent       125
Good             73
Wonderful        46
Review score      9
Exceptional       4
Name: count, dtype: int64
['Good' 'Excellent' 'Very Good' 'Wonderful' 'Review score' 'Exceptional']


Free cancellation
Free cancellation    266
Name: count, dtype: int64
['Free cancellation' nan]


No prepayment needed
No prepayment needed    162
Name: count, dtype: int64
['No prepayment needed' nan]


bb included
Breakfast included    161
Name: count, dtype: int64
[nan 'Breakfast included']


In [20]:
# Relabel the 'Review score' placeholder text in the rating label column.
data['review rate'] = data['review rate'].str.replace('Review score', 'Not Good')

# Missing flags become an explicit label describing the opposite case.
missing_labels = {
    'Free cancellation': 'Check Policy',
    'No prepayment needed': 'Prepayment needed',
    'bb included': 'Room Only',
}
for column, label in missing_labels.items():
    data[column] = data[column].fillna(label)

### Cleaning 'rooms left' and converting into numeric value.

In [21]:
# Inspect the raw availability messages: frequencies first, then distinct values.
rooms_left_raw = data['rooms left']
print(rooms_left_raw.value_counts())

print("\n")
print(rooms_left_raw.unique())

rooms left
Only 1 room left at this price on our site     87
Only 4 rooms left at this price on our site    48
Only 2 rooms left at this price on our site    47
Only 3 rooms left at this price on our site    44
Only 5 rooms left at this price on our site    31
Only 6 rooms left at this price on our site    24
Only 1 left at this price on our site          14
Only 7 rooms left at this price on our site     9
Only 2 left at this price on our site           2
Only 3 left at this price on our site           1
Only 6 left at this price on our site           1
Name: count, dtype: int64


['Only 5 rooms left at this price on our site' nan
 'Only 2 rooms left at this price on our site'
 'Only 4 rooms left at this price on our site'
 'Only 1 room left at this price on our site'
 'Only 3 rooms left at this price on our site'
 'Only 6 rooms left at this price on our site'
 'Only 1 left at this price on our site'
 'Only 7 rooms left at this price on our site'
 'Only 3 left at this price on our sit

In [22]:
# Missing messages are encoded as '0' before parsing; astype(str) lets this cell
# be re-run on an already numeric column.
# NOTE(review): 0 is read here as "no rooms available", but a missing message may
# only mean that no scarcity notice was shown for the listing - confirm.
rooms_left_text = data['rooms left'].fillna('0').astype(str)

# Take the first integer of each "Only N rooms left ..." message. Extracting a
# single number per row avoids building an unparsable "1,2"-style string when a
# message happens to contain more than one number.
data['rooms left'] = pd.to_numeric(rooms_left_text.str.extract(r'(\d+)', expand=False))

In [23]:
# Verify the parsed room counts by printing their frequencies.
rooms_left_counts = data['rooms left'].value_counts()
print(rooms_left_counts)

rooms left
0    142
1    101
2     49
4     48
3     45
5     31
6     25
7      9
Name: count, dtype: int64


### Data is cleaned

In [24]:
# Preview the first five rows of the cleaned frame.
data.head(n=5)

Unnamed: 0,hotel name,city,score,review rate,reviews,room type,occupancy,Free cancellation,No prepayment needed,rooms left,price,location rate,bb included
0,"Radisson Blu Hotel, Athlone",Athlone,7.9,Good,6092,Standard Room,Beds: 1 double or 2 twins,Free cancellation,No prepayment needed,5,259.0,No rated,Room Only
1,Sheraton Athlone Hotel,Athlone,8.7,Excellent,5418,Classic King Room,1 king bed,Free cancellation,Prepayment needed,5,275.0,9.4,Room Only
2,Clayton Hotel Dublin Airport,Cloghran,8.3,Very Good,23489,Deluxe Double & Single Room,"2 beds (1 twin, 1 full)",Free cancellation,Prepayment needed,0,259.0,No rated,Room Only
3,Lawlors Hotel,Naas,8.8,Excellent,2379,Twin Room,2 full beds,Free cancellation,No prepayment needed,2,329.0,9.4,Room Only
4,The Glendalough Hotel,Laragh,7.1,Good,989,Double or Twin Room,Beds: 1 double or 2 twins,Check Policy,Prepayment needed,4,220.0,9.5,Breakfast included


In [25]:
# Final check of dtypes and non-null counts before saving.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 450 entries, 0 to 500
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   hotel name            450 non-null    object 
 1   city                  450 non-null    object 
 2   score                 450 non-null    float64
 3   review rate           450 non-null    object 
 4   reviews               450 non-null    int32  
 5   room type             450 non-null    object 
 6   occupancy             450 non-null    object 
 7   Free cancellation     450 non-null    object 
 8   No prepayment needed  450 non-null    object 
 9   rooms left            450 non-null    int64  
 10  price                 450 non-null    float64
 11  location rate         450 non-null    object 
 12  bb included           450 non-null    object 
dtypes: float64(2), int32(1), int64(1), object(9)
memory usage: 47.5+ KB


### Saving csv file

In [26]:
# Write the cleaned frame without the row index: the input file had no index
# column, and writing it would add an extra unnamed column when the file is
# read back.
data.to_csv('booking_cleaned.csv', index=False)