In [2]:
import pandas as pd
df = pd.read_csv('hotel_bookings.csv')

In [3]:
# Display the first few rows and summary information of the dataset
print(df.head())
print(df.info())
print(df.describe())

          hotel  is_canceled  lead_time  arrival_date_year arrival_date_month  \
0  Resort Hotel            0        342               2015               July   
1  Resort Hotel            0        737               2015               July   
2  Resort Hotel            0          7               2015               July   
3  Resort Hotel            0         13               2015               July   
4  Resort Hotel            0         14               2015               July   

   arrival_date_week_number  arrival_date_day_of_month  \
0                        27                          1   
1                        27                          1   
2                        27                          1   
3                        27                          1   
4                        27                          1   

   stays_in_weekend_nights  stays_in_week_nights  adults  ...  deposit_type  \
0                        0                     0       2  ...    No Deposit   
1     

In [4]:
#Identifying missing values in dataframe
missing_values = df.isnull().sum()
print(missing_values)

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [6]:
#drop rows with missing values 
df_dropped_rows = df.dropna()
print(df_dropped_rows)


               hotel  is_canceled  lead_time  arrival_date_year  \
2392    Resort Hotel            0          6               2015   
2697    Resort Hotel            0         24               2015   
2867    Resort Hotel            0         24               2015   
2877    Resort Hotel            0         24               2015   
2878    Resort Hotel            0         24               2015   
...              ...          ...        ...                ...   
112499    City Hotel            0         13               2017   
113046    City Hotel            0         13               2017   
113082    City Hotel            0         13               2017   
113627    City Hotel            0        210               2017   
116451    City Hotel            0        191               2017   

       arrival_date_month  arrival_date_week_number  \
2392              October                        42   
2697              October                        44   
2867             November     

In [7]:
#drop columns with any missing values
df_dropped_columns = df.dropna(axis=1)
print(df_dropped_columns)

               hotel  is_canceled  lead_time  arrival_date_year  \
0       Resort Hotel            0        342               2015   
1       Resort Hotel            0        737               2015   
2       Resort Hotel            0          7               2015   
3       Resort Hotel            0         13               2015   
4       Resort Hotel            0         14               2015   
...              ...          ...        ...                ...   
119385    City Hotel            0         23               2017   
119386    City Hotel            0        102               2017   
119387    City Hotel            0         34               2017   
119388    City Hotel            0        109               2017   
119389    City Hotel            0        205               2017   

       arrival_date_month  arrival_date_week_number  \
0                    July                        27   
1                    July                        27   
2                    July     

In [8]:
#filling missing values with specific values 
df_filled_value = df.fillna(0)
print(df_filled_value)

               hotel  is_canceled  lead_time  arrival_date_year  \
0       Resort Hotel            0        342               2015   
1       Resort Hotel            0        737               2015   
2       Resort Hotel            0          7               2015   
3       Resort Hotel            0         13               2015   
4       Resort Hotel            0         14               2015   
...              ...          ...        ...                ...   
119385    City Hotel            0         23               2017   
119386    City Hotel            0        102               2017   
119387    City Hotel            0         34               2017   
119388    City Hotel            0        109               2017   
119389    City Hotel            0        205               2017   

       arrival_date_month  arrival_date_week_number  \
0                    July                        27   
1                    July                        27   
2                    July     

In [10]:
# Fill missing values using forward fill and backward fill methods.
df_ffill = df.ffill()
df_bfill = df.bfill()
print(df_ffill)
print(df_bfill)


               hotel  is_canceled  lead_time  arrival_date_year  \
0       Resort Hotel            0        342               2015   
1       Resort Hotel            0        737               2015   
2       Resort Hotel            0          7               2015   
3       Resort Hotel            0         13               2015   
4       Resort Hotel            0         14               2015   
...              ...          ...        ...                ...   
119385    City Hotel            0         23               2017   
119386    City Hotel            0        102               2017   
119387    City Hotel            0         34               2017   
119388    City Hotel            0        109               2017   
119389    City Hotel            0        205               2017   

       arrival_date_month  arrival_date_week_number  \
0                    July                        27   
1                    July                        27   
2                    July     

In [11]:
#Interpolate missing values.
df_interpolated = df.interpolate()
print(df_interpolated)


  df_interpolated = df.interpolate()


               hotel  is_canceled  lead_time  arrival_date_year  \
0       Resort Hotel            0        342               2015   
1       Resort Hotel            0        737               2015   
2       Resort Hotel            0          7               2015   
3       Resort Hotel            0         13               2015   
4       Resort Hotel            0         14               2015   
...              ...          ...        ...                ...   
119385    City Hotel            0         23               2017   
119386    City Hotel            0        102               2017   
119387    City Hotel            0         34               2017   
119388    City Hotel            0        109               2017   
119389    City Hotel            0        205               2017   

       arrival_date_month  arrival_date_week_number  \
0                    July                        27   
1                    July                        27   
2                    July     

In [13]:
#Convert a column to a different data type.
df['arrival_date_week_number'] = df['arrival_date_week_number'].astype('float')
print(df.dtypes)


hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number          float64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

In [14]:
#Apply a function to transform the values of a column.
df['adults'] = df['adults'].apply(lambda x: x * 2)
print(df.head())


          hotel  is_canceled  lead_time  arrival_date_year arrival_date_month  \
0  Resort Hotel            0        342               2015               July   
1  Resort Hotel            0        737               2015               July   
2  Resort Hotel            0          7               2015               July   
3  Resort Hotel            0         13               2015               July   
4  Resort Hotel            0         14               2015               July   

   arrival_date_week_number  arrival_date_day_of_month  \
0                      27.0                          1   
1                      27.0                          1   
2                      27.0                          1   
3                      27.0                          1   
4                      27.0                          1   

   stays_in_weekend_nights  stays_in_week_nights  adults  ...  deposit_type  \
0                        0                     0       4  ...    No Deposit   
1     

In [17]:
#Normalize a column using Min-Max scalin
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df['lead_time'] = scaler.fit_transform(df[['lead_time']])
print(df.head())

          hotel  is_canceled  lead_time  arrival_date_year arrival_date_month  \
0  Resort Hotel            0   0.464043               2015               July   
1  Resort Hotel            0   1.000000               2015               July   
2  Resort Hotel            0   0.009498               2015               July   
3  Resort Hotel            0   0.017639               2015               July   
4  Resort Hotel            0   0.018996               2015               July   

   arrival_date_week_number  arrival_date_day_of_month  \
0                      27.0                          1   
1                      27.0                          1   
2                      27.0                          1   
3                      27.0                          1   
4                      27.0                          1   

   stays_in_weekend_nights  stays_in_week_nights  adults  ...  agent  company  \
0                        0                     0       4  ...    NaN      NaN   
1 

In [18]:
#Standardize a column (z-score normalization).
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df['arrival_date_week_number'] = scaler.fit_transform(df[['arrival_date_week_number']])
print(df.head())


          hotel  is_canceled  lead_time  arrival_date_year arrival_date_month  \
0  Resort Hotel            0   0.464043               2015               July   
1  Resort Hotel            0   1.000000               2015               July   
2  Resort Hotel            0   0.009498               2015               July   
3  Resort Hotel            0   0.017639               2015               July   
4  Resort Hotel            0   0.018996               2015               July   

   arrival_date_week_number  arrival_date_day_of_month  \
0                 -0.012141                          1   
1                 -0.012141                          1   
2                 -0.012141                          1   
3                 -0.012141                          1   
4                 -0.012141                          1   

   stays_in_weekend_nights  stays_in_week_nights  adults  ...  agent  company  \
0                        0                     0       4  ...    NaN      NaN   
1 

In [19]:
#Identify duplicate rows in the DataFrame.
duplicates = df.duplicated()
print(duplicates)


0         False
1         False
2         False
3         False
4         False
          ...  
119385    False
119386    False
119387    False
119388    False
119389    False
Length: 119390, dtype: bool


In [20]:
#Drop duplicate rows
df_no_duplicates = df.drop_duplicates()
print(df_no_duplicates)


               hotel  is_canceled  lead_time  arrival_date_year  \
0       Resort Hotel            0   0.464043               2015   
1       Resort Hotel            0   1.000000               2015   
2       Resort Hotel            0   0.009498               2015   
3       Resort Hotel            0   0.017639               2015   
4       Resort Hotel            0   0.018996               2015   
...              ...          ...        ...                ...   
119385    City Hotel            0   0.031208               2017   
119386    City Hotel            0   0.138399               2017   
119387    City Hotel            0   0.046133               2017   
119388    City Hotel            0   0.147897               2017   
119389    City Hotel            0   0.278155               2017   

       arrival_date_month  arrival_date_week_number  \
0                    July                 -0.012141   
1                    July                 -0.012141   
2                    July     

In [None]:
#Drop duplicate rows based on specific columns.
df_no_duplicates_specific = df.drop_duplicates(subset=['column1', 'column2'])
print(df_no_duplicates_specific)


In [21]:
#Convert all string values in a column to lowercase.
df['country'] = df['country'].str.lower()
print(df.head())


          hotel  is_canceled  lead_time  arrival_date_year arrival_date_month  \
0  Resort Hotel            0   0.464043               2015               July   
1  Resort Hotel            0   1.000000               2015               July   
2  Resort Hotel            0   0.009498               2015               July   
3  Resort Hotel            0   0.017639               2015               July   
4  Resort Hotel            0   0.018996               2015               July   

   arrival_date_week_number  arrival_date_day_of_month  \
0                 -0.012141                          1   
1                 -0.012141                          1   
2                 -0.012141                          1   
3                 -0.012141                          1   
4                 -0.012141                          1   

   stays_in_weekend_nights  stays_in_week_nights  adults  ...  agent  company  \
0                        0                     0       4  ...    NaN      NaN   
1 

In [22]:
#Remove leading and trailing spaces from string values in a column.
df['deposit_type'] = df['deposit_type'].str.strip()
print(df.head())


          hotel  is_canceled  lead_time  arrival_date_year arrival_date_month  \
0  Resort Hotel            0   0.464043               2015               July   
1  Resort Hotel            0   1.000000               2015               July   
2  Resort Hotel            0   0.009498               2015               July   
3  Resort Hotel            0   0.017639               2015               July   
4  Resort Hotel            0   0.018996               2015               July   

   arrival_date_week_number  arrival_date_day_of_month  \
0                 -0.012141                          1   
1                 -0.012141                          1   
2                 -0.012141                          1   
3                 -0.012141                          1   
4                 -0.012141                          1   

   stays_in_weekend_nights  stays_in_week_nights  adults  ...  agent  company  \
0                        0                     0       4  ...    NaN      NaN   
1 

In [24]:
#Replace a specific substring in a column with another substring.
df['arrival_date_month'] = df['arrival_date_month'].str.replace('July', 'August')
print(df.head())


          hotel  is_canceled  lead_time  arrival_date_year arrival_date_month  \
0  Resort Hotel            0   0.464043               2015             August   
1  Resort Hotel            0   1.000000               2015             August   
2  Resort Hotel            0   0.009498               2015             August   
3  Resort Hotel            0   0.017639               2015             August   
4  Resort Hotel            0   0.018996               2015             August   

   arrival_date_week_number  arrival_date_day_of_month  \
0                 -0.012141                          1   
1                 -0.012141                          1   
2                 -0.012141                          1   
3                 -0.012141                          1   
4                 -0.012141                          1   

   stays_in_weekend_nights  stays_in_week_nights  adults  ...  agent  company  \
0                        0                     0       4  ...    NaN      NaN   
1 

In [25]:
#Extract a substring from each value in a column.
df['C'] = df['reservation_status'].str[0:5]
print(df.head())


          hotel  is_canceled  lead_time  arrival_date_year arrival_date_month  \
0  Resort Hotel            0   0.464043               2015             August   
1  Resort Hotel            0   1.000000               2015             August   
2  Resort Hotel            0   0.009498               2015             August   
3  Resort Hotel            0   0.017639               2015             August   
4  Resort Hotel            0   0.018996               2015             August   

   arrival_date_week_number  arrival_date_day_of_month  \
0                 -0.012141                          1   
1                 -0.012141                          1   
2                 -0.012141                          1   
3                 -0.012141                          1   
4                 -0.012141                          1   

   stays_in_weekend_nights  stays_in_week_nights  adults  ...  company  \
0                        0                     0       4  ...      NaN   
1               

In [26]:
#Convert a column to datetime format.
df['date_column'] = pd.to_datetime(df['arrival_date_year'])
print(df.dtypes)


hotel                                     object
is_canceled                                int64
lead_time                                float64
arrival_date_year                          int64
arrival_date_month                        object
arrival_date_week_number                 float64
arrival_date_day_of_month                  int64
stays_in_weekend_nights                    int64
stays_in_week_nights                       int64
adults                                     int64
children                                 float64
babies                                     int64
meal                                      object
country                                   object
market_segment                            object
distribution_channel                      object
is_repeated_guest                          int64
previous_cancellations                     int64
previous_bookings_not_canceled             int64
reserved_room_type                        object
assigned_room_type  

In [27]:
#Extract year, month, and day from a datetime column.
df['year'] = df['date_column'].dt.year
df['month'] = df['date_column'].dt.month
df['day'] = df['date_column'].dt.day
print(df.head())



          hotel  is_canceled  lead_time  arrival_date_year arrival_date_month  \
0  Resort Hotel            0   0.464043               2015             August   
1  Resort Hotel            0   1.000000               2015             August   
2  Resort Hotel            0   0.009498               2015             August   
3  Resort Hotel            0   0.017639               2015             August   
4  Resort Hotel            0   0.018996               2015             August   

   arrival_date_week_number  arrival_date_day_of_month  \
0                 -0.012141                          1   
1                 -0.012141                          1   
2                 -0.012141                          1   
3                 -0.012141                          1   
4                 -0.012141                          1   

   stays_in_weekend_nights  stays_in_week_nights  adults  ...  \
0                        0                     0       4  ...   
1                        0        

In [28]:
#Filter rows based on a date range.
start_date = '7-1-2015'
end_date = '7-2-2015'
mask = (df['reservation_status_date'] >= start_date) & (df['reservation_status_date'] <= end_date)
df_filtered = df.loc[mask]
print(df_filtered)


Empty DataFrame
Columns: [hotel, is_canceled, lead_time, arrival_date_year, arrival_date_month, arrival_date_week_number, arrival_date_day_of_month, stays_in_weekend_nights, stays_in_week_nights, adults, children, babies, meal, country, market_segment, distribution_channel, is_repeated_guest, previous_cancellations, previous_bookings_not_canceled, reserved_room_type, assigned_room_type, booking_changes, deposit_type, agent, company, days_in_waiting_list, customer_type, adr, required_car_parking_spaces, total_of_special_requests, reservation_status, reservation_status_date, normalized_column, C, date_column, year, month, day]
Index: []

[0 rows x 38 columns]


In [29]:
#Convert a categorical column to numerical using one-hot encoding.
df_one_hot_encoded = pd.get_dummies(df, columns=['market_segment'])
print(df_one_hot_encoded)


               hotel  is_canceled  lead_time  arrival_date_year  \
0       Resort Hotel            0   0.464043               2015   
1       Resort Hotel            0   1.000000               2015   
2       Resort Hotel            0   0.009498               2015   
3       Resort Hotel            0   0.017639               2015   
4       Resort Hotel            0   0.018996               2015   
...              ...          ...        ...                ...   
119385    City Hotel            0   0.031208               2017   
119386    City Hotel            0   0.138399               2017   
119387    City Hotel            0   0.046133               2017   
119388    City Hotel            0   0.147897               2017   
119389    City Hotel            0   0.278155               2017   

       arrival_date_month  arrival_date_week_number  \
0                  August                 -0.012141   
1                  August                 -0.012141   
2                  August     

In [30]:
#Convert a categorical column to numerical using label encoding.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['market_segment'] = le.fit_transform(df['market_segment'])
print(df.head())



          hotel  is_canceled  lead_time  arrival_date_year arrival_date_month  \
0  Resort Hotel            0   0.464043               2015             August   
1  Resort Hotel            0   1.000000               2015             August   
2  Resort Hotel            0   0.009498               2015             August   
3  Resort Hotel            0   0.017639               2015             August   
4  Resort Hotel            0   0.018996               2015             August   

   arrival_date_week_number  arrival_date_day_of_month  \
0                 -0.012141                          1   
1                 -0.012141                          1   
2                 -0.012141                          1   
3                 -0.012141                          1   
4                 -0.012141                          1   

   stays_in_weekend_nights  stays_in_week_nights  adults  ...  \
0                        0                     0       4  ...   
1                        0        

In [None]:
#Group values in a categorical column and create a new column with grouped categories.
df['grouped_column'] = df['categorical_column'].map({'category1': 'group1', 'category2': 'group2'})
print(df.head())

In [None]:
#Merge two DataFrames based on a common column.
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})
merged_df = pd.merge(df1, df2, on='key')
print(merged_df)


In [None]:
#Concatenate two DataFrames vertically.
df1 = pd.DataFrame({'A': [1, 2, 3]})
df2 = pd.DataFrame({'A': [4, 5, 6]})
df_vertical_concat = pd.concat([df1, df2], axis=0)
print(df_vertical_concat)


In [None]:
#Concatenate two DataFrames horizontally.
df1 = pd.DataFrame({'A': [1, 2, 3]})
df2 = pd.DataFrame({'B': [4, 5, 6]})
df_horizontal_concat = pd.concat([df1, df2], axis=1)
print(df_horizontal_concat)


In [None]:
#Create a new column based on existing columns.
df['new_column'] = df['column1'] + df['column2']
print(df.head())

In [None]:
#discretize a continuous column into bins.
df['binned_column'] = pd.cut(df['continuous_column'], bins=5)
print(df.head())


In [None]:
#Create polynomial features from existing numerical columns.
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
df_poly_features = poly.fit_transform(df[['numerical_column1', 'numerical_column2']])
print(df_poly_features)
