In [1]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("./data/merged/merged_main.csv", encoding='utf-8')

# Select specific columns
df = df[['listing_link', 'title_bed_bats_review']]

# Duplicate the column
df['title_bed_bats_review - Copy'] = df['title_bed_bats_review']

# Replace "·" with ","
df['title_bed_bats_review - Copy'] = df['title_bed_bats_review - Copy'].str.replace('·', ',')

# Capitalize each word in the duplicated column
df['title_bed_bats_review - Copy'] = df['title_bed_bats_review - Copy'].str.title()

# Split the column by delimiter
split_cols = df['title_bed_bats_review - Copy'].str.split(',', expand=True)
split_cols.columns = [f"title_bed_bats_review - Copy.{i+1}" for i in range(split_cols.shape[1])]

# Concatenate the split columns back to the dataframe
df = pd.concat([df, split_cols], axis=1)

# Rename columns
df = df.rename(columns={
    "title_bed_bats_review - Copy.2": "Bedroom",
    "title_bed_bats_review - Copy.3": "Beds",
    "title_bed_bats_review - Copy.4": "Baths"
})

# Duplicate the "Bedroom" column
df['Bedroom - Copy'] = df['Bedroom']

# Remove "★" from "Bedroom - Copy" column
df['Bedroom - Copy'] = df['Bedroom - Copy'].str.replace('★', '')

# Add conditional columns
df['Bedroom_Main'] = df.apply(lambda x: x['Bedroom - Copy'] if 'Bed' in str(x['Bedroom - Copy']) else None, axis=1)
df['Ratings'] = df.apply(lambda x: x['Bedroom - Copy'] if x['Bedroom_Main'] is None else None, axis=1)

# Add "Main_Bedroom" column
df['Main_Bedroom'] = df.apply(lambda x: x['Beds'] if x['Bedroom_Main'] is None else x['Bedroom_Main'], axis=1)

# Replace "Bedroom " with "Bedrooms" in "Beds" column
df['Beds'] = df['Beds'].str.replace('Bedroom ', 'Bedrooms')

# Add "Beds_main" column
df['Beds_main'] = df.apply(lambda x: x['Baths'] if 'Bedrooms' in str(x['Beds']) else x['Beds'], axis=1)

# Remove columns
df = df.drop(columns=['Bedroom', 'Bedroom - Copy', 'Bedroom_Main'])

# Add "Main_Beds" column
df['Main_Beds'] = df.apply(lambda x: x['Beds_main'] if 'Bath' not in str(x['Beds_main']) else None, axis=1)

# Remove "Beds" column
df = df.drop(columns=['Beds'])

# Add "Bath_Main" column
df['Bath_Main'] = df.apply(lambda x: x['Beds_main'] if pd.isnull(x['Baths']) else x['Baths'], axis=1)

# Add "Main_Baths" column
df['Main_Baths'] = df.apply(lambda x: x['Bath_Main'] if pd.isnull(x['title_bed_bats_review - Copy.5']) else x['title_bed_bats_review - Copy.5'], axis=1)

# Remove columns
df = df.drop(columns=['Beds_main', 'Baths', 'title_bed_bats_review - Copy.5', 'Bath_Main'])

# Replace unwanted strings in columns
replace_patterns = [
    ("Bedroom", ""), ("s", ""), ("Beds", ""), ("Bed", ""),
    ("Bath", ""), ("Private", ""), ("Shared", ""), ("Half", "1"),
    ("-", ""), ("Shared Bath", ""), ("Studio", "")
]

for old, new in replace_patterns:
    df['Main_Bedroom'] = df['Main_Bedroom'].str.replace(old, new)
    df['Main_Beds'] = df['Main_Beds'].str.replace(old, new)
    df['Main_Baths'] = df['Main_Baths'].str.replace(old, new)

# Rename columns
df = df.rename(columns={"title_bed_bats_review - Copy.1": "Title"})

# Remove the original 'title_bed_bats_review' column
df = df.drop(columns=['title_bed_bats_review'])



In [2]:
df

Unnamed: 0,listing_link,title_bed_bats_review - Copy,Title,Ratings,Main_Bedroom,Main_Beds,Main_Baths
0,https://www.airbnb.com/rooms/31699739?adults=1...,"Home In Mymensingh , ★4.50 , 3 Bedrooms , 4 Be...",Home In Mymensingh,4.50,3,4,2
1,https://www.airbnb.com/rooms/94052072774264244...,"Resort In Beltoli , 1 Bedroom , 1 Bed , 1 Bath",Resort In Beltoli,,1,1,1
2,https://www.airbnb.com/rooms/93979221326458612...,"Resort In Mymensingh , 1 Bedroom , 1 Private Bath",Resort In Mymensingh,,1,,1
3,https://www.airbnb.com/rooms/93979244890044911...,"Resort In Mymensingh , 1 Bedroom , 1 Private Bath",Resort In Mymensingh,,1,,1
4,https://www.airbnb.com/rooms/95562398776708889...,"Resort In Mymensingh , 1 Bedroom , 1 Private Bath",Resort In Mymensingh,,1,,1
...,...,...,...,...,...,...,...
1560,https://www.airbnb.com/rooms/11301829800703733...,"Rental Unit In Barisal , ★New , 20 Bedrooms , ...",Rental Unit In Barisal,New,20,,20
1561,https://www.airbnb.com/rooms/72423423715114280...,"Farm Stay In Nalchhiti , 1 Bedroom , 1 Shared ...",Farm Stay In Nalchhiti,,1,,1
1562,https://www.airbnb.com/rooms/83149822616480789...,"Rental Unit In Bhola District , 1 Bedroom , 1 ...",Rental Unit In Bhola District,,1,1,1
1563,https://www.airbnb.com/rooms/10257503846395791...,"Home In Fakirhat , 2 Bedrooms , 2 Beds , 1 Bath",Home In Fakirhat,,2,2,1


In [3]:
# Save the result to a new CSV file
df.to_csv("./clean_checking.csv", index=False)

In [5]:
df.iloc[117]

listing_link                    https://www.airbnb.com/rooms/60465951598303382...
title_bed_bats_review - Copy    Serviced Apartment In Cox'S Bazar , 2 Bedrooms...
Title                                          Serviced Apartment In Cox'S Bazar 
Ratings                                                                      None
Main_Bedroom                                                                  2  
Main_Beds                                                                     3  
Main_Baths                                                                     2 
Name: 117, dtype: object