In [None]:
import os
import pandas as pd
import re
from multiprocessing import Pool
from langdetect_drop_column import process_dataframe

In [None]:
# Show full cell contents when displaying dataframes: a max column width of
# None disables truncation of long text values.
pd.options.display.max_colwidth = None

# Read listing csv files into dataframes

In [None]:
# Load the listings export of each city into its own dataframe.
def _load_listings(csv_path):
    """Read one listings csv, parsing the whole file in a single pass (low_memory=False)."""
    return pd.read_csv(csv_path, low_memory=False)


df_atd_list = _load_listings('../../cities/amsterdam/listings.csv')
df_ldn_list = _load_listings('../../cities/london/listings2_London.csv')
df_nyc_list = _load_listings('../../cities/new york/listings2_Newyork.csv')
df_prs_list = _load_listings('../../cities/paris/listings2_Paris.csv')
df_rom_list = _load_listings('../../cities/rome/listings2_Rome.csv')

In [None]:
# Output file stems and the listing dataframes they belong to.
# Both lists are in the same city order so they can be zipped together.
list_names = [
    'atd_list',
    'ldn_list',
    'nyc_list',
    'prs_list',
    'rom_list',
]
df_lists = [
    df_atd_list,
    df_ldn_list,
    df_nyc_list,
    df_prs_list,
    df_rom_list,
]

In [None]:
# Columns kept from each listings dataframe.
# The wanted columns are far fewer than the unwanted ones, so the shorter
# list (what to keep) is spelled out instead of what to drop.
col_to_keep = [
    # listing identity
    'id',
    'name',
    # host details
    'host_since',
    'host_location',
    'host_is_superhost',
    'host_identity_verified',
    # listing details
    'property_type',
    'price',
    # review scores
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]

# Remove unwanted columns and special characters

In [None]:
# Clean every city's listings dataframe and save it to csv:
# 1. keep only the wanted columns
# 2. trim 'name' at its first special character and parse 'price' as a number
# 3. keep only the rows whose 'host_location' is the most common one
# 4. save the cleaned dataframe to csv

# regular expression matching a single character that is neither a letter,
# a digit nor whitespace. It is the same for every city, so it is defined
# once outside the loop.
pattern = r'[^a-zA-Z0-9\s]'

# make sure the output folder exists before the first to_csv call
os.makedirs('../../csv_cleaned', exist_ok=True)

for df, list_name in zip(df_lists, list_names):
    # keep only the wanted columns. The explicit copy makes the column
    # assignments below act on an independent dataframe (no
    # SettingWithCopyWarning, raw dataframe left untouched).
    df = df.loc[:, col_to_keep].copy()

    # clean up name column: keep only the text before the first special
    # character. A multi-character pattern is treated as a regular expression
    # by Series.str.split, and the .str accessor passes missing names through
    # as NaN instead of raising like re.split would on a non-string value.
    df['name'] = df['name'].str.split(pattern, n=1).str[0].str.strip()

    # remove '$' and ',' from 'price' column, and keeping the numbers as float
    df['price'] = df['price'].str.replace('[$,]', '', regex=True).astype(float).round(2)

    # Find the most common 'host_location' value
    most_common_location = df['host_location'].value_counts().idxmax()

    # Filter the DataFrame based on the most common location
    # This avoids hard coding a value like "Amsterdam, Netherlands"
    df = df[df['host_location'] == most_common_location]

    # df to csv
    df.to_csv(f'../../csv_cleaned/{list_name}.csv', index=False)



# `<br/>` tags create extra rows

<img src="../../images/br.png">

# Preprocess review csv files BEFORE reading into dataframes

In [None]:
# Input and output files for the <br/> preprocessing step.
# Paths are relative to the cities folder; both lists share the same city order.

# raw review exports
raw_csvs = [
    'amsterdam/reviews.csv',
    'london/reviews2_London.csv',
    'new york/reviews2_Newyork.csv',
    'paris/reviews2_Paris.csv',
    'rome/reviews2_Rome.csv',
]

# where the processed copy of each raw file is written
processed_csv_names = [
    'amsterdam/p_reviews.csv',
    'london/p_reviews2_London.csv',
    'new york/p_reviews2_Newyork.csv',
    'paris/p_reviews2_Paris.csv',
    'rome/p_reviews2_Rome.csv',
]

In [None]:
# Copy each raw review file to its processed counterpart, line by line,
# replacing every '<br/>' with a single space on the way.
for raw_csv, processed_csv_name in zip(raw_csvs, processed_csv_names):
    source_path = f'../../cities/{raw_csv}'
    target_path = f'../../cities/{processed_csv_name}'
    with open(source_path, 'r', encoding='utf-8') as infile:
        with open(target_path, 'w', encoding='utf-8') as outfile:
            outfile.writelines(row.replace('<br/>', ' ') for row in infile)

In [None]:
# Load the preprocessed review file of each city into its own dataframe.
def _load_reviews(csv_path):
    """Read one preprocessed reviews csv with pandas' default settings."""
    return pd.read_csv(csv_path)


df_atd_rev = _load_reviews('../../cities/amsterdam/p_reviews.csv')
df_ldn_rev = _load_reviews('../../cities/london/p_reviews2_London.csv')
df_nyc_rev = _load_reviews('../../cities/new york/p_reviews2_Newyork.csv')
df_prs_rev = _load_reviews('../../cities/paris/p_reviews2_Paris.csv')
df_rom_rev = _load_reviews('../../cities/rome/p_reviews2_Rome.csv')

# Parallel processing to optimize the process

In [None]:
if __name__ == '__main__':
    # Run process_dataframe on every review dataframe in parallel, one task
    # per city, then save each result to csv.

    # create dataframe and name list (same city order in both)
    df_revs = [df_atd_rev, df_ldn_rev, df_nyc_rev, df_prs_rev, df_rom_rev]
    rev_names = ['p_atd_rev', 'p_ldn_rev', 'p_nyc_rev', 'p_prs_rev', 'p_rom_rev']

    # list of unwanted columns, passed to process_dataframe as its second argument.
    unwanted_col = ['id', 'reviewer_id', 'reviewer_name']

    # Number of worker processes: leave one core free, but
    # - os.cpu_count() may return None, so fall back to 1
    # - never go below 1 worker (Pool raises ValueError otherwise, which is
    #   what "cpu_count() - 1" produced on a single-core machine)
    # - never start more workers than there are dataframes to process
    num_processes = max(1, min(len(df_revs), (os.cpu_count() or 1) - 1))

    # apply the processing function to dataframes in parallel.
    # starmap blocks until every result is back, and the with-block shuts the
    # pool down even if one of the tasks raises.
    # NOTE(review): process_dataframe comes from langdetect_drop_column; what it
    # does with the dataframe and the column list is not visible here.
    with Pool(num_processes) as pool:
        processed_dfs = pool.starmap(process_dataframe, [(df_r, unwanted_col) for df_r in df_revs])

    # make sure the output folder exists before writing
    os.makedirs('../../csv_cleaned', exist_ok=True)

    # for loop to save cleaned dataframes to csv
    for df, rev_name in zip(processed_dfs, rev_names):
        df.to_csv(f'../../csv_cleaned/{rev_name}.csv', index=False)