In [3]:
#
def listing_preprocessing(root_folder, city_area):
    ######################################################################################################
    import time

    # Runtime starting time
    start_time = time.time()

    import pandas as pd 
    import numpy as np
    import os 
    from datetime import datetime as dt

    ######################################################################################################

    root = root_folder
    #os.chdir(root)
    df = pd.read_csv(root, low_memory=False)

    ######################################################################################################

    def pandas_display_options(rows, columns, width):
        '''
        Configure how much of a dataframe pandas renders; handy for EDA on wide frames.

        rows    -> value stored in 'display.max_rows'
        columns -> value stored in 'display.max_columns'
        width   -> value stored in 'display.width'
        '''
        settings = (
            ('display.max_rows', rows),
            ('display.max_columns', columns),
            ('display.width', width),
        )
        for option_name, option_value in settings:
            pd.set_option(option_name, option_value)

    ######################################################################################################

    def dtype_groups(df):
        '''
        Group the column names of a dataframe by their data type.
        Returns the groupby `.groups` mapping (dtype -> column labels).
        '''
        column_names = df.columns.to_series()
        grouped_by_dtype = column_names.groupby(df.dtypes)
        return grouped_by_dtype.groups

    ######################################################################################################

    def amenities_counter(amenities):
        '''
        Number of comma-separated entries in a raw amenities string.
        Usage: df["amenities_counter"] = df.amenities.map(amenities_counter)
        '''
        entries = amenities.split(",")
        return sum(1 for _ in entries)

    ######################################################################################################

    def top_ammenities(df, top =None, plot = False):
        '''
        Tally every amenity found in df['amenities'] and return the `top` most common ones
        as a list of (amenity, count) tuples (all of them when `top` is None).
        With plot=True the same ranking is also drawn as a horizontal bar chart.
        '''
        from collections import Counter
        tally = Counter()

        # Same cleaning steps, one per line: drop the braces, drop the quotes, split on commas.
        cleaned = df['amenities'].str.strip('{}')
        cleaned = cleaned.str.replace('"', '')
        cleaned = cleaned.str.lstrip('\"')
        cleaned = cleaned.str.rstrip('\"')
        tokens = cleaned.str.split(',')
        tokens.apply(tally.update)

        if plot == True:
            import matplotlib.pyplot as plt
            # Dataframe holding the ranking that gets plotted
            ranking_df = pd.DataFrame(tally.most_common(top), columns=['amenity', 'count'])
            ordered = ranking_df.sort_values(by=['count'], ascending=True)
            ordered.plot(kind='barh', x='amenity', y='count',
                         figsize=(10,7), legend=False, color='darkgrey',
                         title='Amenities')
            plt.xlabel('Count');

        ranking = tally.most_common(top)
        return ranking

    ######################################################################################################

    def get_nulls(df):
        '''
        Print a null-value report for a dataframe:
          1. per column, whether it contains any null;
          2. per column, how many nulls it contains;
          3. the names of the columns that contain at least one null (if any).

        Input: A data frame.
        Returns nothing; the report is written to stdout.
        '''
        # Computed once and reused (the per-column null count).
        null_counts = df.isna().sum()

        # Print which columns have nulls or not.
        print('\033[1m')
        print("Information about which columns have nulls or not:")
        print('\033[0m')
        print(df.isna().any())

        # How many nulls in each column?
        print('\033[1m')
        print("How many null values does each variable hold?")
        print('\033[0m')
        print(null_counts)

        # A column is problematic when it holds at least one null.
        problem_columns = null_counts[null_counts > 0].index.tolist()

        if not problem_columns:
            print('\033[1m\nThere are no variables affected by null values.\033[0m')
        else:
            print('\033[1m\nThese are the column names of the variables which present problems:\033[0m')
            print(problem_columns)

    ######################################################################################################

    ''' MISCELANEOUS '''

    ######################################################################################################

    def convert_to_euro(amount,currency):
        '''
        Convert `amount`, expressed in `currency`, into euros.
        No API is used: the rate table below is hard-coded and must be updated by hand.
        A currency code missing from the table raises IndexError.
        '''
        rates = [(0.13,"DKK"),(1.20,"GBP"),(1.0,"EUR")]

        # Collect every rate registered under the requested code and use the first one.
        matching_rates = [value for value, code in rates if code == currency]
        euro_rate = matching_rates[0]
        return amount * euro_rate

    ######################################################################################################

    def text_word_len(text):
        '''
        Length of a text measured in space-separated chunks.
        Gives free-text fields a simple numeric value.
        '''
        # Splitting on a single space always yields (number of spaces + 1) pieces.
        return text.count(" ") + 1

    ######################################################################################################

    def amenities_list_counter(amenities):
        '''
        Size of one amenities group list.
        '''
        group_size = len(amenities)
        return group_size

    ######################################################################################################

    ''' DATA TYPE CONVERSION '''

    ######################################################################################################

    def listings_to_datetime(df, feature):
        '''
        Parse column `feature` as datetimes and store the result back into the same column.
        '''
        parsed = pd.to_datetime(df[feature])
        df[feature] = parsed

    ######################################################################################################

    def listings_to_float(df, feature):
        '''
        Cast column `feature` to float and store the result back into the same column.
        '''
        as_float = df[feature].astype(float)
        df[feature] = as_float

    ######################################################################################################

    def price_to_float(df, feature):
        '''
        Strip dollar signs and thousands separators (commas) from a price column,
        then convert what is left to float. The column is overwritten in `df`.
        Missing values stay missing (NaN).
        '''
        # Raw string: '\$' inside a normal string literal is an invalid escape sequence.
        # astype(float) already yields a numeric column, so no further conversion is needed.
        df[feature] = df[feature].replace(r'[\$,]', '', regex=True).astype(float)

    ######################################################################################################

    def percent_string_to_number(df, feature):
        '''
        Turn percentage strings such as "95%" into fractions between 0 and 1 (0.95).
        Used here for the host response rate.
        '''
        without_sign = df[feature].str.replace('%', '')
        df[feature] = without_sign.astype(float) / 100

    ######################################################################################################

    def memory_optimization(df, feature):
        '''
        Store column `feature` as a pandas 'category' instead of generic objects,
        which takes far less memory for repetitive values.
        '''
        as_category = df[feature].astype('category')
        df[feature] = as_category

    ######################################################################################################

    ''' NANS, OUTLIERS AND DUPLICATES'''

    ######################################################################################################

    def fill_nans(df,feature,filling="mean"):
        '''
        Replace the NaNs of column `feature` and store the result back into `df`.

        filling = "median" -> use the column median
        filling = "mean"   -> use the column mean (default)
        anything else      -> use `filling` itself as the replacement value
        '''
        if filling == "median":
            fill_value = df[feature].median()
        elif filling == "mean":
            fill_value = df[feature].mean()
        else:
            fill_value = filling

        # Assign the filled column back explicitly. Calling fillna(inplace=True) on
        # df[feature] is chained assignment and leaves df untouched under Copy-on-Write.
        df[feature] = df[feature].fillna(fill_value)

    ######################################################################################################

    def remove_outliers(df,feature, bound = "both", method="iqr"):
        '''
        Drop, in place, the rows of `df` whose value in `feature` is an outlier.

        method:
            "iqr"    -> outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
            "zscore" -> z-score below -3 or above 3
            "1%"     -> below the 1st percentile or above the 99th percentile
        bound:
            "both"  -> remove low and high outliers
            "lower" -> remove only low outliers
            "upper" -> remove only high outliers

        An unrecognised method or bound leaves the dataframe untouched.
        '''
        values = df[feature]

        if method == "iqr":
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            too_low = values < q1 - (1.5*iqr)
            too_high = values > q3 + (1.5*iqr)

        elif method == "zscore":
            from scipy import stats
            # Plain array of z-scores, positionally aligned with the rows of df.
            z_scores = np.asarray(stats.zscore(values))
            too_low = z_scores < -3
            too_high = z_scores > 3

        elif method == "1%":
            too_low = values < values.quantile(0.01)
            too_high = values > values.quantile(0.99)

        else:
            return

        # A value cannot be both too low and too high, so "both" has to be a union (|).
        if bound == "both":
            outliers = too_low | too_high
        elif bound == "lower":
            outliers = too_low
        elif bound == "upper":
            outliers = too_high
        else:
            return

        df.drop(df[outliers].index, axis=0, inplace=True)

    ######################################################################################################

    def delete_rows_conditional(df, condition1, threshold1, feature1,
                    condition2 = None, threshold2 = None, feature2 = None, 
                    logical_operator = None):
        '''
        Delete, in place, the rows of `df` that satisfy one or two comparisons.

        Each comparison is "df[feature] <condition> threshold". The same feature or
        condition may be used twice. When a second condition is given,
        `logical_operator` says how the two comparisons are combined.

        The following parameters can take only the following values:
            - condition = ['greater_than', 'lesser_than', 'equal_to', 'not_equal_to']
            - logical_operator = ['and', 'or']

        Any other value prints an error message and leaves the dataframe untouched.
        '''
        import operator

        comparators = {
            'greater_than': operator.gt,
            'lesser_than': operator.lt,
            'equal_to': operator.eq,
            'not_equal_to': operator.ne,
        }
        error_message = "Exception: An error has occured. Please refer to the function's docstring and make sure you specify the parameters correctly."

        if condition1 not in comparators:
            print(error_message)
            return

        # Boolean mask of the rows to delete, built from the first comparison.
        rows_to_delete = comparators[condition1](df[feature1], threshold1)

        if condition2 is not None:
            if condition2 not in comparators or logical_operator not in ('and', 'or'):
                print(error_message)
                return

            second_mask = comparators[condition2](df[feature2], threshold2)
            # Element-wise & / | are required here: Python's `and` / `or` cannot combine Series.
            if logical_operator == 'and':
                rows_to_delete = rows_to_delete & second_mask
            else:
                rows_to_delete = rows_to_delete | second_mask

        df.drop(df[rows_to_delete].index, axis=0, inplace=True)

    ######################################################################################################

    ''' FEATURE ENCODING '''

    ######################################################################################################

    def make_dummy(df, feature):
        '''
        One-hot encode column `feature` and return the resulting dummy dataframe.
        The input dataframe is not modified.
        '''
        return pd.get_dummies(df[feature])

    ######################################################################################################

    def encode_label(df, feature):
        '''
        Convenience wrapper around sklearn's LabelEncoder: the column is replaced
        by its integer label codes.
        Obs: one-hot encoding is usually preferable, because label codes can be read
        by a model as if they carried an ordinal meaning.
        '''
        from sklearn.preprocessing import LabelEncoder
        encoded = LabelEncoder().fit_transform(df[feature])
        df[feature] = encoded

    ######################################################################################################

    def map_feature(df, feature):
        '''
        Encode the flags "f" (false) as 0 and "t" (true) as 1, overwriting the column.
        '''
        flag_codes = {"f": 0, "t": 1}
        df[feature] = df[feature].map(flag_codes)

    ######################################################################################################

    ''' NORMALIZATION '''

    ######################################################################################################

    def normalize_price(df,feature,method="standard score"):
        '''
        Normalize column `feature` in place.

        With method "standard score" every value is replaced by its standard score:
        the signed number of standard deviations by which it lies above or below
        the column mean. Any other method leaves the dataframe untouched.

        Standardized prices no longer depend on the currency, so listings from
        different cities (and times) can be concatenated.
        '''
        if method == "standard score":
            column = df[feature]
            # Only the requested column is standardized.
            df[feature] = (column - column.mean())/column.std()

    ######################################################################################################

    ''' FEATURE DELETION '''

    ######################################################################################################

    def drop_or_keep_feature(df, threshold, print_option = False):
        ''' 
        Sort the features of `df` by their share of missing values.

        A feature whose missing-value rate (rounded to 3 decimals) is below `threshold`
        goes to the keep list, every other feature goes to the drop list. Nothing is
        deleted here: the function only returns the tuple (keep_list, drop_list).
        With print_option=True the decision for each feature is printed as well.
        Threshold example: 30% is 0.3.
        '''
        n_rows = df.shape[0]
        verbose = (print_option == True)
        keep_list, drop_list = [], []

        for name in df.columns.values:
            rate = round(df[name].isna().sum()/n_rows,3)
            if rate < threshold:
                keep_list.append(name)
                if verbose:
                    print(name,"contain :%",100*rate ,"KEEP.")
                continue
            drop_list.append(name)
            if verbose:
                print(name,"contain :%",rate*100 ,"DROP !!!")

        return(keep_list, drop_list)

    ######################################################################################################

    ''' FEATURE ENGINEERING '''

    ######################################################################################################

    def city_center(city=df["city"].iloc[0]):
        '''
        Geocode a city name with geopy's Nominatim and return it as a
        (latitude, longitude) tuple.
        The default city is the "city" value of the first row of the listings
        dataframe, read when this function is defined.
        '''
        from geopy import Nominatim
        geocoder = Nominatim(user_agent="aleen_prd")
        match = geocoder.geocode(city, timeout=5)
        return (match.latitude, match.longitude)

    ######################################################################################################

    def distance_to_center(acc_lat, acc_lon, center, city_area):
        '''
        Great-circle distance in km (via geopy) between an accommodation and the
        city center, divided by `city_area`.
        '''
        from geopy.distance import great_circle
        km_from_center = great_circle(center, (acc_lat, acc_lon)).km
        return km_from_center/city_area

    ######################################################################################################

    def host_age(df, present_year):
        '''
        Add a 'host_age' column: `present_year` minus the year of 'host_since'.
        'host_since' must already be a datetime column.
        '''
        joined_year = df['host_since'].dt.year
        df['host_age'] = present_year - joined_year

    ######################################################################################################

    def assign_property_type_group():
        '''
        Fill the 'property_type_groups' column of the listings dataframe: every row whose
        'property_type' belongs to one of the group lists gets that group's name.
        Groups are applied in the order below, so a type listed in several groups
        ends up with the last matching one.
        '''
        group_names = ["apartment","house","secondary","unique","b&b","hotel","ambiguous"]
        group_members = [apartmentL,houseL,secondaryL,uniqueL,bedbreakfastL,hotelL,ambiguousL]

        for name, members in zip(group_names, group_members):
            in_group = df['property_type'].isin(members)
            df.loc[in_group, 'property_type_groups'] = name

    ######################################################################################################

    def common(row):
        '''Amenities of this row that belong to the common group (commonL), in row order.'''
        return [amenity for amenity in row['amenities'] if amenity in commonL]
    def additional(row):
        '''Amenities of this row that belong to the additional group (additionalL), in row order.'''
        return [amenity for amenity in row['amenities'] if amenity in additionalL]
    def family(row):
        '''Amenities of this row that belong to the family group (familyL), in row order.'''
        return [amenity for amenity in row['amenities'] if amenity in familyL]
    def logistics(row):
        '''Amenities of this row that belong to the logistics group (logisticsL), in row order.'''
        return [amenity for amenity in row['amenities'] if amenity in logisticsL]
    def homesafety(row):
        '''Amenities of this row that belong to the home safety group (homesafetyL), in row order.'''
        return [amenity for amenity in row['amenities'] if amenity in homesafetyL]
    def location(row):
        '''Amenities of this row that belong to the location group (locationL), in row order.'''
        return [amenity for amenity in row['amenities'] if amenity in locationL]
    def pets(row):
        '''Amenities of this row that belong to the pets group (petsL), in row order.'''
        return [amenity for amenity in row['amenities'] if amenity in petsL]
    def access(row):
        '''Amenities of this row that belong to the access group (accessL), in row order.'''
        return [amenity for amenity in row['amenities'] if amenity in accessL]
    def specialneeds(row):
        '''Amenities of this row that belong to the special needs group (specialneedsL), in row order.'''
        return [amenity for amenity in row['amenities'] if amenity in specialneedsL]
    def spacious(row):
        '''Amenities of this row that belong to the spacious group (spaciousL), in row order.'''
        return [amenity for amenity in row['amenities'] if amenity in spaciousL]
    def other(row):
        '''Amenities of this row that appear in none of the known groups (all_amenities), in row order.'''
        return [amenity for amenity in row['amenities'] if amenity not in all_amenities]

    ######################################################################################################

    # Setting up optimal display for a dataframe with so many features.
    pandas_display_options(3,150,150)

    ######################################################################################################

    commonL = ['Essentials','Kitchen','Air conditioning','Heating','Hair dryer','Hangers','Iron','Washer','Dryer','Hot water',
              'TV','Cable TV','Indoor fireplace','Private entrance','Private living room','Lock on bedroom door','Shampoo',
              'Shower gel','Bed linens','Extra pillows and blankets','Wifi','Ethernet connection','Pocket wifi','Laptop-friendly workspace',
              'Internet','Laptop friendly workspace','Washer / Dryer',' toilet']

    additionalL = ['Microwave','Coffee maker','Refrigerator','Dishwasher','Dishes and silverware','Cooking basics','Oven','Stove',
                  'Bread maker','Baking sheet','Barbeque utensils','Trash can','Free parking on premises','Free street parking',
                  'Paid parking off premises','Paid parking on premises','EV charger','Gym','Pool','Hot tub','Single level home',
                  'BBQ grill','Patio or balcony','Garden or backyard','Breakfast','Beach essentials',
                  'Handheld shower head','Lockbox','Hot water kettle','Firm mattress']

    familyL = ['Baby bath','Baby monitor','Babysitter recommendations','Bathtub','Changing table',"Children's books and toys","Children's dinnerware",
              'Crib','Fireplace guards','Game console','High chair','Outlet covers',"Pack'n Play/travel crib",'Room-darkening shades',
              'Stair gates','Table corner guards','Window guards',
              'Family/kid friendly','Pack ’n Play/travel crib','Children’s books and toys','Children’s dinnerware']

    logisticsL = ['Luggage dropoff allowed','Cleaning before checkout','Long term stays allowed',
                 'Host greets you','24-hour check-in','Self check-in','Smoking allowed','Suitable for events']

    homesafetyL = ['Fire extinguisher','Carbon monoxide alarm','Smoke alarm','First aid kit',
                  'Smoke detector','Carbon monoxide detector','Accessible-height bed','Safety card',
                  'Accessible-height toilet',]

    locationL = ['Beachfront','Lake access','Ski-in/Ski-out','Waterfront']

    petsL = ['Pets allowed','Cat(s)','Dog(s)','Pets live on this property']

    accessL = ['Buzzer/wireless intercom','Smart lock','No stairs or steps to enter','Elevator','Well-lit path to entrance',
              'Buzzer/wireless intercom','Ground floor access','Flat path to guest entrance','Doorman',
              'Keypad']

    specialneedsL = ['Step-free shower','Electric profiling bed','Wheelchair accessible','Disabled parking spot','Building staff']

    spaciousL = ['Wide entrance for guests','Wide hallways','Wide entrance','Wide entryway','Extra space around bed',
                'Wide doorway to guest bathroom','Wide clearance to shower']

    all_amenities = commonL + additionalL + familyL + logisticsL + homesafetyL + locationL + petsL + accessL + specialneedsL + spaciousL

    #####################################################################################################

    # Property type groups: each list holds the raw 'property_type' values that
    # assign_property_type_group() maps onto one group name.
    apartmentL = ['Apartment', 'Condominium', 'Loft']

    # 'Chalet' and 'Cottage' are two separate property types.
    # NOTE(review): 'Dome house' and 'Lighthouse' are also listed in uniqueL; because the
    # "unique" group is assigned after "house", those two types end up in "unique".
    houseL = ['House', 'Bungalow', 'Cabin', 'Chalet', 'Cottage', 'Cycladic house (Greece)', 
             'Dammuso (Italy)', 'Dome house', 'Lighthouse', 'Townhouse', 'Trullo (Italy)']

    secondaryL = ['Guesthouse', 'Guest suite']

    uniqueL = ['Barn', 'Boat', 'Bus', 'Camper/RV', 'Campsite', 'Castle', 'Cave', 'Dome house',
              'Igloo', 'Island', 'Lighthouse', 'Plane', 'Tent', 'Tipi', 'Train', 'Treehouse', 'Windmill', 'Yurt']

    bedbreakfastL = ['Bed and breakfast', 'Minsu (Taiwan)','Ryokan (Japan)']

    hotelL = ['Boutique hotel', 'Aparthotel', 'Heritage hotel (India)', 'Hostel', 'Hotel',
             'Resort', 'Kezhan (China)']

    ambiguousL = ['Casa particular (Cuba)', 'Serviced apartment', 'Pension (South Korea)','Tiny house',"Shepherd's hut (U.K.,France)",
            'Houseboat','Hut','Farm stay','Earth house']

    # Every property type that has an explicit group.
    all_listing_types = apartmentL + houseL + secondaryL + uniqueL + bedbreakfastL + hotelL + ambiguousL

    #####################################################################################################

    df["amenities"] = df["amenities"].apply(lambda x: x.replace('"', ''))
    df["amenities"] = df["amenities"].apply(lambda x: x[1:-1].replace("\'", "").split(","))

    #####################################################################################################

    df['amenities_common'] = df.apply(common, axis=1)
    df['amenities_additional'] = df.apply(additional, axis=1)
    df['amenities_family'] = df.apply(family, axis=1)
    df['amenities_logistics'] = df.apply(logistics, axis=1)
    df['amenities_safety'] = df.apply(homesafety, axis=1)
    df['amenities_location'] = df.apply(location, axis=1)
    df['amenities_pets'] = df.apply(pets, axis=1)
    df['amenities_access'] = df.apply(access, axis=1)
    df['amenities_special_needs'] = df.apply(specialneeds, axis=1)
    df['amenities_spacious'] = df.apply(spacious, axis=1)
    df['amenities_other'] = df.apply(other, axis=1)

    #####################################################################################################

    df['amenities_common'] = df['amenities_common'].map(amenities_list_counter)
    df['amenities_additional'] = df['amenities_additional'].map(amenities_list_counter)
    df['amenities_family'] = df['amenities_family'].map(amenities_list_counter)
    df['amenities_logistics'] = df['amenities_logistics'].map(amenities_list_counter)
    df['amenities_safety'] = df['amenities_safety'].map(amenities_list_counter)
    df['amenities_location'] = df['amenities_location'].map(amenities_list_counter)
    df['amenities_pets'] = df['amenities_pets'].map(amenities_list_counter)
    df['amenities_access'] = df['amenities_access'].map(amenities_list_counter)
    df['amenities_special_needs'] = df['amenities_special_needs'].map(amenities_list_counter)
    df['amenities_spacious'] = df['amenities_spacious'].map(amenities_list_counter)
    print("Amenities done.")
    #####################################################################################################

    assign_property_type_group()
    fill_nans(df,"property_type_groups","other")
    dummy = make_dummy(df,"property_type_groups")
    df = df.join(dummy)
    print("Property type done.")
    #####################################################################################################

    center = city_center()
    #city_area = 219.32
    df['distance_to_center'] = df.apply(lambda x: distance_to_center(x.latitude, x.longitude, center, city_area), axis=1)
    print("Distance done.")

    #####################################################################################################
    ######################################################################################################

    to_datetime_list = ['host_since'] # "last_scraped"
    for f in to_datetime_list:
        listings_to_datetime(df, f)

    price_to_num_list = ['price','security_deposit','cleaning_fee','extra_people','weekly_price','monthly_price']
    for f in price_to_num_list:
        price_to_float(df,f)

    percent_string_to_number(df, 'host_response_rate')

    print("Dates, percentages and prices done.")
    #####################################################################################################

    host_age(df,2020)
    print("Host age done.")
    #####################################################################################################

    text_features_list = ['name', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes',
                            'transit', 'access', 'interaction', 'house_rules', 'host_about']

    for f in text_features_list:
        fill_nans(df,f,"")

    for f in text_features_list:
        df[f] = df[f].str.split().str.len()
    print("Text length done.")
    df['summary_text'] = df['summary']
    df['summary'] = df['summary'].str.split().str.len()

    #####################################################################################################
    #####################################################################################################

    filling = [0.0, 1.0, "f", "t",'no_response',"no bed", "no bathroom","median", "mean"]

    # Filling NaN with zeros
    fillna_zeros_features_list = ["price",'weekly_price','monthly_price',"extra_people","cleaning_fee",
                                  "security_deposit","bathrooms","bedrooms","beds",
                                  "review_scores_rating","review_scores_accuracy"]
    for f in fillna_zeros_features_list:
        fill_nans(df,f,filling[0])

    # Filling NaN with f
    fillna_features_list = ["host_is_superhost","instant_bookable","is_business_travel_ready", "host_identity_verified",
                            "require_guest_profile_picture","require_guest_phone_verification"]
    for f in fillna_features_list:
        fill_nans(df,f,filling[2])

    # Filling NaN with "no response"
        fill_nans(df,"host_response_time",filling[4])

    # Filling NaN with "no bed"
    fill_nans(df,"bed_type",filling[5])

    # Filling NaN with mean
    fill_nans(df,'price',filling[8])
    print("Filling NaNs done.")

    fill_nans(df,'host_age',0)
    fill_nans(df,'reviews_per_month',0)
    fill_nans(df,'host_total_listings_count',1)
    fill_nans(df,'host_response_rate',0)

    #####################################################################################################
    #####################################################################################################

    features_with_outliers_list_0 = ["price"]
    for f in features_with_outliers_list_0:
        remove_outliers(df, f, bound = "upper", method = "iqr")

    features_with_outliers_list_1 = ["cleaning_fee","security_deposit","extra_people",
                                     "accommodates","bathrooms","bedrooms","beds",
                                     "host_total_listings_count"]
    for f in features_with_outliers_list_1:
        remove_outliers(df, f, bound = "upper", method = "iqr")

    delete_rows_conditional(df, feature1='maximum_nights', condition1='greater_than', threshold1=1125)
    delete_rows_conditional(df, feature1='accommodates', condition1='equal_to', threshold1=0)
    delete_rows_conditional(df, feature1='price', condition1='equal_to', threshold1=0)
    print("Outliers statistically removed done.")
    #####################################################################################################

    label_encoding_list = ["host_identity_verified","host_has_profile_pic","instant_bookable",
                          "host_is_superhost","is_business_travel_ready",
                           "require_guest_profile_picture","require_guest_phone_verification"]
    for f in label_encoding_list:
        map_feature(df,f)
    print("Labels encoded done.") 
    #####################################################################################################

    dummy_feature_list = ["host_response_time","cancellation_policy",
                         "room_type","bed_type"] 
    # "host_verifications"
    for f in dummy_feature_list:
        dumm = make_dummy(df,f)
        df = pd.concat([df,dumm], axis=1)
    print("Makind dummies done.")
    #####################################################################################################
    #
    drop_list = ['thumbnail_url',
     'medium_url',
     'xl_picture_url',
     'host_name',
     'host_since',
     'host_location',
     'host_acceptance_rate',
     'host_thumbnail_url',
     'host_picture_url',
     'host_neighbourhood',
     'host_listings_count',
     'host_has_profile_pic',
     'neighbourhood',
     'neighbourhood_group_cleansed',
     'state',
     'zipcode',
     'market',
     'square_feet',
     'first_review',
     'last_review',
     'review_scores_cleanliness',
     'review_scores_checkin',
     'review_scores_communication',
     'review_scores_location',
     'review_scores_value',
     'license',
     'jurisdiction_names']
    #####################################################################################################

    df.drop(drop_list, axis=1,inplace=True)
    df.drop(['amenities','amenities_other'], axis=1,inplace=True)
    df.drop(["property_type","property_type_groups"], axis=1,inplace=True)

    additional_drop_list = ["scrape_id","last_scraped","name","picture_url","host_id","host_url",
                           "host_verifications","street","neighbourhood_cleansed","city","smart_location","country_code",
                           "country","latitude","longitude","is_location_exact","room_type","bed_type",
                           "minimum_minimum_nights","maximum_minimum_nights","minimum_maximum_nights",
                            "maximum_maximum_nights","minimum_nights_avg_ntm","maximum_nights_avg_ntm","calendar_updated","calendar_last_scraped",
                            "has_availability","availability_30","availability_60","availability_90","availability_365",
                            "number_of_reviews_ltm","requires_license",
                            "host_identity_verified","experiences_offered",
                            "host_response_time","cancellation_policy",
                            #"listing_url","id"
                           ]
    df.drop(additional_drop_list, axis=1,inplace=True)
    print("Dropped some extra features.")
    #####################################################################################################

    #normalization_list = ['price','security_deposit','cleaning_fee','extra_people','weekly_price','monthly_price']
    #for n in normalization_list:
       # normalize_price(df,n)
    #print("Normalization done.")

    # Runtime ending time
    end_time = time.time()

    print("Finished in: ", end_time - start_time)
    return(df)
    

### 1) Amsterdam

In [4]:
import pandas as pd
# Load the raw Amsterdam listings export to check its size before preprocessing.
# low_memory=False matches the read inside listing_preprocessing: pandas infers
# each column's dtype from the whole file instead of chunk by chunk, which
# otherwise can raise a DtypeWarning on mixed-type columns.
ams = pd.read_csv(r"C:\Users\aleen\Desktop\Master Thesis\Data\amsterdam\listings\listings.csv", low_memory=False)
ams.shape

  interactivity=interactivity, compiler=compiler, result=result)


(20025, 106)

In [5]:
# Run the preprocessing pipeline on the raw Amsterdam listings file.
# city_area is forwarded to the distance_to_center feature (units are not shown
# in this notebook section -- presumably km^2; confirm).
amsterdam = listing_preprocessing(
    root_folder=r"C:\Users\aleen\Desktop\Master Thesis\Data\amsterdam\listings\listings.csv",
    city_area=219.32,
)
amsterdam.shape

Amenities done.
Property type done.
Distance done.
Dates, percentages and prices done.
Host age done.
Text length done.
Filling NaNs done.
Outliers statistically removed done.
Labels encoded done.
Makind dummies done.
Dropped some extra features.
Finished in:  13.160455226898193


In [10]:
# Save the preprocessed Amsterdam listings to the Reviews folder.
amsterdam.to_csv(r"C:\Users\aleen\Desktop\Reviews\amsterdam\amsterdam_listings_root.csv")

### 2) Athens

In [11]:
# Load the raw Athens listings export to check its size before preprocessing.
# low_memory=False matches the read inside listing_preprocessing and avoids
# chunk-wise dtype inference (a source of DtypeWarning on mixed-type columns).
ath = pd.read_csv(r"C:\Users\aleen\Desktop\Master Thesis\Data\athens\athens_listings.csv", low_memory=False)
ath.shape

  interactivity=interactivity, compiler=compiler, result=result)


(11396, 106)

In [12]:
# Run the preprocessing pipeline on the raw Athens listings file.
# city_area is forwarded to the distance_to_center feature (units are not shown
# in this notebook section -- presumably km^2; confirm).
athens = listing_preprocessing(
    root_folder=r"C:\Users\aleen\Desktop\Master Thesis\Data\athens\athens_listings.csv",
    city_area=412,
)
athens.shape

Amenities done.
Property type done.
Distance done.
Dates, percentages and prices done.
Host age done.
Text length done.
Filling NaNs done.
Outliers statistically removed done.
Labels encoded done.
Makind dummies done.
Dropped some extra features.
Finished in:  7.308902740478516


In [41]:
# Spot-check the mean security deposit in the preprocessed Athens data.
# NOTE(review): the recorded result is 0.0 -- confirm that the zero-filling of
# NaNs plus the upper-bound outlier removal in listing_preprocessing is not
# discarding every listing with a non-zero deposit.
athens['security_deposit'].mean()

0.0

In [13]:
# Save the preprocessed Athens listings to the Reviews folder.
athens.to_csv(r"C:\Users\aleen\Desktop\Reviews\athens\athens_listings_root.csv")

### 3) Copenhagen

In [15]:
# Load the raw Copenhagen listings export to check its size before preprocessing.
# The former chained assignment (`cop = ber = ...`) also bound `ber` to the
# Copenhagen data, a copy-paste leftover; only `cop` is assigned now.
# low_memory=False matches the read inside listing_preprocessing and avoids
# chunk-wise dtype inference (a source of DtypeWarning on mixed-type columns).
cop = pd.read_csv(r"C:\Users\aleen\Desktop\Master Thesis\Data\copenhagen\copenhagen_listings.csv", low_memory=False)
cop.shape

  interactivity=interactivity, compiler=compiler, result=result)


(28418, 106)

In [16]:
# Run the preprocessing pipeline on the raw Copenhagen listings file.
# city_area is forwarded to the distance_to_center feature (units are not shown
# in this notebook section -- presumably km^2; confirm).
copenhagen = listing_preprocessing(
    root_folder=r"C:\Users\aleen\Desktop\Master Thesis\Data\copenhagen\copenhagen_listings.csv",
    city_area=292.5,
)
copenhagen.shape

Amenities done.
Property type done.
Distance done.
Dates, percentages and prices done.
Host age done.
Text length done.
Filling NaNs done.
Outliers statistically removed done.
Labels encoded done.
Makind dummies done.
Dropped some extra features.
Finished in:  16.681175470352173


(12325, 78)

In [17]:
# Save the preprocessed Copenhagen listings to the Reviews folder.
copenhagen.to_csv(r"C:\Users\aleen\Desktop\Reviews\copenhagen\copenhagen_listings_root.csv")

In [42]:
# Spot-check the mean security deposit in the preprocessed Copenhagen data.
# NOTE(review): the recorded result is 0.0 -- confirm that the zero-filling of
# NaNs plus the upper-bound outlier removal in listing_preprocessing is not
# discarding every listing with a non-zero deposit.
copenhagen['security_deposit'].mean()

0.0

In [26]:
def Diff(li1, li2):
    """Return the items of ``li1`` that do not occur in ``li2``.

    Each distinct item is reported once, in the order it first appears in
    ``li1``. The previous ``list(set(li1) - set(li2))`` form returned the same
    items but in arbitrary set order, which can change from run to run for
    strings, making column comparisons hard to reproduce.

    Parameters
    ----------
    li1, li2 : iterable of hashable items
        For example two lists of column names.

    Returns
    -------
    list
        Items present in ``li1`` and absent from ``li2``, without duplicates.
    """
    # Start from everything in li2; items already emitted are added as we go
    # so that later duplicates in li1 are skipped too.
    excluded = set(li2)
    remaining = []
    for item in li1:
        if item not in excluded:
            excluded.add(item)
            remaining.append(item)
    return remaining

In [27]:
# Columns that the preprocessed Athens table has but the Copenhagen table lacks.
Diff(athens.columns.tolist(), copenhagen.columns.tolist())

['super_strict_30']

### 4) Berlin 

In [39]:
# Load the raw Berlin listings export to check its size before preprocessing.
# low_memory=False matches the read inside listing_preprocessing and avoids
# chunk-wise dtype inference (a source of DtypeWarning on mixed-type columns).
ber = pd.read_csv(r"C:\Users\aleen\Desktop\Master Thesis\Data\berlin\berlin_listings.csv", low_memory=False)
ber.shape

  interactivity=interactivity, compiler=compiler, result=result)


(25197, 106)

In [18]:
# Run the preprocessing pipeline on the raw Berlin listings file.
# city_area is forwarded to the distance_to_center feature (units are not shown
# in this notebook section -- presumably km^2; confirm).
berlin = listing_preprocessing(
    root_folder=r"C:\Users\aleen\Desktop\Master Thesis\Data\berlin\berlin_listings.csv",
    city_area=891.1,
)
berlin.shape

Amenities done.
Property type done.
Distance done.
Dates, percentages and prices done.
Host age done.
Text length done.
Filling NaNs done.
Outliers statistically removed done.
Labels encoded done.
Makind dummies done.
Dropped some extra features.
Finished in:  15.47740912437439


(9592, 79)

In [43]:
# Spot-check the mean security deposit in the preprocessed Berlin data
# (bracket access, matching the Athens and Copenhagen checks).
berlin['security_deposit'].mean()

29.0284612176814

In [19]:
# Save the preprocessed Berlin listings to the Reviews folder.
berlin.to_csv(r"C:\Users\aleen\Desktop\Reviews\berlin\berlin_listings_root.csv")

### 5) Madrid

In [42]:
# Load the raw Madrid listings export to check its size before preprocessing.
# low_memory=False matches the read inside listing_preprocessing and avoids
# chunk-wise dtype inference (a source of DtypeWarning on mixed-type columns).
mad = pd.read_csv(r"C:\Users\aleen\Desktop\Master Thesis\Data\madrid\madrid_listings.csv", low_memory=False)
mad.shape

  interactivity=interactivity, compiler=compiler, result=result)


(21845, 106)

In [20]:
# Run the preprocessing pipeline on the raw Madrid listings file.
# city_area is forwarded to the distance_to_center feature (units are not shown
# in this notebook section -- presumably km^2; confirm).
madrid = listing_preprocessing(
    root_folder=r"C:\Users\aleen\Desktop\Master Thesis\Data\madrid\madrid_listings.csv",
    city_area=604.3,
)
madrid.shape

Amenities done.
Property type done.
Distance done.
Dates, percentages and prices done.
Host age done.
Text length done.
Filling NaNs done.
Outliers statistically removed done.
Labels encoded done.
Makind dummies done.
Dropped some extra features.
Finished in:  13.804437398910522


(9204, 80)

In [21]:
# Save the preprocessed Madrid listings to the Reviews folder.
madrid.to_csv(r"C:\Users\aleen\Desktop\Reviews\madrid\madrid_listings_root.csv")

### 6) London

In [47]:
# Load the raw London listings export to check its size before preprocessing.
# low_memory=False matches the read inside listing_preprocessing and avoids
# chunk-wise dtype inference (a source of DtypeWarning on mixed-type columns).
lon = pd.read_csv(r"C:\Users\aleen\Desktop\Master Thesis\Data\london\london_listings.csv", low_memory=False)
lon.shape

  interactivity=interactivity, compiler=compiler, result=result)


(87571, 106)

In [30]:
# Run the preprocessing pipeline on the raw London listings file.
# city_area is forwarded to the distance_to_center feature (units are not shown
# in this notebook section -- presumably km^2; confirm).
london = listing_preprocessing(
    root_folder=r"C:\Users\aleen\Desktop\Master Thesis\Data\london\london_listings.csv",
    city_area=1737.9,
)
london.shape

Amenities done.
Property type done.
Distance done.
Dates, percentages and prices done.
Host age done.
Text length done.
Filling NaNs done.
Outliers statistically removed done.
Labels encoded done.
Makind dummies done.
Dropped some extra features.
Finished in:  52.11855912208557


(30002, 81)

In [31]:
# Save the preprocessed London listings to the Reviews folder.
london.to_csv(r"C:\Users\aleen\Desktop\Reviews\london\london_listings_root.csv")

### 7) Paris

In [50]:
# Load the raw Paris listings export to check its size before preprocessing.
# low_memory=False matches the read inside listing_preprocessing and avoids
# chunk-wise dtype inference (a source of DtypeWarning on mixed-type columns).
par = pd.read_csv(r"C:\Users\aleen\Desktop\Master Thesis\Data\paris\paris_listings.csv", low_memory=False)
par.shape

  interactivity=interactivity, compiler=compiler, result=result)


(66414, 106)

In [32]:
# Run the preprocessing pipeline on the raw Paris listings file.
# city_area is forwarded to the distance_to_center feature (units are not shown
# in this notebook section -- presumably km^2; confirm).
paris = listing_preprocessing(
    root_folder=r"C:\Users\aleen\Desktop\Master Thesis\Data\paris\paris_listings.csv",
    city_area=105.4,
)
paris.shape

Amenities done.
Property type done.
Distance done.
Dates, percentages and prices done.
Host age done.
Text length done.
Filling NaNs done.
Outliers statistically removed done.
Labels encoded done.
Makind dummies done.
Dropped some extra features.
Finished in:  40.774916887283325


(33203, 80)

In [33]:
# Save the preprocessed Paris listings to the Reviews folder.
paris.to_csv(r"C:\Users\aleen\Desktop\Reviews\paris\paris_listings_root.csv")

### 8) Rome

In [56]:
# Load the raw Rome listings export to check its size before preprocessing.
# low_memory=False matches the read inside listing_preprocessing and avoids
# chunk-wise dtype inference (a source of DtypeWarning on mixed-type columns).
rom = pd.read_csv(r"C:\Users\aleen\Desktop\Master Thesis\Data\rome\rome_listings.csv", low_memory=False)
rom.shape

  interactivity=interactivity, compiler=compiler, result=result)


(31202, 106)

In [27]:
# Run the preprocessing pipeline on the raw Rome listings file.
# city_area is forwarded to the distance_to_center feature (units are not shown
# in this notebook section -- presumably km^2; confirm).
rome = listing_preprocessing(
    root_folder=r"C:\Users\aleen\Desktop\Master Thesis\Data\rome\rome_listings.csv",
    city_area=1285,
)
rome.shape

Amenities done.
Property type done.
Distance done.
Dates, percentages and prices done.
Host age done.
Text length done.
Filling NaNs done.
Outliers statistically removed done.
Labels encoded done.
Makind dummies done.
Dropped some extra features.
Finished in:  19.420282125473022


(15264, 80)

In [29]:
# Save the preprocessed Rome listings to the Reviews folder.
rome.to_csv(r"C:\Users\aleen\Desktop\Reviews\rome\rome_listings_root.csv")

### 9) Prague

In [57]:
# Load the raw Prague listings export to check its size before preprocessing.
# low_memory=False matches the read inside listing_preprocessing and avoids
# chunk-wise dtype inference (a source of DtypeWarning on mixed-type columns).
pra = pd.read_csv(r"C:\Users\aleen\Desktop\Master Thesis\Data\prague\prague_listings.csv", low_memory=False)
pra.shape

  interactivity=interactivity, compiler=compiler, result=result)


(14560, 106)

In [25]:
# Run the preprocessing pipeline on the raw Prague listings file.
# city_area is forwarded to the distance_to_center feature (units are not shown
# in this notebook section -- presumably km^2; confirm).
prague = listing_preprocessing(
    root_folder=r"C:\Users\aleen\Desktop\Master Thesis\Data\prague\prague_listings.csv",
    city_area=298,
)
prague.shape

Amenities done.
Property type done.
Distance done.
Dates, percentages and prices done.
Host age done.
Text length done.
Filling NaNs done.
Outliers statistically removed done.
Labels encoded done.
Makind dummies done.
Dropped some extra features.
Finished in:  9.59473466873169


(7052, 80)

In [26]:
# Save the preprocessed Prague listings to the Reviews folder.
prague.to_csv(r"C:\Users\aleen\Desktop\Reviews\prague\prague_listings_root.csv")

### 10) Vienna

In [59]:
# Load the raw Vienna listings export to check its size before preprocessing.
# low_memory=False matches the read inside listing_preprocessing and avoids
# chunk-wise dtype inference (a source of DtypeWarning on mixed-type columns).
vie = pd.read_csv(r"C:\Users\aleen\Desktop\Master Thesis\Data\vienna\vienna_listings.csv", low_memory=False)
vie.shape

  interactivity=interactivity, compiler=compiler, result=result)


(13162, 106)

In [22]:
# Run the preprocessing pipeline on the raw Vienna listings file.
# city_area is forwarded to the distance_to_center feature (units are not shown
# in this notebook section -- presumably km^2; confirm).
vienna = listing_preprocessing(
    root_folder=r"C:\Users\aleen\Desktop\Master Thesis\Data\vienna\vienna_listings.csv",
    city_area=414.78,
)
vienna.shape

Amenities done.
Property type done.
Distance done.
Dates, percentages and prices done.
Host age done.
Text length done.
Filling NaNs done.
Outliers statistically removed done.
Labels encoded done.
Makind dummies done.
Dropped some extra features.
Finished in:  8.382618427276611


(5662, 80)

In [24]:
# Save the preprocessed Vienna listings to the Reviews folder.
vienna.to_csv(r"C:\Users\aleen\Desktop\Reviews\vienna\vienna_listings_root.csv")