In [1]:
"""
DATA 144: Data Mining and Analytics - Fall 2019
Airbnb Project

Charis Chan. Joyce Ching. Chloe Lee. Brian Yang
"""

'\nDATA 144: Data Mining and Analytics - Fall 2019\nAirbnb Project\n\nCharis Chan. Joyce Ching. Chloe Lee. Brian Yang\n'

**Datasets**

Reviews: Detailed Review Data for listings in San Francisco

Listing: Detailed Listings data for San Francisco

(data from 2018-2019)

**Goal: predicting Airbnb prices for new listings in September 2019**

**Columns to Use**

Listings dataset:
* id
* last_scraped
* name
* summary 
* space
* description
* neighborhood_overview 
* transit
* house_rules (length, number of no's,fine, using vader sentiment) 
* host_id
* neighbourhood_cleansed (one-hot)
* property_type
* room_type
* accommodates
* bathrooms
* bedrooms
* beds
* bed_type(one-hot)
* amenities (one-hot)
* price
* weekly_price (is option?, \$ saved vs price)
* monthly_price (is option?, \$ saved vs price)
* security_deposit
* guest_included
* extra_people
* minimum_nights
* maximum_nights
* availability_30
* number_of_reviews
* number_of_reviews_ltm
* review_scores_ratings (is it better to break it down?)
* cancellation_policy (one-hot)


New variables:
*   VADER sentiment analysis: 
    *   summary
    *   space
    *   description
    *   neighborhood_overview
    *   house_rules

*   host score (subjective, but explain logic behind how score is determined):
    *   superhost
    *   response rate
    *   response time
    *   verified identity
    *   host_since
    *   written things in host_about

* amenities score
    *   available amenities
* avg_review_sentiment, join by listing_id
  *  comments (vader sentiment analysis)
* sd_review_sentiment, join by listing_id
  * comments (vader sentiment analysis)
  





**If predicting September of 2019 prices, is it better to predict using:**

1.   all of 2018
2.   September 2018 (same month, 1yr ago)
3.   August 2019



**Import datasets**

In [2]:
import pandas as pd
import numpy as np


In [3]:
oct18_list = pd.read_csv('airbnb_data/listings10_18.csv')
oct19_list = pd.read_csv('airbnb_data/listings10_19.csv')
sept19_list = pd.read_csv('airbnb_data/listings9_19.csv')
reviews = pd.read_csv('airbnb_data/reviews10_19.csv')


In [4]:
sept19_list.shape

(7933, 106)

In [5]:
sept19_list.iloc[1272]

id                                                                                        4175303
listing_url                                                  https://www.airbnb.com/rooms/4175303
scrape_id                                                                          20190912153044
last_scraped                                                                           2019-09-12
name                                                              Central & lovely full apartment
summary                                         A private-entrance, a wood floored apartment a...
space                                           The main bedroom has king-sized bed. The room ...
description                                     A private-entrance, a wood floored apartment a...
experiences_offered                                                                          none
neighborhood_overview                           We love the centrality of this location.  Haye...
notes               

**One-Hot Encoding**

In [6]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer

In [7]:
# price as a float instead of string
oct18_list['price'] = oct18_list['price'].str.replace('$', '').str.replace(',', '').astype(float)
oct19_list['price'] = oct19_list['price'].str.replace('$', '').str.replace(',', '').astype(float)
sept19_list['price'] = sept19_list['price'].str.replace('$', '').str.replace(',', '').astype(float)

In [8]:
# security_deposit as a float instead of string
oct18_list['security_deposit'] = oct18_list['security_deposit'].str.replace('$', '').str.replace(',', '').astype(float)
oct19_list['security_deposit'] = oct19_list['security_deposit'].str.replace('$', '').str.replace(',', '').astype(float)
sept19_list['security_deposit'] = sept19_list['security_deposit'].str.replace('$', '').str.replace(',', '').astype(float)

In [9]:
oct18_list['security_deposit'] = oct18_list['security_deposit'].fillna(0)
oct19_list['security_deposit'] = oct19_list['security_deposit'].fillna(0)
sept19_list['security_deposit'] = sept19_list['security_deposit'].fillna(0)

In [10]:
oct18_list['guests_included'] = oct18_list['guests_included'].astype(float)
oct19_list['guests_included'] = oct19_list['guests_included'].astype(float)
sept19_list['guests_included'] = sept19_list['guests_included'].astype(float)

In [11]:
#bed_type one-hot transformation
bed_type = LabelEncoder()
bed_type.fit(['Real Bed', 'Futon', 'Airbed', 'Pull-out Sofa', 'Couch'])
oct18_list['bed_type_trans'] = bed_type.transform(oct18_list['bed_type'])
oct19_list['bed_type_trans'] = bed_type.transform(oct19_list['bed_type'])
sept19_list['bed_type_trans'] = bed_type.transform(sept19_list['bed_type'])

In [12]:
#cancellation_policy one-hot transformation
cancellation_policy = LabelEncoder()
cancellation_policy.fit(['moderate', 'strict_14_with_grace_period', 'flexible',
       'super_strict_30', 'strict', 'super_strict_60'])
oct18_list['cancellation_policy_trans'] = cancellation_policy.transform(oct18_list['cancellation_policy'])
oct19_list['cancellation_policy_trans'] = cancellation_policy.transform(oct19_list['cancellation_policy'])
sept19_list['cancellation_policy_trans'] = cancellation_policy.transform(sept19_list['cancellation_policy'])

In [13]:
#neighbourhood_cleansed one-hot transformation
neighbourhood_cleansed = LabelEncoder()
neighbourhood_cleansed.fit(['Western Addition', 'Inner Sunset', 'Bernal Heights',
       'Haight Ashbury', 'Mission', 'Potrero Hill', 'Nob Hill',
       'Downtown/Civic Center', 'Castro/Upper Market', 'South of Market',
       'Noe Valley', 'Outer Richmond', 'Presidio Heights', 'Glen Park',
       'Ocean View', 'Pacific Heights', 'Financial District',
       'Twin Peaks', 'Russian Hill', 'Outer Sunset', 'Marina',
       'Inner Richmond', 'Excelsior', 'Seacliff', 'Chinatown',
       'West of Twin Peaks', 'Bayview', 'North Beach', 'Diamond Heights',
       'Outer Mission', 'Parkside', 'Lakeshore', 'Crocker Amazon',
       'Golden Gate Park', 'Visitacion Valley', 'Presidio',
       'Treasure Island/YBI'])
oct18_list['neighbourhood_cleansed_trans'] = neighbourhood_cleansed.transform(oct18_list['neighbourhood_cleansed'])
oct19_list['neighbourhood_cleansed_trans'] = neighbourhood_cleansed.transform(oct19_list['neighbourhood_cleansed'])
sept19_list['neighbourhood_cleansedy_trans'] = neighbourhood_cleansed.transform(sept19_list['neighbourhood_cleansed'])

In [14]:
#room_type one-hot transformation
room_type = LabelEncoder()
room_type.fit(['Entire home/apt', 'Private room', 'Shared room', 'Hotel room'])
oct18_list['room_type_trans'] = room_type.transform(oct18_list['room_type'])
oct19_list['room_type_trans'] = room_type.transform(oct19_list['room_type'])
sept19_list['room_type_trans'] = room_type.transform(sept19_list['room_type'])

In [15]:
#count the number of amenities of each row

oct19_list['len_amenities'] = oct19_list['amenities'].apply(lambda x: len(x.split(',')))
sept19_list['len_amenities'] = sept19_list['amenities'].apply(lambda x: len(x.split(',')))

oct18_list['len_amenities'] = oct18_list['amenities'].apply(lambda x: len(x.split(',')))


In [16]:
#create columns by picking some amenities

amenities = {'tv_trans':'TV', 'internet_trans':'Internet', 
             'wifi_trans':'Wifi', 'kitchen_trans':'Kitchen', 
             'heating_trans':'Heating', 'air_conditioning_trans':'Air conditioning', 
             'dryer_trans':'Dryer', '24hr_checkin_trans':'24-hour check-in', 
             'self_checkin_trans':'Self check-in', 'breakfast_trans':'Breakfast', 
             'washer_trans':'Washer', 'smoke_detector_trans':'Smoke detector', 
             'host_greets_trans':'Host greets you', 'hot_water_trans':'Hot water', 
             'parking_trans':'parking', 'balcony_trans':'Patio or balcony', 
             'garden_trans':'Garden or backyard', 'cooking_trans':'Cooking basics', 
             'BBQ_trans':'BBQ grill', 'oven_trans':'Oven', 
             'stove_trans':'Stove', 'hairdryer_trans':'Hair dryer', 
             'dishwasher_trans':'Dishwasher', 'fridge_trans':'Refrigerator', 
             'coffeemaker_trans':'Coffee maker', 'microwave_trans':'Microwave', 
             'private_bath_trans':'Private bathroom', 'laptop_friendly_trans':'Laptop friendly workspace', 
             'essentials_trans':'Essentials', 'lock_trans':'Lock on bedroom door', 
             'dishes_trans':'Dishes and silverware'}

for a in amenities:
    oct18_list[a] = oct18_list['amenities'].str.contains(amenities[a]).astype(int)
    oct19_list[a] = oct19_list['amenities'].str.contains(amenities[a]).astype(int)
    sept19_list[a] = sept19_list['amenities'].str.contains(amenities[a]).astype(int)

Host Scores

In [17]:
oct18_host = oct18_list[['host_since', 
                         'host_response_time', 
                         'host_response_rate', 
                         'host_is_superhost', 
                         'host_verifications', 
                         'host_has_profile_pic', 
                         'host_identity_verified']]

oct18_h_since = 2018 - pd.to_numeric(oct18_host.host_since.str[:4])
oct18_host['host_since'] = oct18_h_since / max(oct18_h_since) * 25
oct18_h_sh = (oct18_host.host_is_superhost == 't')
oct18_host['host_is_superhost'] = oct18_h_sh * 20
oct18_h_rr = pd.to_numeric(oct18_host.host_response_rate.str[:-1])
oct18_host['host_response_rate'] = oct18_h_rr.fillna(0) / 100 * 10
oct18_h_verif = oct18_host.host_verifications.apply(lambda x: len(x.split(',')))
oct18_host['host_verifications'] = oct18_h_verif / max(oct18_h_verif) * 15
oct18_h_pp = (oct18_host.host_has_profile_pic == 't')
oct18_host['host_has_profile_pic'] = oct18_h_pp * 5
oct18_h_id = (oct18_host.host_identity_verified == 't')
oct18_host['host_identity_verified'] = oct18_h_id * 15

oct18_host.loc[oct18_host['host_response_time'] == 'within an hour'] = 10
oct18_host.loc[oct18_host['host_response_time'] == 'within a few hours'] = 7.5
oct18_host.loc[oct18_host['host_response_time'] == 'within a day'] = 5
oct18_host.loc[oct18_host['host_response_time'] == 'a few days or more'] = 2.5
oct18_host['host_response_time'] = oct18_host.host_response_time.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydat

In [18]:
oct19_host = oct19_list[['host_since', 
                         'host_response_time', 
                         'host_response_rate', 
                         'host_is_superhost', 
                         'host_verifications', 
                         'host_has_profile_pic', 
                         'host_identity_verified']]

oct19_h_since = 2019 - pd.to_numeric(oct19_host.host_since.str.slice(-4))
oct19_host['host_since'] = oct19_h_since / max(oct19_h_since) * 25
oct19_h_sh = (oct19_host.host_is_superhost == 't')
oct19_host['host_is_superhost'] = oct19_h_sh * 20
oct19_h_rr = pd.to_numeric(oct19_host.host_response_rate.str[:-1])
oct19_host['host_response_rate'] = oct19_h_rr.fillna(0) / 100 * 10
oct19_h_verif = oct19_host.host_verifications.apply(lambda x: len(x.split(',')))
oct19_host['host_verifications'] = oct19_h_verif / max(oct19_h_verif) * 15
oct19_h_pp = (oct19_host.host_has_profile_pic == 't')
oct19_host['host_has_profile_pic'] = oct19_h_pp * 5
oct19_h_id = (oct19_host.host_identity_verified == 't')
oct19_host['host_identity_verified'] = oct19_h_id * 15

oct19_host.loc[oct19_host['host_response_time'] == 'within an hour'] = 10
oct19_host.loc[oct19_host['host_response_time'] == 'within a few hours'] = 7.5
oct19_host.loc[oct19_host['host_response_time'] == 'within a day'] = 5
oct19_host.loc[oct19_host['host_response_time'] == 'a few days or more'] = 2.5
oct19_host['host_response_time'] = oct19_host.host_response_time.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydat

In [19]:
sept19_host = sept19_list[['host_since', 
                         'host_response_time', 
                         'host_response_rate', 
                         'host_is_superhost', 
                         'host_verifications', 
                         'host_has_profile_pic', 
                         'host_identity_verified']]

sept19_h_since = 2018 - pd.to_numeric(sept19_host.host_since.str[:4])
sept19_host['host_since'] = sept19_h_since / max(sept19_h_since) * 25
sept19_h_sh = (sept19_host.host_is_superhost == 't')
sept19_host['host_is_superhost'] = sept19_h_sh * 20
sept19_h_rr = pd.to_numeric(sept19_host.host_response_rate.str[:-1])
sept19_host['host_response_rate'] = sept19_h_rr.fillna(0) / 100 * 10
sept19_h_verif = sept19_host.host_verifications.apply(lambda x: len(x.split(',')))
sept19_host['host_verifications'] = sept19_h_verif / max(sept19_h_verif) * 15
sept19_h_pp = (sept19_host.host_has_profile_pic == 't')
sept19_host['host_has_profile_pic'] = sept19_h_pp * 5
sept19_h_id = (sept19_host.host_identity_verified == 't')
sept19_host['host_identity_verified'] = sept19_h_id * 15

sept19_host.loc[sept19_host['host_response_time'] == 'within an hour'] = 10
sept19_host.loc[sept19_host['host_response_time'] == 'within a few hours'] = 7.5
sept19_host.loc[sept19_host['host_response_time'] == 'within a day'] = 5
sept19_host.loc[sept19_host['host_response_time'] == 'a few days or more'] = 2.5
sept19_host['host_response_time'] = sept19_host.host_response_time.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydat

In [20]:
#create column for host_score (out of 100)

oct18_list['host_score'] = oct18_host.sum(axis=1)
oct19_list['host_score'] = oct19_host.sum(axis=1)
sept19_list['host_score'] = sept19_host.sum(axis=1)

VADER Sentiment

In [21]:
#Download VADER database
vader = pd.read_table("vader_lexicon.txt", header = None, index_col = 0)[[1]].rename(columns = {1: 'polarity'})

  


In [22]:
datasets = [oct18_list, oct19_list, sept19_list]
columns = ['summary', 'space', 'description', 'neighborhood_overview', 'house_rules']

for d in datasets:
  for i in columns:
#manipulate 'text' column: lowercase + remove punctuation
    d[i] = d[i].str.lower() #change to lowercase
    punct_re = r'[^\w\s]' #remove punctuation
    d[i] = d[i].str.replace(punct_re, ' ')


In [23]:
for d in datasets:
  for i in columns:
  #tidy format: splits words for aggregation
    tidy_format = (d[i]
                  .str.split(expand = True)
                  .stack()
                  .reset_index(level=1)
                  .rename(columns = {'level_1': 'num', 0: 'word'})
                  )
  #create new 'polarity' colun with polarity for each review
    d[i + '_polarity'] = (tidy_format
                    .merge(vader, how = 'left', left_on = 'word', right_index = True)
                    .reset_index()
                    .loc[:, ['index', 'polarity']]
                    .groupby('index')
                    .sum()
                    .fillna(0)
                    )

In [24]:
#VADER on reviews
#manipulate 'text' column: lowercase + remove punctuation
reviews['comments'] = reviews['comments'].str.lower() #change to lowercase
punct_re = r'[^\w\s]' #remove punctuation
reviews['comments'] = reviews['comments'].str.replace(punct_re, ' ')

#tidy format: splits words for aggregation
tidy_format = (reviews['comments']
              .str.split(expand = True)
              .stack()
              .reset_index(level=1)
              .rename(columns = {'level_1': 'num', 0: 'word'})
              )
#create new 'polarity' colun with polarity for each review
reviews['polarity'] = (tidy_format
                .merge(vader, how = 'left', left_on = 'word', right_index = True)
                .reset_index()
                .loc[:, ['index', 'polarity']]
                .groupby('index')
                .sum()
                .fillna(0)
                )

In [25]:
#join reviews to listing (find avg and stdv)
  #filter reviews to get appropriate ones for each list/timeframe
  #groupby listing_id
  #find avg + stdv for each listing_id
  #left join list + reviews
  
import datetime

reviews['year'] = reviews['date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').year)
reviews['month'] = reviews['date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').month)

oct19_reviews = reviews[(reviews['year'] == 2019) & (reviews['month'] == 10 )]
sept19_reviews = reviews[(reviews['year'] == 2019) & (reviews['month'] == 9 )]
oct18_reviews = reviews[(reviews['year'] == 2018) & (reviews['month'] == 10 )]


In [26]:
#for oct18:
oct18_groupby = oct18_reviews.groupby('listing_id')
  #average
oct18_avg_series = oct18_groupby.aggregate(np.average)['polarity']
oct18_avg_df = pd.DataFrame({'listing_id':oct18_avg_series.index, 'review_polarity_avg':oct18_avg_series.values})
oct18_list = oct18_list.merge(oct18_avg_df, left_on = 'id', right_on = 'listing_id', how = 'left')
  #stdv
oct18_stdv_series = oct18_groupby.aggregate(np.std)['polarity']
oct18_stdv_df = pd.DataFrame({'listing_id':oct18_stdv_series.index, 'review_polarity_stdv':oct18_stdv_series.values})
oct18_list = oct18_list.merge(oct18_stdv_df, left_on = 'id', right_on = 'listing_id', how = 'left')
  #drop listing_id_x and listing_id_y columns
oct18_list.drop(columns = ['listing_id_x', 'listing_id_y'], inplace = True)

#for oct19
oct19_groupby = oct19_reviews.groupby('listing_id')
  #average
oct19_avg_series = oct19_groupby.aggregate(np.average)['polarity']
oct19_avg_df = pd.DataFrame({'listing_id':oct19_avg_series.index, 'review_polarity_avg':oct19_avg_series.values})
oct19_list = oct19_list.merge(oct19_avg_df, left_on = 'id', right_on = 'listing_id', how = 'left')
  #stdv
oct19_stdv_series = oct19_groupby.aggregate(np.std)['polarity']
oct19_stdv_df = pd.DataFrame({'listing_id':oct19_stdv_series.index, 'review_polarity_stdv':oct19_stdv_series.values})
oct19_list = oct19_list.merge(oct19_stdv_df, left_on = 'id', right_on = 'listing_id', how = 'left')
  #drop listing_id_x and listing_id_y columns
oct19_list.drop(columns = ['listing_id_x', 'listing_id_y'], inplace = True)

#for sept19
sept19_groupby = sept19_reviews.groupby('listing_id')
  #average
sept19_avg_series = sept19_groupby.aggregate(np.average)['polarity']
sept19_avg_df = pd.DataFrame({'listing_id':sept19_avg_series.index, 'review_polarity_avg':sept19_avg_series.values})
sept19_list = sept19_list.merge(sept19_avg_df, left_on = 'id', right_on = 'listing_id', how = 'left')
  #stadv
sept19_stdv_series = sept19_groupby.aggregate(np.std)['polarity']
sept19_stdv_df = pd.DataFrame({'listing_id':sept19_stdv_series.index, 'review_polarity_stdv':sept19_stdv_series.values})
sept19_list = sept19_list.merge(sept19_stdv_df, left_on = 'id', right_on = 'listing_id', how = 'left')
  #drop listing_id_x and listing_id_y columns
sept19_list.drop(columns = ['listing_id_x', 'listing_id_y'], inplace = True)

In [27]:
sept19_list.iloc[[1272]]

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,lock_trans,dishes_trans,host_score,summary_polarity,space_polarity,description_polarity,neighborhood_overview_polarity,house_rules_polarity,review_polarity_avg,review_polarity_stdv
1272,4175303,https://www.airbnb.com/rooms/4175303,20190912153044,2019-09-12,Central & lovely full apartment,a private entrance a wood floored apartment a...,the main bedroom has king sized bed the room ...,a private entrance a wood floored apartment a...,none,we love the centrality of this location haye...,...,0,1,40.0,0.0,9.2,8.1,7.4,-1.2,,


In [28]:
# fill polarity with average
oct18_list['neighborhood_overview_polarity'] = oct18_list['neighborhood_overview_polarity'].fillna(oct18_list['neighborhood_overview_polarity'].mean())
oct19_list['neighborhood_overview_polarity'] = oct19_list['neighborhood_overview_polarity'].fillna(oct19_list['neighborhood_overview_polarity'].mean())
sept19_list['neighborhood_overview_polarity'] = sept19_list['neighborhood_overview_polarity'].fillna(sept19_list['neighborhood_overview_polarity'].mean())

oct18_list['summary_polarity'] = oct18_list['summary_polarity'].fillna(oct18_list['summary_polarity'].mean())
oct19_list['summary_polarity'] = oct19_list['summary_polarity'].fillna(oct19_list['summary_polarity'].mean())
sept19_list['summary_polarity'] = sept19_list['summary_polarity'].fillna(sept19_list['summary_polarity'].mean())

oct18_list['space_polarity'] = oct18_list['space_polarity'].fillna(oct18_list['space_polarity'].mean())
oct19_list['space_polarity'] = oct19_list['space_polarity'].fillna(oct19_list['space_polarity'].mean())
sept19_list['space_polarity'] = sept19_list['space_polarity'].fillna(sept19_list['space_polarity'].mean())

oct18_list['description_polarity'] = oct18_list['description_polarity'].fillna(oct18_list['description_polarity'].mean())
oct19_list['description_polarity'] = oct19_list['description_polarity'].fillna(oct19_list['description_polarity'].mean())
sept19_list['description_polarity'] = sept19_list['description_polarity'].fillna(sept19_list['description_polarity'].mean())

oct18_list['house_rules_polarity'] = oct18_list['house_rules_polarity'].fillna(oct18_list['house_rules_polarity'].mean())
oct19_list['house_rules_polarity'] = oct19_list['house_rules_polarity'].fillna(oct19_list['house_rules_polarity'].mean())
sept19_list['house_rules_polarity'] = sept19_list['house_rules_polarity'].fillna(sept19_list['house_rules_polarity'].mean())

oct18_list['review_polarity_avg'] = oct18_list['review_polarity_avg'].fillna(oct18_list['review_polarity_avg'].mean())
oct19_list['review_polarity_avg'] = oct19_list['review_polarity_avg'].fillna(oct19_list['review_polarity_avg'].mean())
sept19_list['review_polarity_avg'] = sept19_list['review_polarity_avg'].fillna(sept19_list['review_polarity_avg'].mean())

oct18_list['review_polarity_stdv'] = oct18_list['review_polarity_stdv'].fillna(oct18_list['review_polarity_stdv'].mean())
oct19_list['review_polarity_stdv'] = oct19_list['review_polarity_stdv'].fillna(oct19_list['review_polarity_stdv'].mean())
sept19_list['review_polarity_stdv'] = sept19_list['review_polarity_stdv'].fillna(sept19_list['review_polarity_stdv'].mean())


In [29]:
oct19_list.to_csv("oct19-cleaned.csv", index = False)

In [30]:
oct18_list.to_csv("oct18-cleaned.csv", index = False)

In [31]:
sept19_list.to_csv("sept19-cleaned.csv", index = False)