# **GIS 322 Final Project**
### Autumn Towne

---



# Coding environment preparation

## Library installation and importing

In [1]:
!pip install geopandas
!sudo apt install python3-rtree
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from shapely.geometry import Point, shape

import numpy as np
from bokeh.io import output_file, show,output_notebook
from bokeh.models import ColumnDataSource,ColorBar,HoverTool
from bokeh.transform import linear_cmap
from bokeh.plotting import figure
from bokeh.palettes import OrRd, Oranges, RdPu, YlGn, YlOrBr, Purples, Blues, Spectral6, YlOrRd, Plasma
import colorcet as cc

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
python3-rtree is already the newest version (0.9.7-1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


## Reading the data in

In [2]:
# Read the Airbnb listings CSV through geopandas.
listings = gpd.read_file(filename='/content/listings.csv')

In [3]:
# Read the neighborhood layer from the GeoJSON file.
neighborhoods_geometry = gpd.read_file(filename='/content/neighbourhoods.geojson')

# Data Cleaning

In [4]:
# Display the column names of the listings table to see which fields are available.
listings.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

I prepared the price, overall rating, location rating, and "value" rating variables to be valid parameters for subsequent numeric operations.

*   Removed the "$" and "," characters from the price column
*   Converted data to numeric type and removed NA values for all target columns.

In [5]:
# Strip the currency formatting from the price strings. `regex=False` makes both
# replacements literal: '$' is otherwise a regex end-of-string anchor, and the
# default value of `regex` is not the same across pandas versions.
listings['price'] = (
    listings['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)

# Columns that must be numeric for the averages computed later in the notebook.
numeric_columns = [
    'price', 'review_scores_rating', 'review_scores_location',
    'review_scores_value']

for column in numeric_columns:
    listings[column] = pd.to_numeric(listings[column])

# Drop listings that are missing any value needed for the per-neighborhood stats.
listings = listings.dropna(subset=numeric_columns + ['neighbourhood_cleansed'])

Next, I renamed the columns to be more understandable.

In [6]:
# Reader-friendly labels for the listing columns used in the analysis.
listing_labels = dict(
    price='Price per Night',
    review_scores_rating='Overall Rating',
    review_scores_location='Location Rating',
    review_scores_value='Value Rating',
    neighbourhood_cleansed='Neighborhood',
    neighbourhood_group_cleansed='Borough',
    neighborhood_overview='Neighborhood Description',
)
listings = listings.rename(columns=listing_labels)

# Use the same labels on the geometry layer so the two tables share a join key.
geometry_labels = dict(
    neighbourhood='Neighborhood',
    neighbourhood_group='Borough',
)
neighborhoods_geometry = neighborhoods_geometry.rename(columns=geometry_labels)

# Data Analysis

## Prices and Ratings

I calculated the average price, overall rating, location rating, and value rating of listings in each neighborhood.

In [7]:
# Per-neighborhood mean of each metric, rounded to two decimal places.
stat_columns = [
    'Price per Night', 'Overall Rating', 'Location Rating', 'Value Rating']

neighborhood_stats = (
    listings
    .groupby('Neighborhood')[stat_columns]
    .mean()
    .round(2)
)

In [8]:
# Attach the averaged metrics to the neighborhood polygons. An inner join keeps
# only neighborhoods present in both tables.
metrics_spatially = neighborhoods_geometry.merge(
    right=neighborhood_stats,
    how='inner',
    on='Neighborhood',
)

### Tables of the Neighborhoods Sorted by Highest Price and Ratings

Shortened the price column name for better readability (the values were already rounded to two decimal places above).

In [9]:
# Copy of the stats table with a shorter price column name for printing.
shortened_nb_stats = neighborhood_stats.rename(
    columns={'Price per Night': 'Price'})

Printed Neighborhoods Sorted:

Eight neighborhoods tied with a perfect Overall Rating (5.0), so I included them all.

Exactly 5 neighborhoods had a perfect Location Rating.

In [10]:
def _ranked(column, count):
    """Return the first `count` rows of shortened_nb_stats sorted by `column`, descending."""
    return shortened_nb_stats.sort_values(by=column, ascending=False).head(count)


print("Most Expensive Neighborhoods by Average Price ($USD)\n")
print(_ranked('Price', 5), "\n")

print("\nNeighborhoods with a 5/5 Overall Rating:\n")
print(_ranked('Overall Rating', 8))

print("\nTop 5 Neighborhoods by Location Rating:\n")
print(_ranked('Location Rating', 5))

print("\nTop 5 Neighborhoods by Value Rating:")
print(_ranked('Value Rating', 5))

Most Expensive Neighborhoods by Average Price ($USD)

                    Price  Overall Rating  Location Rating  Value Rating
Neighborhood                                                            
Todt Hill          518.00            5.00             4.97          4.97
Longwood           494.96            4.81             4.63          4.81
Greenwich Village  377.89            4.81             4.95          4.65
Tribeca            371.25            4.68             4.82          4.48
SoHo               362.89            4.79             4.89          4.61 


Neighborhoods with a 5/5 Overall Rating:

                        Price  Overall Rating  Location Rating  Value Rating
Neighborhood                                                                
Chelsea, Staten Island   70.0            5.00             5.00          5.00
Riverdale               178.0            5.00             4.95          4.82
Woodrow                  58.5            5.00             5.00          4.84
West 

## Keywords

Each listing includes a "Neighborhood Description" written by the host of the listing. The data is in string format, with large variation in length and content.

To analyze these qualitative descriptions of NYC neighborhoods, I searched for keywords.

For each keyword, I calculated the proportion of listings containing that word in each neighborhood.

**Method:**
I created a dictionary called "keyword_proportions" to store the data.

Next, I looped through the neighborhoods. For each neighborhood, I looped through the keywords of interest. For each keyword, I calculated how many listings in the neighborhood contained that word in their neighborhood description, listing description, or listing name (some hosts write descriptions in the listing name). **I made the count case *in*sensitive (so Quiet and quiet would both count as 1)** and set NAs to equal 0.


In [11]:
keywords = [
    'Close', 'Central', 'Quiet', 'Lively', 'Walkable', 'Safe',
    'Historic', 'Bustling', 'Busy',
]

# Maps neighborhood -> {keyword -> share of that neighborhood's listings using it}.
keyword_proportions = {}

for neighborhood, neighborhood_listings in listings.groupby('Neighborhood'):
    num_listings = len(neighborhood_listings)
    shares = {}

    for word in keywords:
        # Case-insensitive search in each text field; missing text counts as no match.
        in_overview = neighborhood_listings['Neighborhood Description'].str.contains(
            word, case=False, na=False)
        in_name = neighborhood_listings['name'].str.contains(
            word, case=False, na=False)
        in_description = neighborhood_listings['description'].str.contains(
            word, case=False, na=False)

        # A listing is counted once, however many of the three fields match.
        word_count = int((in_overview | in_name | in_description).sum())

        if num_listings:
            shares[word] = word_count / num_listings
        else:
            shares[word] = 0

    keyword_proportions[neighborhood] = shares

Once my dictionary stored all my desired data, I converted it to a dataframe.

In [12]:
# One row per neighborhood, one column per keyword; the neighborhood name is
# moved out of the index into a regular 'Neighborhood' column.
keyword_proportions_df = (
    pd.DataFrame(keyword_proportions)
    .T
    .reset_index()
    .rename(columns={'index': 'Neighborhood'})
)

### For each word, here are the top 5 neighborhoods by the proportion of listings in that neighborhood containing that keyword in their neighborhood description, listing description, or listing name:

Preprocess Data for Readability:

In [13]:
# Two decimal places are enough for the printed tables.
keyword_proportions_df = keyword_proportions_df.round(decimals=2)

Print top 5 neighborhoods for each keyword:

In [14]:
for word in keywords:
    # Highest proportions first, then keep the first five rows.
    ranked = keyword_proportions_df.sort_values(by=word, ascending=False)
    top_neighborhoods = ranked.head(5)

    print(f"\nBest Neighborhoods for '{word}':\n")
    print(top_neighborhoods[['Neighborhood', word]])


Best Neighborhoods for 'Close':

               Neighborhood  Close
32        Castleton Corners    1.0
83          Gerritsen Beach    1.0
46                  Concord    1.0
34   Chelsea, Staten Island    1.0
138               Navy Yard    1.0

Best Neighborhoods for 'Central':

         Neighborhood  Central
164         Riverdale     1.00
198   Upper West Side     0.80
187  Theater District     0.78
197   Upper East Side     0.74
137       Murray Hill     0.73

Best Neighborhoods for 'Quiet':

               Neighborhood  Quiet
34   Chelsea, Staten Island    1.0
142         New Springville    1.0
114         Lighthouse Hill    1.0
207             Westerleigh    1.0
138               Navy Yard    1.0

Best Neighborhoods for 'Lively':

       Neighborhood  Lively
81    Fort Hamilton    0.31
78          Fordham    0.25
11        Bayswater    0.25
121     Marble Hill    0.25
131  Morris Heights    0.11

Best Neighborhoods for 'Walkable':

     Neighborhood  Walkable
37   Civic Center     

Next, I updated column names for better formatting in the visualization outputs.

In [15]:
# Capitalized display names for the visualization outputs. Labels that are not
# present among the columns are ignored by `rename`; the keyword columns built
# above are already capitalized, so this currently leaves the table unchanged.
display_names = {
    'index': 'Neighborhood',
    'close': 'Close',
    'central': 'Central',
    'quiet': 'Quiet',
    'lively': 'Lively',
    'walkable': 'Walkable',
    'safe': 'Safe',
    'bustling': 'Bustling',
    'busy': 'Busy',
    'historic': 'Historic',
}
keyword_proportions_df = keyword_proportions_df.rename(columns=display_names)

In [16]:
# Attach the keyword proportions to the neighborhood polygons. An inner join
# keeps only neighborhoods present in both tables.
keywords_spatially = neighborhoods_geometry.merge(
    right=keyword_proportions_df,
    how='inner',
    on='Neighborhood',
)

To recap the method: I grouped the listings by neighborhood ('neighbourhood_cleansed'). For each neighborhood and each keyword, I counted how many listings contain that word in the 'neighborhood_overview' column (along with the listing name and description), and divided that count by the total number of listings in that neighborhood. Then, for each word, I listed the 5 neighborhoods with the highest proportion of listings containing that word.


# Data Visualization with Interactive Maps

### Setup

In [17]:
# Configure bokeh to render its plots inline in the notebook.
output_notebook()

I pasted the helper function from Module 6 to convert GeoDataFrame to a format bokeh can use.

In [18]:
def _ring_xy(ring):
    """Split a ring's coordinate sequence into separate x and y lists of floats."""
    xs = []
    ys = []
    for coord in ring.coords:
        xs.append(float(coord[0]))
        ys.append(float(coord[1]))
    return xs, ys


def gpd_bokeh(df):
    """Convert geometries from geopandas to bokeh format.

    Parameters
    ----------
    df : object exposing ``df.geometry.values``
        Each geometry is handled as a MultiPolygon when its ``geom_type`` says
        so, and otherwise as a single polygon with an ``exterior`` ring.

    Returns
    -------
    (lons, lats) : tuple of two lists
        One entry per geometry, in row order. Each entry is a list of x (or y)
        coordinates of the exterior ring. The parts of a MultiPolygon are
        concatenated with a NaN between consecutive parts. Missing (``None``)
        or empty geometries produce an empty list, so the output stays aligned
        with the rows of ``df``. Interior rings (holes) are not exported.
    """
    nan = float('nan')
    lons = []
    lats = []
    # The loop variable is deliberately not called `shape`: that name is also
    # imported from shapely.geometry at the top of the notebook.
    for geom in df.geometry.values:
        if geom is None or geom.is_empty:
            # Nothing to draw, but keep a placeholder so rows stay aligned.
            lons.append([])
            lats.append([])
            continue

        if geom.geom_type == 'MultiPolygon':
            gx = []
            gy = []
            last = len(geom.geoms) - 1
            for j, member in enumerate(geom.geoms):
                xs, ys = _ring_xy(member.exterior)
                gx.extend(xs)
                gy.extend(ys)
                if j < last:
                    # NaN separates the parts of a multi-part geometry.
                    gx.append(nan)
                    gy.append(nan)
            lons.append(gx)
            lats.append(gy)
        else:
            xs, ys = _ring_xy(geom.exterior)
            lons.append(xs)
            lats.append(ys)

    return lons, lats

In [19]:
# Toolbar tools passed to every bokeh figure below.
TOOLS = ",".join(["pan", "wheel_zoom", "reset", "hover", "save"])

## Bokeh Maps

### Price and Ratings Visualizations:

I chose color palettes from the colorcet package. For best visibility, I set the minimum and maximum of each metric as the lowest and highest colors, respectively, of each palette. I used a loop to make my code efficient. I removed the x and y axes because they displayed longitude and latitude - numbers that clutter the map and are not the focus.

In [20]:
# Averaged metrics that each get their own choropleth map.
metrics = ['Price per Night', 'Overall Rating', 'Location Rating', 'Value Rating']


def p_palettes(metric):
    """Return the color palette used to draw `metric`.

    'Price per Night' gets the reversed colorcet "linear_kgy_5_95_c69" ramp;
    every other metric gets the 9-color Plasma palette.
    """
    if metric != 'Price per Night':
        return Plasma[9]
    return list(reversed(cc.palette["linear_kgy_5_95_c69"]))

# The neighborhood outlines are identical for every metric, so convert the
# geometries once instead of once per loop iteration.
lons, lats = gpd_bokeh(metrics_spatially)

for metric in metrics:
    source = ColumnDataSource(data=dict(
        x=lons,
        y=lats,
        bk_neighborhood=metrics_spatially['Neighborhood'],
        target_metric=metrics_spatially[metric]))

    # Stretch the palette over the observed range of this metric.
    color_mapper = linear_cmap(field_name='target_metric',
                               palette=p_palettes(metric),
                               low=metrics_spatially[metric].min(),
                               high=metrics_spatially[metric].max())

    # Named `metric_map` rather than `map` so the builtin `map` is not
    # shadowed for the rest of the notebook.
    metric_map = figure(frame_width=800, frame_height=600,
                        title="NYC Neighborhoods by Average Airbnb " + metric,
                        tools=TOOLS)
    metric_map.patches('x', 'y', source=source, line_color="white",
                       line_width=0.1, color=color_mapper)

    # Only the price tooltip carries a currency marker in its label.
    value_label = 'Average ' + metric
    if metric == 'Price per Night':
        value_label = value_label + " $"
    metric_map.select_one(HoverTool).tooltips = [
        (value_label, '@target_metric'),
        ('Neighborhood', '@bk_neighborhood')]

    # Hide the longitude/latitude axes.
    metric_map.xaxis.visible = False
    metric_map.yaxis.visible = False

    color_bar = ColorBar(color_mapper=color_mapper['transform'], width=16,
                         location=(0, 0))
    metric_map.add_layout(color_bar, 'right')

    file_name = "nyc_bnb_bokeh_by_" + metric + ".html"
    output_file(file_name)
    show(metric_map)

### Keywords Visualizations

First, I reduced my target keywords. The other keywords either didn't have enough prevalence among the neighborhoods, were spatially biased (e.g., "central" could be spatially biased to appear around Central Park), or could describe most neighborhoods (most of New York is historic, walkable, and close to something, even if that something is a train station).

Next, I selected color palettes to mirror the idea of the word. For example "safe" used muted greens while "bustling" used warm colors: yellow, orange, and red.



In [21]:
keywords_focused = ['Quiet', 'Safe', 'Bustling', 'Lively']

# Replace any missing proportion in the focused keyword columns with 0.
keywords_spatially = keywords_spatially.fillna(
    value={focus_word: 0 for focus_word in keywords_focused})

def get_palette(word):
    """Return the color palette associated with a focused keyword.

    'Bustling' and 'Safe' use reversed 9-color YlOrRd / YlGn ramps, 'Lively'
    uses Plasma[9] and 'Quiet' uses the colorcet "linear_blue_95_50_c20" ramp.
    Any other word returns None.
    """
    if word == 'Quiet':
        return cc.palette["linear_blue_95_50_c20"]
    if word == 'Lively':
        return Plasma[9]
    if word in ('Bustling', 'Safe'):
        ramp = YlOrRd[9] if word == 'Bustling' else YlGn[9]
        return list(reversed(ramp))
    return None

# The neighborhood outlines are identical for every keyword, so convert the
# geometries once instead of once per loop iteration.
lons, lats = gpd_bokeh(keywords_spatially)

for word in keywords_focused:
    source = ColumnDataSource(data=dict(
        x=lons,
        y=lats,
        s_neighborhood=keywords_spatially['Neighborhood'],
        # Proportions are stored as fractions; show them as percentages.
        percent=keywords_spatially[word] * 100))

    # Stretch the palette over the observed range of this keyword.
    color_mapper = linear_cmap(field_name='percent',
                               palette=get_palette(word),
                               low=keywords_spatially[word].min() * 100,
                               high=keywords_spatially[word].max() * 100)

    # Named `keyword_map` rather than `map` so the builtin `map` is not
    # shadowed for the rest of the notebook.
    keyword_map = figure(
        frame_width=800, frame_height=600,
        title="Percent of Airbnb Listings with \"" + word + "\" in their description, by NYC Neighborhood",
        tools=TOOLS)

    keyword_map.patches('x', 'y', source=source, line_color="white",
                        line_width=0.1, color=color_mapper)

    # Hide the longitude/latitude axes.
    keyword_map.xaxis.visible = False
    keyword_map.yaxis.visible = False

    keyword_map.select_one(HoverTool).tooltips = [
        ('Percent', '@percent%'),
        ('Neighborhood', '@s_neighborhood'),
        ]

    color_bar = ColorBar(color_mapper=color_mapper['transform'], width=16,
                         location=(0, 0))

    keyword_map.add_layout(color_bar, 'right')
    file_name = "nyc_neighborhoods" + word + ".html"
    output_file(file_name)
    show(keyword_map)


## Folium Map

Lastly, I created an interactive Folium map so that users can zoom in to the neighborhoods of New York City and see the exact location of each Airbnb whose neighborhood description mentions one of the focused keywords, colored by keyword.

In [22]:
import folium

# Initial map view: [latitude, longitude] of the starting center and the zoom level.
nyc_center = [40.7128, -74.0060]
nyc_bnbs = folium.Map(location=nyc_center, zoom_start=11)

def get_color(word):
    """Return the marker color for a focused keyword, or None for any other word."""
    marker_colors = {
        'Bustling': 'red',
        'Lively': 'yellow',
        'Quiet': 'blue',
        'Safe': 'green',
    }
    return marker_colors.get(word)

for keyword in keywords_focused:
    # Listings whose neighborhood description mentions the keyword
    # (case-insensitive; a missing description counts as no match).
    mentions_keyword = listings['Neighborhood Description'].str.contains(
        keyword, case=False, na=False)
    keyword_listings = listings[mentions_keyword]

    # The color depends only on the keyword, so look it up once per keyword.
    marker_color = get_color(keyword)

    for _, row in keyword_listings.iterrows():
        label = row['name'] if 'name' in row else 'Listing'
        marker = folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=3,
            color=marker_color,
            fill=True,
            fill_color=marker_color,
            fill_opacity=0.6,
            popup=f"{keyword} - {label}",
        )
        marker.add_to(nyc_bnbs)

nyc_bnbs.save("nyc_airbnb_keywords_folium.html")

# Last expression of the cell: renders the map inline.
nyc_bnbs
