In [1]:
#This file begins the data clensing and main dataframe creation process for the EDA Analysis

import pandas as pd
import numpy as np
import ast
import json
from datetime import datetime

In [2]:
#This code reads in a csv file of rent postings created in the 'RentHop Web Scrape File.ipynb' file

df = pd.read_csv('cleaner_list_of_summary_rent_listings.csv', engine='python')

df.tail()


Unnamed: 0,address,bath,bedrooms,borough,extra,nearest_subway,neighborhood,rent,sqft,unit,url
6654,247 W 87th St,1.0,Studio,Manhattan,"['No Fee', 'Exclusive', 'Doorman', 'Elevator',...",0.04,Upper West Side,3450,0,Apt 11H,https://www.renthop.com/listings/247-w-87th-st...
6655,Essex street studio!!,1.0,Studio,Manhattan,[''],0.12,Chinatown,1850,0,,https://www.renthop.com/listings/essex-street-...
6656,West 30th st,2.0,3,Manhattan,"['Elevator', 'Private Outdoor Space', 'Elevato...",0.22,Chelsea,5000,0,,https://www.renthop.com/listings/west-30th-st/...
6657,Center Boulevard,1.0,1,Queens,"['Doorman', 'Elevator', 'Doorman', 'Fitness Ce...",0.34,Hunters Point,3387,0,,https://www.renthop.com/listings/center-boulev...
6658,East 2nd Street,1.0,2,Manhattan,"['No Fee', 'Doorman', 'Elevator', 'Laundry in ...",0.42,Alphabet City,3700,0,,https://www.renthop.com/listings/east-2nd-stre...


In [3]:
#This column replaces values in the row that were not correctly entered
df.neighborhood = df.neighborhood.replace({' East Village': 'East Village', ' Financial District': 'Financial District', ' Gowanus': 'Gowanus', " Hell's Kitchen": "Hell's Kitchen", ' Lower East Side': 'Lower East Side', ' Upper East Side': 'Upper East Side', ' Upper West Side': 'Upper West Side', ' West Village': 'West Village', ' Williamsburg': 'Williamsburg'})


In [4]:
#This block of code creates a list of unique values in the neighborhoods column
neighborhoods = list(df.neighborhood.unique())
neighborhoods.sort()
neighborhoods

['Alphabet City',
 'Arverne',
 'Astoria',
 'Astoria Heights',
 'Auburndale',
 'Battery Park City',
 'Bay Ridge',
 'Bedford-Stuyvesant',
 'Bensonhurst',
 'Bergen Beach',
 'Beverley Square East',
 'Bithlo',
 'Boerum Hill',
 'Borough Park',
 'Bowery',
 'Briarwood',
 'Bronxwood',
 'Brooklyn Heights',
 'Bushwick',
 'Carnegie Hill',
 'Carroll Gardens',
 'Central Harlem',
 'Central Park',
 'Central Riverdale',
 'Central Slope',
 'Chelsea',
 'Chinatown',
 'Civic Center',
 'Clarion',
 'Clinton Hill',
 'Cobble Hill',
 'College Point',
 'Crown Heights',
 'DUMBO',
 'Ditmars',
 'Downtown Brooklyn',
 'Downtown Flushing',
 'East Flatbush',
 'East Harlem',
 'East New York',
 'East Village',
 'East Williamsburg',
 'Elmhurst',
 'Far Rockaway',
 'Financial District',
 'Flatbush',
 'Flatiron District',
 'Flushing',
 'Fordham Heights',
 'Fordham Manor',
 'Forest Hills',
 'Fort George',
 'Fort Greene',
 'Fresh Meadows',
 'Garment District',
 'Glendale',
 'Governors Island',
 'Gowanus',
 'Gramercy Park',
 'G

In [5]:
#This code imports the zipcodes and neigborhoods csv
zipdf = pd.read_csv('zip_code_info.csv', engine='python')

In [6]:
zipdf.head()

Unnamed: 0,median_income,neighborhood,percent_high_earner,zipcode
0,35449.0,Alphabet City,6.4,10002.0
1,45816.0,Arverne,3.0,11692.0
2,57866.0,Astoria,11.9,11101.0
3,59901.0,Astoria Heights,4.5,11370.0
4,65932.0,Auburndale,8.4,11358.0


In [7]:
#This code creates the main dataframe by merging the two smaller data frames based on their neighborhood
main_df = pd.merge(df, zipdf, on='neighborhood', how='outer')

In [8]:
main_df.tail()

Unnamed: 0,address,bath,bedrooms,borough,extra,nearest_subway,neighborhood,rent,sqft,unit,url,median_income,percent_high_earner,zipcode
6654,2368 West Street,1.0,2,Brooklyn,"['Exclusive', '', 'Terrace', 'Stainless Steel ...",0.22,Gravesend,2200,0,Apt 2,https://www.renthop.com/listings/2368-west-str...,46310.0,5.1,11223.0
6655,tibbett avenue,2.0,2,Manhattan,"['', 'Reduced Fee']",0.29,Marble Hill,2200,0,,https://www.renthop.com/listings/tibbett-avenu...,58881.0,6.3,10463.0
6656,tibbett avenue,2.0,2,Manhattan,"['', 'Reduced Fee']",0.29,Marble Hill,2200,0,,https://www.renthop.com/listings/tibbett-avenu...,58881.0,6.3,10463.0
6657,43rd Ave.,1.0,Studio,Queens,"['Elevator', 'Hardwood Floors', 'Elevator', 'L...",,Auburndale,1595,0,,https://www.renthop.com/listings/43rd-ave/102c...,65932.0,8.4,11358.0
6658,164th Street,1.0,2,Queens,"['Hardwood Floors', 'Dining Room', 'Dishwasher...",,Pomonok,2000,0,,https://www.renthop.com/listings/164th-street/...,63455.0,5.6,11365.0


In [9]:
main_df.isnull().sum()

address                  65
bath                      0
bedrooms                  0
borough                   0
extra                     0
nearest_subway          116
neighborhood              0
rent                      0
sqft                      0
unit                   4612
url                       0
median_income            42
percent_high_earner      42
zipcode                  42
dtype: int64

In [10]:
#This for loop prints the index number and column where a cell equals a specific value
for col in main_df.columns:
    lst = list(main_df.loc[main_df[col] == 'Queens'].index)
    print(col)
    print(lst)

address
[]
bath
[]
bedrooms
[]
borough
[614, 615, 616, 628, 751, 752, 804, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 2862, 2863, 2864, 2865, 4629, 4630, 4631, 4632, 4633, 4634, 4635, 4636, 4637, 4638, 4639, 4640, 4641, 4642, 4643, 4644, 4645, 4646, 4647, 4648, 4649, 4650, 4651, 4652, 4653, 4654, 4655, 4656, 4657, 4658, 4659, 4660, 4661, 4662, 4663, 4664, 4665, 4666, 4667, 4668, 4669, 4670, 4671, 4672, 4673, 4674, 4675, 4676, 4677, 4678, 4679, 4680, 4

  result = method(y)


In [11]:
#This cell drops the url and unit columns
main_df = main_df.drop(['url', 'unit'], axis = 1)
main_df

Unnamed: 0,address,bath,bedrooms,borough,extra,nearest_subway,neighborhood,rent,sqft,median_income,percent_high_earner,zipcode
0,453 Milford Street,1.0,3,Brooklyn,"['Exclusive', '', 'Featured', 'Exclusive', 'Ca...",0.50,New Lots,2200,0,63700.0,4.4,11236.0
1,37 King St,1.0,1,Manhattan,"['Exclusive', 'Elevator', 'Featured', 'Exclusi...",0.06,Hudson Square,4500,0,106056.0,33.4,10013.0
2,Hudson St,2.0,2,Manhattan,[''],0.18,Hudson Square,9395,0,106056.0,33.4,10013.0
3,600 West 42nd Street,1.0,1,Manhattan,"['Exclusive', 'By Owner', 'Doorman', 'Elevator...",0.38,Hell's Kitchen,4300,0,93707.0,19.8,10036.0
4,785 Ninth Avenue 3c,1.0,2,Manhattan,"['Exclusive', 'Pre-War', 'Featured', 'Exclusiv...",0.21,Hell's Kitchen,3199,700,93707.0,19.8,10036.0
5,785 Ninth Avenue 3c,1.0,2,Manhattan,"['Exclusive', 'Pre-War', 'Featured', 'Exclusiv...",0.21,Hell's Kitchen,3199,700,93707.0,19.8,10036.0
6,785 Ninth Avenue 3c,1.0,2,Manhattan,"['Exclusive', 'Pre-War', 'Featured', 'Exclusiv...",0.21,Hell's Kitchen,3199,700,93707.0,19.8,10036.0
7,312 W 58th Street,2.0,4,Manhattan,"['No Fee', 'Exclusive', 'Laundry in Unit', 'No...",0.08,Hell's Kitchen,7150,1,93707.0,19.8,10036.0
8,321 W 47th St,2.0,2,Manhattan,"['No Fee', 'Exclusive', '', 'No Fee', 'Feature...",0.17,Hell's Kitchen,3500,0,93707.0,19.8,10036.0
9,524 West 50th Street,1.0,Studio,Manhattan,"['Exclusive', 'Featured', 'Laundry in Building...",0.41,Hell's Kitchen,1999,500,93707.0,19.8,10036.0


In [12]:
#This cell drops rows where the nearest_subway and median income column values are null
main_df.dropna(subset = ['nearest_subway', 'median_income'],axis = 0, inplace = True)
main_df

Unnamed: 0,address,bath,bedrooms,borough,extra,nearest_subway,neighborhood,rent,sqft,median_income,percent_high_earner,zipcode
0,453 Milford Street,1.0,3,Brooklyn,"['Exclusive', '', 'Featured', 'Exclusive', 'Ca...",0.50,New Lots,2200,0,63700.0,4.4,11236.0
1,37 King St,1.0,1,Manhattan,"['Exclusive', 'Elevator', 'Featured', 'Exclusi...",0.06,Hudson Square,4500,0,106056.0,33.4,10013.0
2,Hudson St,2.0,2,Manhattan,[''],0.18,Hudson Square,9395,0,106056.0,33.4,10013.0
3,600 West 42nd Street,1.0,1,Manhattan,"['Exclusive', 'By Owner', 'Doorman', 'Elevator...",0.38,Hell's Kitchen,4300,0,93707.0,19.8,10036.0
4,785 Ninth Avenue 3c,1.0,2,Manhattan,"['Exclusive', 'Pre-War', 'Featured', 'Exclusiv...",0.21,Hell's Kitchen,3199,700,93707.0,19.8,10036.0
5,785 Ninth Avenue 3c,1.0,2,Manhattan,"['Exclusive', 'Pre-War', 'Featured', 'Exclusiv...",0.21,Hell's Kitchen,3199,700,93707.0,19.8,10036.0
6,785 Ninth Avenue 3c,1.0,2,Manhattan,"['Exclusive', 'Pre-War', 'Featured', 'Exclusiv...",0.21,Hell's Kitchen,3199,700,93707.0,19.8,10036.0
7,312 W 58th Street,2.0,4,Manhattan,"['No Fee', 'Exclusive', 'Laundry in Unit', 'No...",0.08,Hell's Kitchen,7150,1,93707.0,19.8,10036.0
8,321 W 47th St,2.0,2,Manhattan,"['No Fee', 'Exclusive', '', 'No Fee', 'Feature...",0.17,Hell's Kitchen,3500,0,93707.0,19.8,10036.0
9,524 West 50th Street,1.0,Studio,Manhattan,"['Exclusive', 'Featured', 'Laundry in Building...",0.41,Hell's Kitchen,1999,500,93707.0,19.8,10036.0


In [13]:
main_df.isnull().sum()

address                65
bath                    0
bedrooms                0
borough                 0
extra                   0
nearest_subway          0
neighborhood            0
rent                    0
sqft                    0
median_income           0
percent_high_earner     0
zipcode                 0
dtype: int64

In [14]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6538 entries, 0 to 6656
Data columns (total 12 columns):
address                6473 non-null object
bath                   6538 non-null float64
bedrooms               6538 non-null object
borough                6538 non-null object
extra                  6538 non-null object
nearest_subway         6538 non-null float64
neighborhood           6538 non-null object
rent                   6538 non-null int64
sqft                   6538 non-null int64
median_income          6538 non-null float64
percent_high_earner    6538 non-null float64
zipcode                6538 non-null float64
dtypes: float64(5), int64(2), object(5)
memory usage: 664.0+ KB


In [15]:
main_df.head()

Unnamed: 0,address,bath,bedrooms,borough,extra,nearest_subway,neighborhood,rent,sqft,median_income,percent_high_earner,zipcode
0,453 Milford Street,1.0,3,Brooklyn,"['Exclusive', '', 'Featured', 'Exclusive', 'Ca...",0.5,New Lots,2200,0,63700.0,4.4,11236.0
1,37 King St,1.0,1,Manhattan,"['Exclusive', 'Elevator', 'Featured', 'Exclusi...",0.06,Hudson Square,4500,0,106056.0,33.4,10013.0
2,Hudson St,2.0,2,Manhattan,[''],0.18,Hudson Square,9395,0,106056.0,33.4,10013.0
3,600 West 42nd Street,1.0,1,Manhattan,"['Exclusive', 'By Owner', 'Doorman', 'Elevator...",0.38,Hell's Kitchen,4300,0,93707.0,19.8,10036.0
4,785 Ninth Avenue 3c,1.0,2,Manhattan,"['Exclusive', 'Pre-War', 'Featured', 'Exclusiv...",0.21,Hell's Kitchen,3199,700,93707.0,19.8,10036.0


In [17]:
#This cell iterates through each row in the extra's column and removes blank values from that column
#The ast.literal_eval(list) syntax from the ast library evaluates a string in pandas as an actual python list
#and converts the string to a list permanently
#Set is used here because the amenities list was pulled from both the summary page and the individual posting

for i, row in main_df.iterrows():
    main_df.at[i,'extra'] = list(filter(None, list(set(ast.literal_eval(main_df.extra[i])))))
main_df

Unnamed: 0,address,bath,bedrooms,borough,extra,nearest_subway,neighborhood,rent,sqft,median_income,percent_high_earner,zipcode
0,453 Milford Street,1.0,3,Brooklyn,"[Dishwasher, Featured, Microwave, Dogs Allowed...",0.50,New Lots,2200,0,63700.0,4.4,11236.0
1,37 King St,1.0,1,Manhattan,"[Pre-war Building, Elevator, Renovated, Dishwa...",0.06,Hudson Square,4500,0,106056.0,33.4,10013.0
2,Hudson St,2.0,2,Manhattan,[],0.18,Hudson Square,9395,0,106056.0,33.4,10013.0
3,600 West 42nd Street,1.0,1,Manhattan,"[Doorman, Fitness Center, Floorplans Available...",0.38,Hell's Kitchen,4300,0,93707.0,19.8,10036.0
4,785 Ninth Avenue 3c,1.0,2,Manhattan,"[Reduced Fee, Featured, brand new renovations,...",0.21,Hell's Kitchen,3199,700,93707.0,19.8,10036.0
5,785 Ninth Avenue 3c,1.0,2,Manhattan,"[Reduced Fee, Featured, brand new renovations,...",0.21,Hell's Kitchen,3199,700,93707.0,19.8,10036.0
6,785 Ninth Avenue 3c,1.0,2,Manhattan,"[Reduced Fee, Featured, brand new renovations,...",0.21,Hell's Kitchen,3199,700,93707.0,19.8,10036.0
7,312 W 58th Street,2.0,4,Manhattan,"[Laundry In Building, No Fee, Featured, Laundr...",0.08,Hell's Kitchen,7150,1,93707.0,19.8,10036.0
8,321 W 47th St,2.0,2,Manhattan,"[Exposed Brick, Dishwasher, No Fee, Featured, ...",0.17,Hell's Kitchen,3500,0,93707.0,19.8,10036.0
9,524 West 50th Street,1.0,Studio,Manhattan,"[Laundry in Building, Laundry In Building, Pre...",0.41,Hell's Kitchen,1999,500,93707.0,19.8,10036.0


In [18]:
#This block of code creates a list of unique amenity values from the extra's column
all_amenities = []

for row in list(main_df.extra):
    all_amenities +=row

list(set(all_amenities))

['Cold Storage',
 'High Speed Internet',
 'Atm Machine',
 'ROOF TOP!',
 'Pets On Approval',
 'stone countertops',
 'A lot of light',
 'Assigned Parking',
 'Sprinkler System',
 'R',
 'pet friendly',
 'Steps to the E',
 'Landscaped Courtyard',
 'Tons Of Light',
 'Elevator\n \n\nFitness Center',
 'Two Blocks from L train at Morgan Ave',
 'Laundry in Apartment',
 'Sponsor Unit',
 'On-site Parking',
 'High Ceilings Hardwood Floors Video Intercom Natural Light in Living Area Outdoor Space Private backyard Stainless Steel Appliances Duplex Basement Roof access Laundry in building Exposed Brick',
 'Hot Tub',
 'Roof sundeck',
 'Conveniently located minutes',
 'Common Garden',
 'Resident manager',
 'updated bathroom',
 "Chef's Catering Kitchen",
 'Elevator building',
 'Renovated Kitchen',
 'POOL',
 'Floor To Ceiling Windows',
 'Fios Gigabit Connection',
 'Hamman With Plunge Pool',
 'Laundry on every floor',
 'ATM in the building',
 'Yard',
 'CONCIERGE',
 'Direct Feed',
 'Convertible',
 'Washer &

In [19]:
len(list(set(all_amenities)))

1364

In [20]:
#I used the counter library to create a count of the values from the amenities list and their frequencies
from collections import Counter
count_amenity = Counter(list(all_amenities))
count_amenity.most_common()

[('Cats Allowed', 4186),
 ('Elevator', 4064),
 ('No Fee', 4059),
 ('Dogs Allowed', 3884),
 ('Laundry In Building', 3764),
 ('Doorman', 3248),
 ('Dishwasher', 2576),
 ('Hardwood Floors', 2538),
 ('Fitness Center', 2519),
 ('Renovated', 2234),
 ('Light', 2177),
 ('Common Outdoor Space', 2159),
 ('High Ceilings', 1950),
 ('Laundry in Unit', 1865),
 ('Granite Kitchen', 1782),
 ('Laundry In Unit', 1769),
 ('Marble Bath', 1758),
 ('Subway', 1482),
 ('Exclusive', 1352),
 ('Featured', 1230),
 ('Garage', 1196),
 ('Floorplans Available', 1169),
 ('Valet', 1037),
 ('Pre-War', 1032),
 ('Eat In Kitchen', 1023),
 ('Walk In Closet', 1001),
 ('Deck', 934),
 ('Lounge', 914),
 ('Pre-war', 909),
 ('Concierge', 900),
 ('Storage Facility', 812),
 ('High Speed Internet', 777),
 ('Stainless Steel Appliances', 739),
 ('Outdoor Space', 732),
 ('Bicycle Room', 713),
 ('Dining Room', 681),
 ('Storage', 592),
 ('Diplomats Ok', 550),
 ('Receiving Room', 549),
 ('Microwave', 527),
 ('Private Outdoor Space', 492),
 

In [21]:
main_df.loc[main_df.index==1]

Unnamed: 0,address,bath,bedrooms,borough,extra,nearest_subway,neighborhood,rent,sqft,median_income,percent_high_earner,zipcode
1,37 King St,1.0,1,Manhattan,"[Pre-war Building, Elevator, Renovated, Dishwa...",0.06,Hudson Square,4500,0,106056.0,33.4,10013.0


In [22]:
#The following code will make the elements of the extras column uniform by making all elements title case, removing spaces from the first line, ect.

for i, row in main_df.iterrows():

    main_df.at[i,'extra'] = list(set(list(map(lambda element: str(element.strip().title().replace('-', ' ')), main_df.extra[i]))))
    #test.at[i,'extra'] = list(filter(None, list(set(ast.literal_eval(test.extra[i])))))
    #the above cleaned duplicates in the extras column
main_df

Unnamed: 0,address,bath,bedrooms,borough,extra,nearest_subway,neighborhood,rent,sqft,median_income,percent_high_earner,zipcode
0,453 Milford Street,1.0,3,Brooklyn,"[Dishwasher, Exclusive, Featured, Dogs Allowed...",0.50,New Lots,2200,0,63700.0,4.4,11236.0
1,37 King St,1.0,1,Manhattan,"[Intercom, Elevator, Renovated, Dishwasher, Fe...",0.06,Hudson Square,4500,0,106056.0,33.4,10013.0
2,Hudson St,2.0,2,Manhattan,[],0.18,Hudson Square,9395,0,106056.0,33.4,10013.0
3,600 West 42nd Street,1.0,1,Manhattan,"[Doorman, Fitness Center, Floorplans Available...",0.38,Hell's Kitchen,4300,0,93707.0,19.8,10036.0
4,785 Ninth Avenue 3c,1.0,2,Manhattan,"[Brand New Renovations, Reduced Fee, Featured,...",0.21,Hell's Kitchen,3199,700,93707.0,19.8,10036.0
5,785 Ninth Avenue 3c,1.0,2,Manhattan,"[Brand New Renovations, Reduced Fee, Featured,...",0.21,Hell's Kitchen,3199,700,93707.0,19.8,10036.0
6,785 Ninth Avenue 3c,1.0,2,Manhattan,"[Brand New Renovations, Reduced Fee, Featured,...",0.21,Hell's Kitchen,3199,700,93707.0,19.8,10036.0
7,312 W 58th Street,2.0,4,Manhattan,"[Laundry In Building, No Fee, Featured, Laundr...",0.08,Hell's Kitchen,7150,1,93707.0,19.8,10036.0
8,321 W 47th St,2.0,2,Manhattan,"[Exposed Brick, Dishwasher, No Fee, Featured, ...",0.17,Hell's Kitchen,3500,0,93707.0,19.8,10036.0
9,524 West 50th Street,1.0,Studio,Manhattan,"[Featured, Laundry In Building, Pre War Laundr...",0.41,Hell's Kitchen,1999,500,93707.0,19.8,10036.0


In [23]:
type(main_df.extra[0])
main_df.extra[0]

['Dishwasher',
 'Exclusive',
 'Featured',
 'Dogs Allowed',
 'Microwave',
 'Cats Allowed',
 'Guarantors Accepted']

In [24]:
#This function removes items from the amenities list that are not supposed to be there
#The function takes in one of items to remove and deletes it from each row's amenities' list

def remove_item_from_extra(item_to_remove):
    for i, row in main_df.iterrows():
        if item_to_remove in main_df['extra'][i]:
            main_df['extra'][i].remove(item_to_remove)

In [25]:
#This code creates the list of items to remove and runs a for loop through the list and the dataframe to remove them
item_remove = ['Featured', 'Light', 'No Pets', 'Subway', 'Exclusive', 'Floorplans Available']

for item in item_remove:
    remove_item_from_extra(item)

In [26]:
main_df.extra[0]

['Dishwasher',
 'Dogs Allowed',
 'Microwave',
 'Cats Allowed',
 'Guarantors Accepted']

In [27]:
#The following code will replace elements of the extras column lists if necessary
#First the element to be removed is removed, then the replacement value is appended

def replace_elements_in_extra_column(item_replace, replace_value):
    for i, row in main_df.iterrows():
        if item_replace in main_df['extra'][i]:
            main_df['extra'][i].remove(item_replace)
            main_df['extra'][i].append(replace_value)

In [28]:
#This block of code creates an amenities list that contains the amenity and frequency in descending order.
#I repeat this code a few times for reference
all_amenities = []

for row in list(main_df.extra):
    all_amenities +=row

count_amenity = Counter(list(all_amenities))
count_amenity.most_common()

[('Cats Allowed', 4204),
 ('Elevator', 4064),
 ('No Fee', 4059),
 ('Dogs Allowed', 3899),
 ('Laundry In Building', 3819),
 ('Doorman', 3249),
 ('Dishwasher', 2626),
 ('Hardwood Floors', 2538),
 ('Fitness Center', 2529),
 ('Renovated', 2251),
 ('Common Outdoor Space', 2167),
 ('High Ceilings', 1967),
 ('Laundry In Unit', 1868),
 ('Granite Kitchen', 1796),
 ('Marble Bath', 1766),
 ('Garage', 1206),
 ('Valet', 1042),
 ('Pre War', 1037),
 ('Eat In Kitchen', 1029),
 ('Walk In Closet', 1022),
 ('Deck', 936),
 ('Lounge', 922),
 ('Concierge', 909),
 ('Storage Facility', 817),
 ('High Speed Internet', 779),
 ('Stainless Steel Appliances', 757),
 ('Outdoor Space', 742),
 ('Bicycle Room', 717),
 ('Dining Room', 683),
 ('Storage', 596),
 ('Diplomats Ok', 560),
 ('Receiving Room', 551),
 ('Microwave', 535),
 ('Private Outdoor Space', 502),
 ('New Construction', 492),
 ('City View', 467),
 ('Balcony', 415),
 ('Live In Super', 413),
 ('Business Center', 401),
 ('Swimming Pool', 391),
 ('Open View', 3

In [29]:
#This for loop makes the amenities uniform. There are 1364 unique amenities, however several were entered
#differently or incorrectly by the users of the RentHop site
#This is a list of lists where the first entry is the item to be replaced and the second is the replacement value
item_replace = [['Pets   Dogs Ok', 'Dogs Allowed'],['Fitness Center', 'Gym'], ['Washer/Dryer', 'Laundry In Unit'], ['Garage', 'Parking'], ['Pre War Building','Pre War']]

for item in item_replace:
    replace_elements_in_extra_column(item[0], item[1])

In [30]:
main_df.extra[3]

['Doorman',
 'Heat/Hot Water Included',
 'Elevator',
 'Dishwasher',
 'By Owner',
 'Dogs Allowed',
 'Cats Allowed',
 'Gym']

In [31]:
import matplotlib as plt
from matplotlib import style
import seaborn as sns

ax = sns.scatterplot(x= 'bath', y='rent', data = main_df, color = 'green')
#this scatterplot chart showing the price over time

In [32]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6538 entries, 0 to 6656
Data columns (total 12 columns):
address                6473 non-null object
bath                   6538 non-null float64
bedrooms               6538 non-null object
borough                6538 non-null object
extra                  6538 non-null object
nearest_subway         6538 non-null float64
neighborhood           6538 non-null object
rent                   6538 non-null int64
sqft                   6538 non-null int64
median_income          6538 non-null float64
percent_high_earner    6538 non-null float64
zipcode                6538 non-null float64
dtypes: float64(5), int64(2), object(5)
memory usage: 984.0+ KB


In [33]:
#This block of code first creates a list of all amenities (with repeated vales)
#Then a new list is created with a counter applied
#Finally a new list is created without the counter so that I can view the most popular items to create columns

all_amenities_unsort = []

for row in list(main_df.extra):
    all_amenities_unsort += row

count_amenity = Counter(list(all_amenities_unsort))
all_amenities_count = count_amenity.most_common()

all_amenities = []

for row in list(all_amenities_count):
    all_amenities.append(row[0])
    
all_amenities


['Cats Allowed',
 'Elevator',
 'No Fee',
 'Dogs Allowed',
 'Laundry In Building',
 'Doorman',
 'Gym',
 'Dishwasher',
 'Hardwood Floors',
 'Renovated',
 'Common Outdoor Space',
 'High Ceilings',
 'Laundry In Unit',
 'Granite Kitchen',
 'Marble Bath',
 'Parking',
 'Pre War',
 'Valet',
 'Eat In Kitchen',
 'Walk In Closet',
 'Deck',
 'Lounge',
 'Concierge',
 'Storage Facility',
 'High Speed Internet',
 'Stainless Steel Appliances',
 'Outdoor Space',
 'Bicycle Room',
 'Dining Room',
 'Storage',
 'Diplomats Ok',
 'Receiving Room',
 'Microwave',
 'Private Outdoor Space',
 'New Construction',
 'City View',
 'Balcony',
 'Live In Super',
 'Business Center',
 'Swimming Pool',
 'Open View',
 'Wheelchair Access',
 'Reduced Fee',
 'Convertible',
 'Terrace',
 'Parking Space',
 'One Month Free',
 'Nursery',
 'Open Kitchen',
 'Garden/Patio',
 'Brownstone',
 'Loft',
 'Private Deck',
 'Roof Deck',
 'By Owner',
 'Virtual Doorman',
 'River View',
 'Children Playroom',
 'Laundry Room',
 'Furnished',
 'Firep

In [34]:
#This function creates a column with row values of zero given the column name
def create_column_zeroes(column_name):
    main_df[column_name] = pd.Series([0 for x in range(len(main_df.index))])

In [35]:
#This for loop create columns of the twenty five amenities

for amenity in all_amenities[0:25]:
    create_column_zeroes(amenity)


In [36]:
#This function changes the value of the amenities columns to a 1 if the entry list has that amenity
#Get dummies could not work here because the extras column contains a list

def return_boolean_value_amenity(amenity_string):
    main_df[amenity_string] = main_df.apply(lambda row: 1 if amenity_string in row['extra'] else 0, axis=1)

In [37]:
#Iterates the previous function through all of the twenty five column names
for amenity in all_amenities[0:25]:
    return_boolean_value_amenity(amenity)

In [38]:
#Tests to see if the above code works
main_df.loc[1]

address                                                        37 King St
bath                                                                    1
bedrooms                                                                1
borough                                                         Manhattan
extra                   [Intercom, Elevator, Renovated, Dishwasher, Ca...
nearest_subway                                                       0.06
neighborhood                                                Hudson Square
rent                                                                 4500
sqft                                                                    0
median_income                                                      106056
percent_high_earner                                                  33.4
zipcode                                                             10013
Cats Allowed                                                            1
Elevator                              

In [39]:
main_df.Dishwasher[0]

1

In [40]:
main_df['Cats Allowed'][0]

1

In [41]:
main_df['Dogs Allowed'][0]

1

In [42]:
#This block of code creates a dictionary of column names as the key and the column names with underscores as the
# value so that I can run an OLS regression later
cleaned_dummies = []
for names in list(main_df.columns)[12:]:
    cleaned_dummies.append(names.replace(' ', '_'))
    
new_column_dict = dict(zip(list(main_df.columns[12:]), cleaned_dummies))

In [43]:
#This line applies the code that renames the column to the main dataframe
main_df = main_df.rename(columns=new_column_dict)
main_df

Unnamed: 0,address,bath,bedrooms,borough,extra,nearest_subway,neighborhood,rent,sqft,median_income,...,Parking,Pre_War,Valet,Eat_In_Kitchen,Walk_In_Closet,Deck,Lounge,Concierge,Storage_Facility,High_Speed_Internet
0,453 Milford Street,1.0,3,Brooklyn,"[Dishwasher, Dogs Allowed, Microwave, Cats All...",0.50,New Lots,2200,0,63700.0,...,0,0,0,0,0,0,0,0,0,0
1,37 King St,1.0,1,Manhattan,"[Intercom, Elevator, Renovated, Dishwasher, Ca...",0.06,Hudson Square,4500,0,106056.0,...,0,1,0,0,0,0,0,0,0,0
2,Hudson St,2.0,2,Manhattan,[],0.18,Hudson Square,9395,0,106056.0,...,0,0,0,0,0,0,0,0,0,0
3,600 West 42nd Street,1.0,1,Manhattan,"[Doorman, Heat/Hot Water Included, Elevator, D...",0.38,Hell's Kitchen,4300,0,93707.0,...,0,0,0,0,0,0,0,0,0,0
4,785 Ninth Avenue 3c,1.0,2,Manhattan,"[Brand New Renovations, Reduced Fee, Pre War, ...",0.21,Hell's Kitchen,3199,700,93707.0,...,0,1,0,0,0,0,0,0,0,0
5,785 Ninth Avenue 3c,1.0,2,Manhattan,"[Brand New Renovations, Reduced Fee, Pre War, ...",0.21,Hell's Kitchen,3199,700,93707.0,...,0,1,0,0,0,0,0,0,0,0
6,785 Ninth Avenue 3c,1.0,2,Manhattan,"[Brand New Renovations, Reduced Fee, Pre War, ...",0.21,Hell's Kitchen,3199,700,93707.0,...,0,1,0,0,0,0,0,0,0,0
7,312 W 58th Street,2.0,4,Manhattan,"[Laundry In Building, No Fee, Laundry In Unit,...",0.08,Hell's Kitchen,7150,1,93707.0,...,0,0,0,0,0,0,0,0,0,0
8,321 W 47th St,2.0,2,Manhattan,"[Exposed Brick, Dishwasher, No Fee, Dogs Allow...",0.17,Hell's Kitchen,3500,0,93707.0,...,0,0,0,0,0,0,0,0,0,0
9,524 West 50th Street,1.0,Studio,Manhattan,"[Laundry In Building, Pre War Laundry On Every...",0.41,Hell's Kitchen,1999,500,93707.0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
#I review all the unique alues and entries in the bedrooms column and make the values uniform in the columns below
#This line of code lists the unique values in the bedrooms column
main_df.bedrooms.unique()

array(['3', '1', '2', '4', 'Studio', '3Flex 4 ', 'StudioFlex 1 ',
       ' Studio', '2Flex 3 ', '4Flex 5 ', '1Flex 2 ', '2 ', ' 1', '1 ',
       '5', 'Studio ', ' 2', 'Loft', '3 ', 'Room', 'RoomFlex 0 ',
       'LoftFlex -1 ', '7', '4 ', ' 3', '6', ' 2Flex 3 '], dtype=object)

In [45]:
#Counts the number of rooms where the bedroom row is 'Room'
#I drop these rows later because they are not uniformly one bedrooms

#main_df.bedrooms.loc[main_df['bedrooms'] == 'Room'].count()
df.loc[df['bedrooms'] == 'Room']
df.loc[2193].url

'https://www.renthop.com/listings/1722-amsterdam-ave/3r/14540686'

In [46]:
#This line creates a dummy column for Studio apartments
#This way the bedrooms column can be full of integers and the dummy column will differentiate between them

main_df['Studio'] = pd.Series([0 for x in range(len(main_df.index))])
main_df

Unnamed: 0,address,bath,bedrooms,borough,extra,nearest_subway,neighborhood,rent,sqft,median_income,...,Pre_War,Valet,Eat_In_Kitchen,Walk_In_Closet,Deck,Lounge,Concierge,Storage_Facility,High_Speed_Internet,Studio
0,453 Milford Street,1.0,3,Brooklyn,"[Dishwasher, Dogs Allowed, Microwave, Cats All...",0.50,New Lots,2200,0,63700.0,...,0,0,0,0,0,0,0,0,0,0.0
1,37 King St,1.0,1,Manhattan,"[Intercom, Elevator, Renovated, Dishwasher, Ca...",0.06,Hudson Square,4500,0,106056.0,...,1,0,0,0,0,0,0,0,0,0.0
2,Hudson St,2.0,2,Manhattan,[],0.18,Hudson Square,9395,0,106056.0,...,0,0,0,0,0,0,0,0,0,0.0
3,600 West 42nd Street,1.0,1,Manhattan,"[Doorman, Heat/Hot Water Included, Elevator, D...",0.38,Hell's Kitchen,4300,0,93707.0,...,0,0,0,0,0,0,0,0,0,0.0
4,785 Ninth Avenue 3c,1.0,2,Manhattan,"[Brand New Renovations, Reduced Fee, Pre War, ...",0.21,Hell's Kitchen,3199,700,93707.0,...,1,0,0,0,0,0,0,0,0,0.0
5,785 Ninth Avenue 3c,1.0,2,Manhattan,"[Brand New Renovations, Reduced Fee, Pre War, ...",0.21,Hell's Kitchen,3199,700,93707.0,...,1,0,0,0,0,0,0,0,0,0.0
6,785 Ninth Avenue 3c,1.0,2,Manhattan,"[Brand New Renovations, Reduced Fee, Pre War, ...",0.21,Hell's Kitchen,3199,700,93707.0,...,1,0,0,0,0,0,0,0,0,0.0
7,312 W 58th Street,2.0,4,Manhattan,"[Laundry In Building, No Fee, Laundry In Unit,...",0.08,Hell's Kitchen,7150,1,93707.0,...,0,0,0,0,0,0,0,0,0,0.0
8,321 W 47th St,2.0,2,Manhattan,"[Exposed Brick, Dishwasher, No Fee, Dogs Allow...",0.17,Hell's Kitchen,3500,0,93707.0,...,0,0,0,0,0,0,0,0,0,0.0
9,524 West 50th Street,1.0,Studio,Manhattan,"[Laundry In Building, Pre War Laundry On Every...",0.41,Hell's Kitchen,1999,500,93707.0,...,0,0,0,0,0,0,0,0,0,0.0


In [47]:
#I make all the values for bedrooms uniform here with numerics
#Flex partments count for their lower value in my analysis
#Studio apartments will become one-bedroom apartments with a studio value

main_df = main_df.replace({'bedrooms': '2Flex 3 '}, {'bedrooms':  2})
main_df = main_df.replace({'bedrooms': '4Flex 5 '}, {'bedrooms':  4})
main_df = main_df.replace({'bedrooms': '1Flex 2 '}, {'bedrooms': 1})
main_df = main_df.replace({'bedrooms': 'Loft'}, {'bedrooms': 1})
main_df = main_df.replace({'bedrooms': 'LoftFlex -1 '}, {'bedrooms': 1})
main_df = main_df.replace({'bedrooms': ' 2Flex 3 '}, {'bedrooms': 2})
main_df = main_df.replace({'bedrooms': '3'}, {'bedrooms': 3})
main_df = main_df.replace({'bedrooms': '1'}, {'bedrooms': 1})
main_df = main_df.replace({'bedrooms': '2'}, {'bedrooms': 2})
main_df = main_df.replace({'bedrooms': '4'}, {'bedrooms': 4})
main_df = main_df.replace({'bedrooms': '2 '}, {'bedrooms': 2})
main_df = main_df.replace({'bedrooms': ' 1'}, {'bedrooms': 1})
main_df = main_df.replace({'bedrooms': '1 '}, {'bedrooms':  2})
main_df = main_df.replace({'bedrooms': '5'}, {'bedrooms': 5})
main_df = main_df.replace({'bedrooms': ' 2'}, {'bedrooms': 2})
main_df = main_df.replace({'bedrooms': '3 '}, {'bedrooms': 3})
main_df = main_df.replace({'bedrooms': '7'}, {'bedrooms': 7})
main_df = main_df.replace({'bedrooms': '4 '}, {'bedrooms': 4})
main_df = main_df.replace({'bedrooms': ' 3'}, {'bedrooms':  3})
main_df = main_df.replace({'bedrooms': '6'}, {'bedrooms': 6})
main_df = main_df.replace({'bedrooms': 'StudioFlex 1 '}, {'bedrooms': 'Studio'})
main_df = main_df.replace({'bedrooms': ' Studio'}, {'bedrooms': 'Studio'})
main_df = main_df.replace({'bedrooms': 'Studio '}, {'bedrooms': 'Studio'})
main_df = main_df.replace({'bedrooms': '3Flex 4 '}, {'bedrooms': 3})

In [48]:
main_df.bedrooms.unique()

array([3, 1, 2, 4, 'Studio', 5, 'Room', 'RoomFlex 0 ', 7, 6], dtype=object)

In [49]:
#I remove the invalid rows here
main_df = main_df[main_df.bedrooms != 'Room']
main_df = main_df[main_df.bedrooms != 'RoomFlex 0 ']
main_df.bedrooms.unique()

array([3, 1, 2, 4, 'Studio', 5, 7, 6], dtype=object)

In [50]:
#Now that the dummies column was created I change the values in the Studio column to a one or zero
#main_df = main_df.replace({'bedrooms': 'Studio'}, {'Studio': 1})
main_df['Studio'][main_df.bedrooms == 'Studio'] = 1

In [51]:
main_df.loc[main_df['Studio'] == 1]

Unnamed: 0,address,bath,bedrooms,borough,extra,nearest_subway,neighborhood,rent,sqft,median_income,...,Pre_War,Valet,Eat_In_Kitchen,Walk_In_Closet,Deck,Lounge,Concierge,Storage_Facility,High_Speed_Internet,Studio
9,524 West 50th Street,1.0,Studio,Manhattan,"[Laundry In Building, Pre War Laundry On Every...",0.41,Hell's Kitchen,1999,500,93707.0,...,0,0,0,0,0,0,0,0,0,1.0
33,445 West 36th Street,1.0,Studio,Manhattan,[],0.25,Hell's Kitchen,1950,0,93707.0,...,0,0,0,0,0,0,0,0,0,1.0
38,445 West 36th Street,1.0,Studio,Manhattan,[],0.25,Hell's Kitchen,1950,0,93707.0,...,0,0,0,0,0,0,0,0,0,1.0
51,West 48th Street,1.0,Studio,Manhattan,"[Laundry In Building, Doorman, Storage Facilit...",0.44,Hell's Kitchen,2677,0,93707.0,...,0,0,0,0,0,0,0,1,0,1.0
66,West 57th Street,1.0,Studio,Manhattan,"[Laundry In Building, High Ceilings, Doorman, ...",0.27,Hell's Kitchen,2500,0,93707.0,...,1,0,0,0,0,0,0,0,0,1.0
67,West 57th Street,1.0,Studio,Manhattan,"[View, Doorman, On Site Laundry, Bike Room, El...",0.56,Hell's Kitchen,3044,0,93707.0,...,0,0,0,0,0,0,0,0,0,1.0
68,West 38th Street,1.0,Studio,Manhattan,"[Granite Kitchen, Deck, Doorman, High Ceilings...",0.24,Hell's Kitchen,3400,0,93707.0,...,0,0,1,0,1,0,0,0,0,1.0
73,West 42nd Street,1.0,Studio,Manhattan,"[Valet, Swimming Pool, Laundry In Unit, Balcon...",0.42,Hell's Kitchen,2950,0,93707.0,...,0,1,0,0,1,1,1,0,0,1.0
74,605 West 42nd Street,1.0,Studio,Manhattan,"[Storage Facility, Doorman, Elevator, No Fee, ...",0.42,Hell's Kitchen,3335,0,93707.0,...,0,0,0,0,0,0,0,1,0,1.0
98,W 57th St,1.0,Studio,Manhattan,"[Lounge, Roof Deck, Verizon Fios Enabled, Pack...",0.54,Hell's Kitchen,3044,0,93707.0,...,0,0,0,0,0,1,0,0,0,1.0


In [52]:
#Now that the Studio dummy column has a one or zero I can replace the bedroom values that equal 'Studio' to equal 1
main_df = main_df.replace({'bedrooms': 'Studio'}, {'bedrooms': 1})
main_df.bedrooms.unique()

array([3, 1, 2, 4, 5, 7, 6])

In [53]:
#I now clean the Borough column
main_df.borough.unique()

array(['Brooklyn', 'Manhattan', 'Manhattan ', 'Queens', 'Gramercy',
       'Midtown West', 'Brooklyn ', 'Queens ', 'Bronx'], dtype=object)

In [54]:
main_df = main_df[main_df.borough != 'Gramercy']
main_df = main_df[main_df.borough != 'Midtown West']
main_df = main_df[main_df.borough != 'Jersey City']
main_df = main_df[main_df.borough != 'Staten Island']
main_df = main_df[main_df.borough != 'Town']
main_df = main_df[main_df.borough != 'Clarion']
main_df = main_df[main_df.borough != 'Orange County']
main_df = main_df[main_df.borough != 'Madison']

main_df = main_df.replace({'borough': 'Manhattan '}, {'borough': 'Manhattan'})
main_df = main_df.replace({'borough': 'Brooklyn '}, {'borough': 'Brooklyn'})
main_df = main_df.replace({'borough': 'Queens '}, {'borough': 'Queens'})

main_df.borough.unique()

array(['Brooklyn', 'Manhattan', 'Queens', 'Bronx'], dtype=object)

In [55]:
all_boroughs = []

for row in list(main_df.borough):
    all_boroughs.append(row)

count_borough = Counter(list(all_boroughs))
count_borough.most_common()

[('Manhattan', 4903), ('Brooklyn', 1026), ('Queens', 554), ('Bronx', 31)]

In [56]:
#I choose to drop the Bronx from my dataset becuase I do not believe that I have enough datapoints to fairly evaluate them

main_df = main_df[main_df.borough != 'Bronx']
main_df.borough.value_counts()

Manhattan    4903
Brooklyn     1026
Queens        554
Name: borough, dtype: int64

In [57]:
#I create a dummies category for the borough
dummy = pd.get_dummies(main_df['borough'])
dummy.head()

Unnamed: 0,Brooklyn,Manhattan,Queens
0,1,0,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0


In [58]:
#I concat the two dataframes
main_df = pd.concat([main_df, dummy], axis = 1)

In [59]:
#This block of code creates dummies for the neighborhood of the postings

dummy = pd.get_dummies(main_df['neighborhood'])
dummy.head()

Unnamed: 0,Alphabet City,Arverne,Astoria,Battery Park City,Bay Ridge,Bedford-Stuyvesant,Bensonhurst,Beverley Square East,Boerum Hill,Borough Park,...,Upper West Side,Vinegar Hill,Washington Heights,Weeksville,West Village,Williamsburg,Windsor Terrace,Wingate,Woodside,Yorkville
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
#I concat the two dataframes
main_df = pd.concat([main_df, dummy], axis = 1)
main_df.head()

Unnamed: 0,address,bath,bedrooms,borough,extra,nearest_subway,neighborhood,rent,sqft,median_income,...,Upper West Side,Vinegar Hill,Washington Heights,Weeksville,West Village,Williamsburg,Windsor Terrace,Wingate,Woodside,Yorkville
0,453 Milford Street,1.0,3,Brooklyn,"[Dishwasher, Dogs Allowed, Microwave, Cats All...",0.5,New Lots,2200,0,63700.0,...,0,0,0,0,0,0,0,0,0,0
1,37 King St,1.0,1,Manhattan,"[Intercom, Elevator, Renovated, Dishwasher, Ca...",0.06,Hudson Square,4500,0,106056.0,...,0,0,0,0,0,0,0,0,0,0
2,Hudson St,2.0,2,Manhattan,[],0.18,Hudson Square,9395,0,106056.0,...,0,0,0,0,0,0,0,0,0,0
3,600 West 42nd Street,1.0,1,Manhattan,"[Doorman, Heat/Hot Water Included, Elevator, D...",0.38,Hell's Kitchen,4300,0,93707.0,...,0,0,0,0,0,0,0,0,0,0
4,785 Ninth Avenue 3c,1.0,2,Manhattan,"[Brand New Renovations, Reduced Fee, Pre War, ...",0.21,Hell's Kitchen,3199,700,93707.0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
list(main_df.columns)

['address',
 'bath',
 'bedrooms',
 'borough',
 'extra',
 'nearest_subway',
 'neighborhood',
 'rent',
 'sqft',
 'median_income',
 'percent_high_earner',
 'zipcode',
 'Cats_Allowed',
 'Elevator',
 'No_Fee',
 'Dogs_Allowed',
 'Laundry_In_Building',
 'Doorman',
 'Gym',
 'Dishwasher',
 'Hardwood_Floors',
 'Renovated',
 'Common_Outdoor_Space',
 'High_Ceilings',
 'Laundry_In_Unit',
 'Granite_Kitchen',
 'Marble_Bath',
 'Parking',
 'Pre_War',
 'Valet',
 'Eat_In_Kitchen',
 'Walk_In_Closet',
 'Deck',
 'Lounge',
 'Concierge',
 'Storage_Facility',
 'High_Speed_Internet',
 'Studio',
 'Brooklyn',
 'Manhattan',
 'Queens',
 'Alphabet City',
 'Arverne',
 'Astoria',
 'Battery Park City',
 'Bay Ridge',
 'Bedford-Stuyvesant',
 'Bensonhurst',
 'Beverley Square East',
 'Boerum Hill',
 'Borough Park',
 'Bowery',
 'Briarwood',
 'Brooklyn Heights',
 'Bushwick',
 'Carnegie Hill',
 'Carroll Gardens',
 'Central Harlem',
 'Central Park',
 'Central Slope',
 'Chelsea',
 'Chinatown',
 'Civic Center',
 'Clinton Hill',


In [62]:
main_df['East Williamsburg'].value_counts()

0    6421
1      62
Name: East Williamsburg, dtype: int64

In [None]:
main_df.to_csv("neighfinal_listing_dataframe.csv", index=False)