In [5]:
import json
import os
import sys
sys.path.append("/anaconda/lib/python2.7/site-packages")

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cmx
import matplotlib.colors as colors
%matplotlib inline

#  CALCULATING TOTAL TAXES FOR 2015-16 FISCAL YEAR

In [1]:
# Snapshot dates for which a JSON file is available and used in the tax totals.
# NOTE(review): together with the two interpolated months below this covers
# 15 months (Apr 2015 - Jun 2016) -- confirm that is the intended fiscal window.
wanted_jsons = [
    "2015-04-01", "2015-05-01", "2015-06-01", "2015-08-01", "2015-09-01",
    "2015-10-01", "2015-11-01", "2015-12-02", "2016-01-01", "2016-02-02",
    "2016-04-03", "2016-05-02", "2016-06-02",
]

# Months without a snapshot file, mapped to the (earlier, later) snapshots
# whose values are averaged to stand in for them.
missing_months_dict = {
    "2015-07-01": ("2015-06-01", "2015-08-01"),
    "2016-03-01": ("2016-02-02", "2016-04-03"),
}

In [33]:
# For every snapshot we have on disk, total the tax columns into a city figure
# (NYC sales + hotel occupancy) and a state figure (Javits + MCTD + state sales).
city_tax_columns = ("tax_NYC_sales", "tax_hotel_occupancy")
state_tax_columns = ("tax_javits", "tax_mctd", "tax_state_sales")

city_taxes_by_date = {}
state_taxes_by_date = {}
for wanted in wanted_jsons:
    json_path = "data/json_files_by_date/" + wanted + ".json"
    df = pd.read_json(json_path)
    # Built-in sum is kept on purpose so the accumulation order is unchanged.
    city_taxes_by_date[wanted] = sum(sum(df[col]) for col in city_tax_columns)
    state_taxes_by_date[wanted] = sum(sum(df[col]) for col in state_tax_columns)

In [34]:
# Estimate each missing month as the mean of the snapshots on either side of it.
for date, (before, after) in missing_months_dict.items():
    city_taxes_by_date[date] = 0.5 * (city_taxes_by_date[before] + city_taxes_by_date[after])
    state_taxes_by_date[date] = 0.5 * (state_taxes_by_date[before] + state_taxes_by_date[after])

# Write one "date,city,state" row per month; the keys are ISO dates, so a plain
# string sort puts them in chronological order.
# NOTE(review): the header has a space after each comma while the data rows do
# not; it is left unchanged because the consumers of hi.csv are not visible here.
with open("hi.csv", "w") as f:
    f.write("date, city, state\n")
    for date in sorted(city_taxes_by_date):
        f.write(date + "," + str(city_taxes_by_date[date]) + "," + str(state_taxes_by_date[date]) + "\n")
# No explicit f.close(): leaving the with-block already closes the file.

In [6]:
# Names of the snapshot files in the data directory; only entries containing
# "-" are kept. os.listdir returns names in arbitrary order, so the list is
# sorted to make processing and progress output deterministic.
dfiles = sorted(x for x in os.listdir("data/json_files_by_date/") if "-" in x)

In [48]:
# Load a single snapshot so its contents can be inspected in the cells below.
thing = pd.read_json("data/json_files_by_date/2015-03-01.json")

In [50]:
# Show which columns the snapshot provides.
thing.columns

Index([u'accommodates', u'bathrooms', u'bedrooms', u'borough',
       u'calculated_host_listings_count', u'collected', u'date', u'host_id',
       u'host_location', u'illegal', u'latitude', u'longitude', u'min_stay',
       u'neighborhood', u'num_reviews', u'price', u'property_type', u'rating',
       u'room_id', u'room_type', u'source', u'tax_NYC_sales',
       u'tax_hotel_occupancy', u'tax_javits', u'tax_mctd', u'tax_state_sales',
       u'tax_total'],
      dtype='object')

In [53]:
# Show the distinct values of the room_type column.
thing["room_type"].unique()

array([u'Entire home/apt', u'Private room', u'Shared room'], dtype=object)

In [63]:
# Fraction of rows whose 'illegal' value is present: NaN never compares equal
# to itself, so the self-comparison is False exactly on missing entries.
illegal_present = thing['illegal'] == thing['illegal']
len(thing[illegal_present]) / float(len(thing))

1.0

In [35]:
# Columns treated as categorical (their distinct values are enumerated) and
# columns treated as continuous.
cat_variables = ["illegal", "room_type"]
cont_variables = ["price"]

# Categorical column name -> set of distinct values, starting out empty.
all_var_values = {var: set() for var in cat_variables}

# Accumulators for the distinct neighborhood and borough names.
neighborhoods, boroughs = set(), set()

In [33]:
# Add every neighborhood name found in the 2015-01-01 snapshot to the set.
first_snapshot = pd.read_json("data/json_files_by_date/2015-01-01.json")
neighborhoods.update(first_snapshot["neighborhood"].unique())

In [36]:
# Scan every snapshot file and accumulate the distinct values of each
# categorical column, plus all neighborhood and borough names.
for fname in dfiles:
    snapshot_df = pd.read_json("data/json_files_by_date/" + fname)

    for var in cat_variables:
        all_var_values[var].update(snapshot_df[var].unique())

    neighborhoods.update(snapshot_df["neighborhood"].unique())
    boroughs.update(snapshot_df["borough"].unique())
    
    

In [38]:
# Display the borough names collected from the snapshot files.
boroughs

{u'Bronx', u'Brooklyn', u'Manhattan', u'Queens', u'Staten Island'}

In [18]:
# Print the index each categorical value receives once the values are sorted;
# these indices form the "<var><idx>" keys used in the count dictionaries.
# Variables are visited in sorted order so the listing is deterministic, and
# the single-argument print form gives the same output on Python 2 and 3.
for var in sorted(all_var_values):
    for idx, item in enumerate(sorted(all_var_values[var])):
        print("%s %s" % (idx, item))

0 0
1 1
0 Entire home/apt
1 Private room
2 Shared room


In [24]:
# Count listings per categorical value and per price band for every snapshot.
# counts_by_date[date] is a flat dict with keys "<var><idx>" (idx = position of
# the value in the sorted value list), "price0".."price5", "total" and "date".

# The sorted value lists do not change between files, so build them once.
sorted_values_by_var = {var: sorted(all_var_values[var]) for var in cat_variables}

counts_by_date = {}
for fname in dfiles:
    df = pd.read_json("data/json_files_by_date/" + fname)
    price = df["price"]

    new_dict = {}
    for var in cat_variables:
        for idx, item in enumerate(sorted_values_by_var[var]):
            # int(...) keeps the counts plain Python ints so they stay JSON-serialisable.
            new_dict[var + str(idx)] = int((df[var] == item).sum())

    new_dict["total"] = len(df)
    # Half-open price bands [lo, hi). Each band uses one combined mask rather
    # than chained df[mask1][mask2], which applies a full-length mask to an
    # already-filtered frame and makes pandas reindex it with a warning.
    new_dict["price0"] = int((price < 50).sum())
    new_dict["price1"] = int(((price >= 50) & (price < 100)).sum())
    new_dict["price2"] = int(((price >= 100) & (price < 150)).sum())
    new_dict["price3"] = int(((price >= 150) & (price < 200)).sum())
    new_dict["price4"] = int(((price >= 200) & (price < 300)).sum())
    new_dict["price5"] = int((price >= 300).sum())

    # The file name without its extension is the snapshot date.
    date = fname.split(".")[0]
    new_dict["date"] = date
    counts_by_date[date] = new_dict



In [25]:
# Display the full per-date count dictionary.
counts_by_date

{'2015-01-01': {'date': '2015-01-01',
  'illegal0': 15227,
  'illegal1': 24326,
  'price0': 3160,
  'price1': 13923,
  'price2': 8609,
  'price3': 6240,
  'price4': 4745,
  'price5': 2876,
  'room_type0': 20306,
  'room_type1': 17885,
  'room_type2': 1362,
  'total': 39553},
 '2015-03-01': {'date': '2015-03-01',
  'illegal0': 8863,
  'illegal1': 18238,
  'price0': 1224,
  'price1': 8307,
  'price2': 6174,
  'price3': 5108,
  'price4': 3941,
  'price5': 2347,
  'room_type0': 15496,
  'room_type1': 10792,
  'room_type2': 813,
  'total': 27101},
 '2015-04-01': {'date': '2015-04-01',
  'illegal0': 9075,
  'illegal1': 18336,
  'price0': 1237,
  'price1': 8270,
  'price2': 6156,
  'price3': 5303,
  'price4': 4043,
  'price5': 2402,
  'room_type0': 15551,
  'room_type1': 11007,
  'room_type2': 853,
  'total': 27411},
 '2015-05-01': {'date': '2015-05-01',
  'illegal0': 9168,
  'illegal1': 18151,
  'price0': 1063,
  'price1': 8236,
  'price2': 6095,
  'price3': 5325,
  'price4': 4173,
  'price5

In [26]:
# Save the per-date counts as JSON (json is imported in the notebook's import cell).
# NOTE(review): this file goes to the working directory, while the neighborhood
# and borough count files are written under data/ -- confirm that is intended.
with open('category_counts.json', 'w') as fp:
    json.dump(counts_by_date, fp)

In [150]:
# Inspect the counts recorded for the 2015-01-01 snapshot.
# NOTE(review): the saved output of this cell shows nested per-variable dicts,
# unlike the flat "illegal0"/"price0" keys that the visible counting cell
# builds -- it looks like output from a different version of the code; re-run to confirm.
counts_by_date['2015-01-01']

{'illegal': {0: 15227, 1: 24326},
 'price': {0: 3160, 1: 13923, 2: 8609, 3: 6240, 4: 4745, 5: 2876},
 'room_type': {u'Entire home/apt': 20306,
  u'Private room': 17885,
  u'Shared room': 1362},
 'total': 39553}

In [41]:
# One empty mapping per neighborhood; entries keyed by snapshot date are added later.
all_nhoods_dict = {nhood: {} for nhood in neighborhoods}

In [45]:
# For every snapshot and every known neighborhood, count listings per
# categorical value and per price band. A neighborhood with no listings in a
# snapshot still gets an entry (all zeros) for that date.

# The sorted value lists do not change between files, so build them once.
sorted_values_by_var = {var: sorted(all_var_values[var]) for var in cat_variables}

for fname in dfiles:
    df = pd.read_json("data/json_files_by_date/" + fname)
    # The file name without its extension is the snapshot date.
    date = fname.split(".")[0]

    for nhood in neighborhoods:
        subdf = df[df["neighborhood"] == nhood]
        price = subdf["price"]

        new_dict = {}
        for var in cat_variables:
            for idx, item in enumerate(sorted_values_by_var[var]):
                # int(...) keeps the counts plain Python ints so they stay JSON-serialisable.
                new_dict[var + str(idx)] = int((subdf[var] == item).sum())

        new_dict["total"] = len(subdf)
        # Half-open price bands [lo, hi), one combined mask per band instead of
        # chained subdf[mask1][mask2] (which makes pandas reindex the second mask).
        new_dict["price0"] = int((price < 50).sum())
        new_dict["price1"] = int(((price >= 50) & (price < 100)).sum())
        new_dict["price2"] = int(((price >= 100) & (price < 150)).sum())
        new_dict["price3"] = int(((price >= 150) & (price < 200)).sum())
        new_dict["price4"] = int(((price >= 200) & (price < 300)).sum())
        new_dict["price5"] = int((price >= 300).sum())

        new_dict["date"] = date
        all_nhoods_dict[nhood][date] = new_dict
    # Progress marker: one line per processed file (same output on Python 2 and 3).
    print(fname)



2015-01-01.json
2015-03-01.json
2015-04-01.json
2015-05-01.json
2015-06-01.json
2015-08-01.json
2015-09-01.json
2015-10-01.json
2015-11-01.json
2015-11-20.json
2015-12-02.json
2016-01-01.json
2016-02-02.json
2016-04-03.json
2016-05-02.json
2016-06-02.json
2016-07-02.json
2016-10-01.json


In [48]:
# Save the per-neighborhood, per-date counts as JSON (json is imported in the
# notebook's import cell).
with open('data/neighborhood_category_counts.json', 'w') as fp:
    json.dump(all_nhoods_dict, fp)

In [49]:
# For every snapshot and every borough, count listings per categorical value
# and per price band. Result: all_boroughs_dict[borough][date] -> flat count dict.
all_boroughs_dict = {borough: {} for borough in boroughs}

# The sorted value lists do not change between files, so build them once.
sorted_values_by_var = {var: sorted(all_var_values[var]) for var in cat_variables}

for fname in dfiles:
    df = pd.read_json("data/json_files_by_date/" + fname)
    # The file name without its extension is the snapshot date.
    date = fname.split(".")[0]

    for borough in boroughs:
        subdf = df[df["borough"] == borough]
        price = subdf["price"]

        new_dict = {}
        for var in cat_variables:
            for idx, item in enumerate(sorted_values_by_var[var]):
                # int(...) keeps the counts plain Python ints so they stay JSON-serialisable.
                new_dict[var + str(idx)] = int((subdf[var] == item).sum())

        new_dict["total"] = len(subdf)
        # Half-open price bands [lo, hi), one combined mask per band instead of
        # chained subdf[mask1][mask2] (which makes pandas reindex the second mask).
        new_dict["price0"] = int((price < 50).sum())
        new_dict["price1"] = int(((price >= 50) & (price < 100)).sum())
        new_dict["price2"] = int(((price >= 100) & (price < 150)).sum())
        new_dict["price3"] = int(((price >= 150) & (price < 200)).sum())
        new_dict["price4"] = int(((price >= 200) & (price < 300)).sum())
        new_dict["price5"] = int((price >= 300).sum())

        new_dict["date"] = date
        all_boroughs_dict[borough][date] = new_dict
    # Progress marker: one line per processed file (same output on Python 2 and 3).
    print(fname)



2015-01-01.json
2015-03-01.json
2015-04-01.json
2015-05-01.json
2015-06-01.json
2015-08-01.json
2015-09-01.json
2015-10-01.json
2015-11-01.json
2015-11-20.json
2015-12-02.json
2016-01-01.json
2016-02-02.json
2016-04-03.json
2016-05-02.json
2016-06-02.json
2016-07-02.json
2016-10-01.json


In [51]:
# Save the per-borough, per-date counts as JSON (json is imported in the
# notebook's import cell).
with open('data/borough_category_counts.json', 'w') as fp:
    json.dump(all_boroughs_dict, fp)