In [3]:
library(ggplot2)
library(ggmap)
library(rjson)
library(class)


In [5]:
# Parse the training listings; rjson::fromJSON returns a named list with one
# entry per JSON field.
train <- fromJSON(file = "data/train.json")

# Bind the fields side by side so every JSON field becomes one column of `df`.
df <- data.frame(do.call(cbind, train))

In [13]:
# Flatten the id and coordinate columns of `df` into atomic vectors and
# collect them in a plain data frame (columns: listing_id, lat, long, ID).
df_coords <- data.frame(
    listing_id = unlist(df$listing_id),
    lat = unlist(df$latitude),
    long = unlist(df$longitude))

# Running row index. Derived from the data rather than a hard-coded row count
# so the cell keeps working if the input file changes size.
df_coords$ID <- seq_len(nrow(df_coords))

In [7]:
# Reference place names, grouped into one vector per area. getCoords() appends
# ", NJ" to a vector whose first element is "Newark" and ", NY" to all others
# before geocoding.
#
# NOTE(review): "Murray Hill" is listed in both m_neighborhoods and
# q_neighborhoods; both entries become the identical query "Murray Hill, NY".

# Presumably Manhattan.
m_neighborhoods <- c(
  "Chelsea", "Washington Heights", "Harlem",
  "East Harlem", "Upper West Side", "Upper East Side",
  "Midtown West", "Midtown East", "Greenwich Village",
  "Lower East Side", "Murray Hill", "Stuyvesant Town",
  "Upper Manhattan", "Hell's Kitchen", "East Village",
  "SoHo", "Financial District", "Gramercy",
  "Garment District", "Morningside Heights", "Tribeca",
  "Chinatown", "Times Square"
)

# Presumably Brooklyn.
b_neighborhoods <- c(
  "Bay Ridge", "Sunset Park", "Bensonhurst",
  "Sheepshead Bay", "Borough Park", "Midwood",
  "Flatbush", "East Flatbush", "Park Slope",
  "East New York", "Bedford-Stuyvesant", "Williamsburg",
  "Greenpoint", "Red Hook", "Downtown Brooklyn",
  "DUMBO", "Brownsville", "Prospect Park",
  "Fort Hamilton", "Cypress Hills", "Bushwick",
  "Canarsie", "Brooklyn Heights", "Cobble Hill"
)

# Presumably Queens.
q_neighborhoods <- c(
  "Astoria", "Long Island City", "Steinway",
  "Ridgewood", "Woodside", "Elmhurst",
  "Jackson Heights", "Corona", "Murray Hill",
  "Flushing", "Kew Gardens", "Fresh Meadows",
  "Jamaica", "Bayside", "Whitestone"
)

# Presumably Staten Island.
s_neighborhoods <- c("West New Brighton", "Mariners Harbor")

# Presumably the Bronx.
bx_neighborhoods <- c("West Bronx", "Yankee Stadium")

# New Jersey; getCoords() keys its ", NJ" suffix on the first element "Newark".
nj_neighborhoods <- c("Newark")


# Geocode a vector of place names.
#
# Args:
#   neighborhoods: character vector of place names without a state suffix.
#     If the first element is exactly "Newark", ", NJ" is appended to every
#     name; otherwise ", NY" is appended.
#
# Returns:
#   A data frame with one row per input name and columns `n` (the suffixed
#   query string), `lat` and `lon`. Rows whose lookup returned no coordinates
#   carry NA in `lat`/`lon`, and a warning lists the affected queries.
getCoords <- function(neighborhoods){
  num_n <- length(neighborhoods)

  # Nothing to look up: return an empty frame with the usual columns instead
  # of failing on neighborhoods[1] or looping over 1:0.
  if (num_n == 0) {
    return(data.frame(n = character(0), lat = numeric(0), lon = numeric(0)))
  }

  # identical() yields a plain TRUE/FALSE even when the first element is NA,
  # where `==` would make the `if` condition fail.
  if (identical(as.character(neighborhoods[1]), "Newark")) {
    neighborhoods <- paste0(neighborhoods, ", NJ")
  } else {
    neighborhoods <- paste0(neighborhoods, ", NY")
  }

  lat <- rep(NA_real_, num_n)
  lon <- rep(NA_real_, num_n)

  for (i in seq_len(num_n)) {
    reply <- suppressMessages(geocode(neighborhoods[i]))
    # Keep only the first match so a multi-row reply cannot break the
    # element-wise assignment.
    lat[i] <- reply$lat[1]
    lon[i] <- reply$lon[1]
  }

  # Surface failed lookups; downstream knn() rejects missing coordinates.
  failed <- is.na(lat) | is.na(lon)
  if (any(failed)) {
    warning("geocode() returned no coordinates for: ",
            paste(neighborhoods[failed], collapse = "; "),
            call. = FALSE)
  }

  data.frame(n = neighborhoods, lat = lat, lon = lon)
}

In [8]:
# Geocode every reference neighborhood and stack the per-area results into a
# single lookup table with columns n, lat, lon.
X <- do.call("rbind", lapply(
  list(m_neighborhoods, b_neighborhoods, q_neighborhoods,
       s_neighborhoods, bx_neighborhoods, nj_neighborhoods),
  getCoords))

# Label each listing with its nearest reference neighborhood (1-NN).
# The query columns are selected by name: the first two columns of df_coords
# are listing_id and lat, so positional c(1, 2) would compare
# (listing_id, lat) against (lat, lon).
neighborhoods <- knn(X[, c("lat", "lon")], df_coords[, c("lat", "long")],
                     X$n, k = 1)

In [39]:
# Persist the geocoded reference table (name, lat, lon).
write.csv(x = X, file = "dfs/neighborhoods.csv")

In [14]:
# Attach the predicted neighborhood label to each listing row.
df_coords["neighborhoods"] <- neighborhoods

In [17]:
# Save the labelled training coordinates.
write.csv(x = df_coords, file = "dfs/train_neighborhoods.csv")

In [20]:
# Parse the test listings; rjson::fromJSON returns a named list with one
# entry per JSON field.
test <- fromJSON(file = "data/test.json")

# Bind the fields side by side so every JSON field becomes one column of `df`
# (this overwrites the `df` built from the training file).
df <- data.frame(do.call(cbind, test))

In [29]:
# Flatten the id and coordinate columns of `df` into atomic vectors and
# collect them in a plain data frame (columns: listing_id, lat, long, ID).
df_coords <- data.frame(
    listing_id = unlist(df$listing_id),
    lat = unlist(df$latitude),
    long = unlist(df$longitude))

# Running row index. Derived from the data rather than a hard-coded row count
# so the cell keeps working if the input file changes size.
df_coords$ID <- seq_len(nrow(df_coords))

In [28]:
# Sanity check: number of rows in the coordinate table.
dim(df_coords)[1L]

In [34]:
# Geocode every reference neighborhood and stack the per-area results into a
# single lookup table with columns n, lat, lon.
X <- do.call("rbind", lapply(
  list(m_neighborhoods, b_neighborhoods, q_neighborhoods,
       s_neighborhoods, bx_neighborhoods, nj_neighborhoods),
  getCoords))

# Label each listing with its nearest reference neighborhood (1-NN).
# The query columns are selected by name: the first two columns of df_coords
# are listing_id and lat, so positional c(1, 2) would compare
# (listing_id, lat) against (lat, lon).
neighborhoods <- knn(X[, c("lat", "lon")], df_coords[, c("lat", "long")],
                     X$n, k = 1)

In [35]:
# Peek at the first few predicted labels.
head(neighborhoods, n = 6L)

In [36]:
# Attach the predicted neighborhood label to each listing row.
df_coords["neighborhoods"] <- neighborhoods

In [37]:
# Save the labelled test coordinates.
write.csv(x = df_coords, file = "dfs/test_neighborhoods.csv")