In [1]:
from pymongo import MongoClient
import pandas as pd
import pymongo
import folium
from folium import plugins
from folium.plugins import MiniMap
from folium.plugins import FastMarkerCluster
import numpy as np
import re
import requests as req
import json
from dotenv import load_dotenv
import os

In [2]:
#Connecting the database with the queried data (geo_offices)

client = MongoClient('mongodb://localhost:27017/')
db = client.DBcompanies_cb
data = db.companies_clean.find()

In [3]:
#Creating the dataframe

df_comp = pd.DataFrame(data)
df_comp.head()
len(df_comp)

1726

In [4]:
print(df_comp.shape)
print(df_comp.index)

(1726, 10)
RangeIndex(start=0, stop=1726, step=1)


In [5]:
#Ordering the columns and dropping column id.

data_comp = df_comp[['name', 'lat', 'lng', 'geopoint', 'number_of_employees','amount_raised_k$','category_code', 'wealth', 'news_agencies']]
data_comp.head()


Unnamed: 0,name,lat,lng,geopoint,number_of_employees,amount_raised_k$,category_code,wealth,news_agencies
0,Geni,34.090368,-118.393064,"{'type': 'Point', 'coordinates': [-118.393064,...",18,16500.0,web,0.296509,0
1,Scribd,37.789634,-122.404052,"{'type': 'Point', 'coordinates': [-122.404052,...",50,25800.0,news,0.861548,1
2,Wetpaint,40.723731,-73.996431,"{'type': 'Point', 'coordinates': [-73.9964312,...",47,39800.0,web,0.844415,0
3,Wetpaint,47.603122,-122.333253,"{'type': 'Point', 'coordinates': [-122.333253,...",47,39800.0,web,0.844415,0
4,MeetMoi,40.757929,-73.985506,"{'type': 'Point', 'coordinates': [-73.985506, ...",15,5580.0,social,0.219505,0


In [6]:
#Querying near companies

def nearComps(df, rad_max_meters=1000):
    lst=[]
    for i in range(len(df)): 
        near=db.companies_clean.find({'$and':[{
                        "geopoint": {
                            "$near": {
                            "$geometry": df[i],
                            "$maxDistance": rad_max_meters
                            }
                        }
                    }]})
        data=pd.DataFrame(near)
        lst.append(data.shape[0])
    return lst

data_comp['offices_near']=nearComps(data_comp["geopoint"])

In [7]:
#Querying near news_agencies

def nearNews(df, rad_max_meters=1000):
    lst=[]
    for i in range(len(df)): 
        near=db.companies_clean.find({'$and':[{
                        "geopoint": {
                            "$near": {
                            "$geometry": df[i],
                            "$maxDistance": rad_max_meters
                            }
                        }},{
                            'news_agencies': 1
                        }]})
                    
        data=pd.DataFrame(near)
        lst.append(data.shape[0])
    return lst

data_comp['news_agencies']=nearNews(data_comp["geopoint"])


In [8]:
#Normalizing near offices and near news_agencies (I weighted it to a 10% value)

def normalizator (df):
    return df/df.max()

data_comp['offices_near'] = normalizator(data_comp['offices_near'])
data_comp['news_agencies'] = normalizator(data_comp['news_agencies'])*(-0.1)

In [9]:
#Summing up wealth, news_agencies and offices_near to obtain a final score.

def final_score(df, col1, col2, col3):
    return df[[col1, col2, col3]].sum(axis = 1)

data_comp['final_score'] = final_score(data_comp, 'wealth', 'news_agencies', 'offices_near')

In [10]:
#Select top 10 to test the api query. Then if it is not very expensive, we will try with the whole dataset.

top500 = data_comp.sort_values('final_score', ascending = False).reset_index().head(500)


In [11]:
top500 = top500[[x for x in top500.columns if x != 'index']]
top500.head()


Unnamed: 0,name,lat,lng,geopoint,number_of_employees,amount_raised_k$,category_code,wealth,news_agencies,offices_near,final_score
0,Geni,34.090368,-118.393064,"{'type': 'Point', 'coordinates': [-118.393064,...",18,16500.0,web,0.296509,-0.0,0.019608,0.316116
1,Scribd,37.789634,-122.404052,"{'type': 'Point', 'coordinates': [-122.404052,...",50,25800.0,news,0.861548,-0.05,0.72549,1.537038
2,Wetpaint,40.723731,-73.996431,"{'type': 'Point', 'coordinates': [-73.9964312,...",47,39800.0,web,0.844415,-0.05,0.254902,1.049317
3,Wetpaint,47.603122,-122.333253,"{'type': 'Point', 'coordinates': [-122.333253,...",47,39800.0,web,0.844415,-0.0,0.117647,0.962062
4,MeetMoi,40.757929,-73.985506,"{'type': 'Point', 'coordinates': [-73.985506, ...",15,5580.0,social,0.219505,-0.0,0.529412,0.748917


In [12]:
#Cluster map
latlng = top500[['lat', 'lng']]
m2 = folium.Map(zoom_start=15)
FastMarkerCluster(latlng).add_to(m2)
m2

In [14]:
# top500 = top1000.head(500)

In [15]:
# load_dotenv()

# if not 'KEY' in os.environ:
#     raise ValueError('This function requires a Google KEY in orer to work.')

# API_KEY = os.environ["KEY"]
# BASE_URL = 'https://maps.googleapis.com/maps/api/place/nearbysearch'

# def near_API(BASE_URL, df, obj, keyword, rad):
#     lst=[] 
#     for i in range(len(df)):
#         lat=df['lat'][i]
#         lng=df['lng'][i]
#         res = req.get('{}/json?location={},{}&radius={}&type={}&keyword={}&key={}'.format(BASE_URL,lat,lng,rad,obj,keyword,API_KEY))
# #         https://maps.googleapis.com/maps/api/place/nearbysearch/json?location=-33.8670522,151.1957362&radius=1500&type=restaurant&keyword=cruise&key=YOUR_API_KEY
#         near=res.json()
# #         print(near)
#         if near['status']=='OK': 
#             lst.append(len(near['results']))
#         else: 
#             lst.append(0)
#     return lst

In [16]:
top500['bus']=near_API(BASE_URL, top500,'bus_station','bus_station', 500)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
top500['metro']=near_API(BASE_URL, top500,'subway_station','subway_station', 1000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
#Function to query Google's API and find info about other bars, bus and metro stations. 

load_dotenv()

if not 'KEY' in os.environ:
    raise ValueError('This function requires a Google KEY in orer to work.')

API_KEY = os.environ["KEY"]
BASE_URL = 'https://maps.googleapis.com/maps/api/place/nearbysearch'

def near_API(BASE_URL, df, obj, rad):
    lst=[] 
    for i in range(len(df)):
        lat=df['lat'][i]
        lng=df['lng'][i]
        res = req.get('{}/json?location={},{}&radius={}&type={}&key={}'.format(BASE_URL,lat,lng,rad,obj,API_KEY))
#         https://maps.googleapis.com/maps/api/place/nearbysearch/json?location=-33.8670522,151.1957362&radius=1500&type=restaurant&keyword=cruise&key=YOUR_API_KEY
        near=res.json()
#         print(near)
        if near['status']=='OK': 
            lst.append(len(near['results']))
        else: 
            lst.append(0)
    return lst


In [20]:
top500['bar']=near_API(BASE_URL, top500,'bar', 1000)
top500['bus']=near_API(BASE_URL, top500,'bus_station', 500)
top500['subway_station']=near_API(BASE_URL, top500,'subway_station', 1000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [23]:
top500

Unnamed: 0,name,lat,lng,geopoint,number_of_employees,amount_raised_k$,category_code,wealth,news_agencies,offices_near,final_score,bar,bus,subway_station
0,Wishabi,40.744549,-73.988071,"{'type': 'Point', 'coordinates': [-73.988071, ...",50,16000.00,ecommerce,0.821025,-0.00,0.764706,1.585731,20,13,13
1,Scribd,37.789634,-122.404052,"{'type': 'Point', 'coordinates': [-122.404052,...",50,25800.00,news,0.861548,-0.05,0.725490,1.537038,20,17,3
2,Echo,37.786942,-122.401245,"{'type': 'Point', 'coordinates': [-122.401245,...",50,4800.00,enterprise,0.718912,-0.05,0.843137,1.512049,20,20,3
3,ReachForce,40.752539,-73.987871,"{'type': 'Point', 'coordinates': [-73.987871, ...",40,14500.00,enterprise,0.650141,-0.00,0.823529,1.473670,20,7,16
4,Pivot,37.787646,-122.402759,"{'type': 'Point', 'coordinates': [-122.402759,...",40,13100.00,software,0.643252,-0.05,0.803922,1.397173,20,20,3
5,CrowdFlower,37.795141,-122.401194,"{'type': 'Point', 'coordinates': [-122.401194,...",49,13200.00,enterprise,0.788615,-0.05,0.647059,1.385674,20,16,2
6,SpaBooker,37.778991,-122.401803,"{'type': 'Point', 'coordinates': [-122.401803,...",42,14700.00,software,0.683624,-0.00,0.666667,1.350291,20,5,1
7,Worklight,37.786906,-122.397672,"{'type': 'Point', 'coordinates': [-122.397672,...",30,17600.00,mobile,0.497465,-0.05,0.901961,1.349426,20,20,3
8,Replay Solutions,40.752143,-73.990675,"{'type': 'Point', 'coordinates': [-73.990675, ...",30,15200.00,software,0.490005,-0.00,0.745098,1.235103,20,7,16
9,Lat49,37.787076,-122.399412,"{'type': 'Point', 'coordinates': [-122.399412,...",24,3800.00,advertising,0.335567,-0.05,0.921569,1.207136,20,20,3


In [None]:
#Saving the dataframe into a csv to avoid using the API again
top50.to_csv('../data/top50.csv',index=False)