In [90]:
import json
import os
import re
import warnings

import folium
import numpy as np
import pandas as pd
import pymongo
import requests as req
from dotenv import load_dotenv
from folium import plugins
from folium.plugins import FastMarkerCluster
from folium.plugins import MiniMap
from pymongo import MongoClient

In [91]:
#Connecting the database with the queried data (geo_offices)

def geomongo_connect(host):
    """Load every document of the ``companies_clean`` collection into a DataFrame.

    Parameters
    ----------
    host : str
        MongoDB connection URI, passed unchanged to ``MongoClient``.

    Returns
    -------
    pandas.DataFrame
        One row per document found in ``DBcompanies_cb.companies_clean``.
    """
    # The context manager closes the client once the data has been read.
    # The cursor is turned into a list first because it cannot be consumed
    # after the connection is gone.
    with MongoClient(host) as client:
        documents = list(client.DBcompanies_cb.companies_clean.find())
    return pd.DataFrame(documents)



In [92]:
# Load the cleaned companies collection from the local MongoDB instance.
df_comp = geomongo_connect(host='mongodb://localhost:27017/')

In [93]:
# First rows, then the frame's dimensions and index on separate lines.
display(df_comp.head())
print(df_comp.shape, df_comp.index, sep='\n')

Unnamed: 0,_id,amount_raised_k$,category_code,geopoint,lat,lng,name,news_agencies,number_of_employees,wealth
0,5d62ddee087b7af24faa3648,16500.0,web,"{'type': 'Point', 'coordinates': [-118.393064,...",34.090368,-118.393064,Geni,0,18,0.296509
1,5d62ddee087b7af24faa3649,25800.0,news,"{'type': 'Point', 'coordinates': [-122.404052,...",37.789634,-122.404052,Scribd,1,50,0.861548
2,5d62ddee087b7af24faa364a,39800.0,web,"{'type': 'Point', 'coordinates': [-73.9964312,...",40.723731,-73.996431,Wetpaint,0,47,0.844415
3,5d62ddee087b7af24faa364b,39800.0,web,"{'type': 'Point', 'coordinates': [-122.333253,...",47.603122,-122.333253,Wetpaint,0,47,0.844415
4,5d62ddee087b7af24faa364c,5580.0,social,"{'type': 'Point', 'coordinates': [-73.985506, ...",40.757929,-73.985506,MeetMoi,0,15,0.219505


(1726, 10)
RangeIndex(start=0, stop=1726, step=1)


In [94]:
def reorder(df):
    """Return ``df`` restricted to the analysis columns, in a fixed order.

    Any column not listed below (for example ``_id``) is left out of the result.
    """
    ordered_columns = [
        'name',
        'lat',
        'lng',
        'geopoint',
        'number_of_employees',
        'amount_raised_k$',
        'category_code',
        'wealth',
        'news_agencies',
    ]
    return df[ordered_columns]


In [95]:
# Keep only the analysis columns, in their fixed order.
df_comp = reorder(df=df_comp)

In [96]:
#Columns are now ordered and the _id column has been dropped.
df_comp.head(n=5)


Unnamed: 0,name,lat,lng,geopoint,number_of_employees,amount_raised_k$,category_code,wealth,news_agencies
0,Geni,34.090368,-118.393064,"{'type': 'Point', 'coordinates': [-118.393064,...",18,16500.0,web,0.296509,0
1,Scribd,37.789634,-122.404052,"{'type': 'Point', 'coordinates': [-122.404052,...",50,25800.0,news,0.861548,1
2,Wetpaint,40.723731,-73.996431,"{'type': 'Point', 'coordinates': [-73.9964312,...",47,39800.0,web,0.844415,0
3,Wetpaint,47.603122,-122.333253,"{'type': 'Point', 'coordinates': [-122.333253,...",47,39800.0,web,0.844415,0
4,MeetMoi,40.757929,-73.985506,"{'type': 'Point', 'coordinates': [-73.985506, ...",15,5580.0,social,0.219505,0


In [97]:
#Querying near companies
def nearComps(host, df, rad_max_meters=1000):
    """Count, for each GeoJSON point, the stored companies within a radius.

    Parameters
    ----------
    host : str
        MongoDB connection URI.
    df : iterable of dict
        GeoJSON geometries used as the ``$geometry`` of each ``$near`` query
        (the notebook passes the ``geopoint`` column). Values are read in
        iteration order, so the index labels of a Series do not matter.
    rad_max_meters : int or float, default 1000
        Value sent as ``$maxDistance``.

    Returns
    -------
    list of int
        Number of matching documents per input point, in input order.
    """
    # NOTE(review): a point that is itself stored in the collection presumably
    # matches its own query - confirm whether it should be excluded.
    counts = []
    with MongoClient(host) as client:
        collection = client.DBcompanies_cb.companies_clean
        for point in df:
            nearby = collection.find({
                'geopoint': {
                    '$near': {
                        '$geometry': point,
                        '$maxDistance': rad_max_meters,
                    }
                }
            })
            # Only the number of hits is needed, so the cursor is counted
            # directly instead of building a DataFrame per query.
            counts.append(sum(1 for _ in nearby))
    return counts

# Number of stored offices within the default radius of each office.
df_comp['offices_near'] = nearComps(
    'mongodb://localhost:27017/',
    df_comp['geopoint'],
)

In [98]:
#Querying near news_agencies

def nearNews(host, df, rad_max_meters=1000):
    """Count, for each GeoJSON point, the nearby documents flagged ``news_agencies: 1``.

    Parameters
    ----------
    host : str
        MongoDB connection URI.
    df : iterable of dict
        GeoJSON geometries used as the ``$geometry`` of each ``$near`` query
        (the notebook passes the ``geopoint`` column). Values are read in
        iteration order, so the index labels of a Series do not matter.
    rad_max_meters : int or float, default 1000
        Value sent as ``$maxDistance``.

    Returns
    -------
    list of int
        Number of matching documents per input point, in input order.
    """
    counts = []
    with MongoClient(host) as client:
        collection = client.DBcompanies_cb.companies_clean
        for point in df:
            # Both conditions must hold; listing them in one filter document
            # is an implicit AND.
            nearby = collection.find({
                'geopoint': {
                    '$near': {
                        '$geometry': point,
                        '$maxDistance': rad_max_meters,
                    }
                },
                'news_agencies': 1,
            })
            # Only the number of hits is needed, so the cursor is counted
            # directly instead of building a DataFrame per query.
            counts.append(sum(1 for _ in nearby))
    return counts

# NOTE: this replaces the 'news_agencies' column loaded from the database
# with the per-office count of nearby news agencies.
df_comp['news_agencies'] = nearNews(
    'mongodb://localhost:27017/',
    df_comp["geopoint"],
)


In [99]:
#Normalizing near offices and near news_agencies by their maximum; news_agencies is then weighted by -0.1 so that nearby news agencies lower the score.

def normalizator (df):
    """Divide ``df`` by its own maximum and return the result."""
    peak = df.max()
    scaled = df / peak
    return scaled

# Each column is divided by its maximum; news_agencies then gets weight -0.1.
# NOTE(review): both columns are overwritten in place, so re-running this cell
# normalizes already-weighted values - confirm it is only run once.
df_comp['offices_near'] = normalizator(df_comp['offices_near'])
df_comp['news_agencies'] = normalizator(df_comp['news_agencies']) * -0.1

In [100]:
# Preview of the first 20 rows with the new columns.
df_comp.head(n=20)

Unnamed: 0,name,lat,lng,geopoint,number_of_employees,amount_raised_k$,category_code,wealth,news_agencies,offices_near
0,Geni,34.090368,-118.393064,"{'type': 'Point', 'coordinates': [-118.393064,...",18,16500.0,web,0.296509,-0.0,0.019608
1,Scribd,37.789634,-122.404052,"{'type': 'Point', 'coordinates': [-122.404052,...",50,25800.0,news,0.861548,-0.05,0.72549
2,Wetpaint,40.723731,-73.996431,"{'type': 'Point', 'coordinates': [-73.9964312,...",47,39800.0,web,0.844415,-0.05,0.254902
3,Wetpaint,47.603122,-122.333253,"{'type': 'Point', 'coordinates': [-122.333253,...",47,39800.0,web,0.844415,-0.0,0.117647
4,MeetMoi,40.757929,-73.985506,"{'type': 'Point', 'coordinates': [-73.985506, ...",15,5580.0,social,0.219505,-0.0,0.529412
5,Plaxo,37.387845,-122.055197,"{'type': 'Point', 'coordinates': [-122.055197,...",50,28300.0,web,0.869392,-0.0,0.039216
6,Technorati,37.779558,-122.393041,"{'type': 'Point', 'coordinates': [-122.393041,...",35,32100.0,advertising,0.616055,-0.0,0.490196
7,Mahalo,34.017606,-118.487267,"{'type': 'Point', 'coordinates': [-118.487267,...",40,21000.0,web,0.675271,-0.0,0.137255
8,Kyte,37.788482,-122.409173,"{'type': 'Point', 'coordinates': [-122.409173,...",40,23400.0,games_video,0.682613,-0.05,0.54902
9,Jingle Networks,37.480999,-122.173887,"{'type': 'Point', 'coordinates': [-122.173887,...",35,88700.0,mobile,0.676398,-0.0,0.019608


In [101]:
#Summing up wealth, news_agencies and offices_near to obtain a final score.

def final_score(df, col1, col2, col3):
    """Return the row-wise sum of the three named columns of ``df``."""
    score_columns = [col1, col2, col3]
    return df[score_columns].sum(axis='columns')

# Company score = wealth + weighted news_agencies + normalized offices_near.
df_comp['final_score'] = final_score(
    df_comp,
    col1='wealth',
    col2='news_agencies',
    col3='offices_near',
)

In [102]:
# Number of rows available before selecting the top ones.
len(df_comp.index)

1726

In [103]:
#Select top 500 to test the api query. Then if it is not very expensive, we will try with the whole dataset.
def top500(df, n=500):
    """Return the ``n`` rows of ``df`` with the highest ``final_score``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``final_score`` column.
    n : int, default 500
        Number of rows to keep.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``final_score`` in descending order. The previous index
        is kept as an ``index`` column because of ``reset_index()``.
    """
    # Sort the frame that was passed in, not the notebook-level df_comp.
    return df.sort_values('final_score', ascending=False).reset_index().head(n)



# NOTE(review): this rebinds the name `top500` from the function to the
# DataFrame it returns, so this line cannot be run a second time unless the
# function definition is executed again first.
top500 = top500(df_comp)


In [104]:
# The 'index' column holds each company's previous row label from df_comp.
top500.head(n=5)

Unnamed: 0,index,name,lat,lng,geopoint,number_of_employees,amount_raised_k$,category_code,wealth,news_agencies,offices_near,final_score
0,881,Wishabi,40.744549,-73.988071,"{'type': 'Point', 'coordinates': [-73.988071, ...",50,16000.0,ecommerce,0.821025,-0.0,0.764706,1.585731
1,1,Scribd,37.789634,-122.404052,"{'type': 'Point', 'coordinates': [-122.404052,...",50,25800.0,news,0.861548,-0.05,0.72549,1.537038
2,50,Echo,37.786942,-122.401245,"{'type': 'Point', 'coordinates': [-122.401245,...",50,4800.0,enterprise,0.718912,-0.05,0.843137,1.512049
3,1119,ReachForce,40.752539,-73.987871,"{'type': 'Point', 'coordinates': [-73.987871, ...",40,14500.0,enterprise,0.650141,-0.0,0.823529,1.47367
4,448,Pivot,37.787646,-122.402759,"{'type': 'Point', 'coordinates': [-122.402759,...",40,13100.0,software,0.643252,-0.05,0.803922,1.397173


In [45]:
#Cluster map
# Clustered markers for the top 500 offices.
m2 = folium.Map(zoom_start=15)
latlng = top500.loc[:, ['lat', 'lng']]
FastMarkerCluster(data=latlng).add_to(m2)
m2

In [51]:
#Function to query Google's API and find info about other bars, bus and metro stations. 

# Read variables from a local .env file into the process environment.
load_dotenv()

# Fail early when the Google API key is missing.
if 'KEY' not in os.environ:
    raise ValueError('This function requires a Google KEY in orer to work.')

BASE_URL = 'https://maps.googleapis.com/maps/api/place/nearbysearch'
API_KEY = os.environ["KEY"]

def near_API(BASE_URL, df, obj, rad):
    """Count nearby places of one type around every row of ``df``.

    Parameters
    ----------
    BASE_URL : str
        Endpoint prefix; ``/json`` is appended to it for each request.
    df : pandas.DataFrame
        Frame with ``lat`` and ``lng`` columns. Rows are read by position, so
        any index is accepted.
    obj : str
        Value sent as the ``type`` query parameter (e.g. ``'bar'``).
    rad : int or float
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    list of int
        For each row, the length of the response's ``results`` list when the
        status is ``'OK'``, otherwise 0.

    Notes
    -----
    Uses the module-level ``API_KEY``. Statuses other than ``'OK'`` and
    ``'ZERO_RESULTS'`` still yield 0 but emit a warning, so that an API failure
    is not silently read as "nothing nearby".
    """
    # NOTE(review): only the first page of results is counted; the saved output
    # shows `bar` equal to 20 on every row, which suggests the count saturates
    # at the API page size - confirm.
    counts = []
    for lat, lng in zip(df['lat'], df['lng']):
        res = req.get(
            '{}/json'.format(BASE_URL),
            # Let requests build and encode the query string.
            params={
                'location': '{},{}'.format(lat, lng),
                'radius': rad,
                'type': obj,
                'key': API_KEY,
            },
            timeout=10,  # do not hang the notebook on a stalled connection
        )
        near = res.json()
        status = near.get('status')
        if status == 'OK':
            counts.append(len(near['results']))
            continue
        if status != 'ZERO_RESULTS':
            warnings.warn(
                'Places API returned status {!r} for location {},{}; counting 0.'.format(
                    status, lat, lng
                )
            )
        counts.append(0)
    return counts


In [52]:
# One API pass per place type; each pass sends one request per row of top500.
top500['bar'] = near_API(BASE_URL, top500, obj='bar', rad=1000)
top500['bus'] = near_API(BASE_URL, top500, obj='bus_station', rad=500)
top500['subway_station'] = near_API(BASE_URL, top500, obj='subway_station', rad=1000)

In [53]:
#Saving the dataframe into a csv to avoid using the API again
def csv_creator(df, name, data_dir='../data'):
    """Write ``df`` to ``<data_dir>/<name>.csv`` without the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save.
    name : str
        File name without the ``.csv`` extension.
    data_dir : str, default '../data'
        Target directory; it is created when it does not exist yet.

    Returns
    -------
    None
        The value returned by ``DataFrame.to_csv`` when a path is given.
    """
    os.makedirs(data_dir, exist_ok=True)
    return df.to_csv(f'{data_dir}/{name}.csv', index=False)

In [54]:
# Persist the API results so the requests do not have to be repeated.
csv_creator(df=top500, name='top500')

In [55]:
#Importing the csv

def csv_reader(path):
    """Read the CSV at ``path`` with pandas' default settings and return it."""
    loaded = pd.read_csv(path)
    return loaded

In [56]:
# Reload the saved top-500 table.
df = csv_reader(path='../data/top500.csv')

In [57]:
# Show a bounded preview instead of dumping the whole frame.
df.head(10)

Unnamed: 0,index,name,lat,lng,geopoint,number_of_employees,amount_raised_k$,category_code,wealth,news_agencies,offices_near,final_score,bar,bus,subway_station
0,881,Wishabi,40.744549,-73.988071,"{'type': 'Point', 'coordinates': [-73.988071, ...",50,16000.00,ecommerce,0.821025,-0.0,0.764706,1.585731,20,13,13
1,1119,ReachForce,40.752539,-73.987871,"{'type': 'Point', 'coordinates': [-73.987871, ...",40,14500.00,enterprise,0.650141,-0.0,0.823529,1.473670,20,7,16
2,1428,SpaBooker,37.778991,-122.401803,"{'type': 'Point', 'coordinates': [-122.401803,...",42,14700.00,software,0.683624,-0.0,0.666667,1.350291,20,5,1
3,209,Replay Solutions,40.752143,-73.990675,"{'type': 'Point', 'coordinates': [-73.990675, ...",30,15200.00,software,0.490005,-0.0,0.745098,1.235103,20,7,16
4,1327,BASH Gaming,40.748280,-73.989276,"{'type': 'Point', 'coordinates': [-73.989276, ...",25,1000.00,games_video,0.292936,-0.0,0.901961,1.194897,20,8,18
5,1,Scribd,37.789634,-122.404052,"{'type': 'Point', 'coordinates': [-122.404052,...",50,25800.00,news,0.861548,-0.4,0.725490,1.187038,20,17,3
6,50,Echo,37.786942,-122.401245,"{'type': 'Point', 'coordinates': [-122.401245,...",50,4800.00,enterprise,0.718912,-0.4,0.843137,1.162049,20,20,3
7,6,Technorati,37.779558,-122.393041,"{'type': 'Point', 'coordinates': [-122.393041,...",35,32100.00,advertising,0.616055,-0.0,0.490196,1.106251,20,4,0
8,1278,Questra,37.798318,-122.400003,"{'type': 'Point', 'coordinates': [-122.4000032...",45,12500.00,software,0.720079,-0.0,0.372549,1.092628,20,5,2
9,420,Crowd Science,40.745064,-73.992637,"{'type': 'Point', 'coordinates': [-73.992637, ...",25,2000.00,web,0.322330,-0.0,0.764706,1.087036,20,10,16


In [105]:
#Normalizing competition, bus and subway.

# Each count is divided by its column maximum, then weighted:
# bars -0.2, bus stations 0.5, subway stations 0.5.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# normalizes already-weighted values - confirm it is only run once.
df['bar'] = normalizator(df['bar']) * -0.2
df['bus'] = normalizator(df['bus']) * 0.5
df['subway_station'] = normalizator(df['subway_station']) * 0.5


In [106]:
#Affecting very negatively if there is no transport availability.

def no_transport(df, penalty=-0.5, threshold=0.000000000000001):
    """Replace every value below ``threshold`` with ``penalty``.

    Parameters
    ----------
    df : iterable of numbers
        Values to inspect (the notebook passes a weighted transport column).
    penalty : float, default -0.5
        Replacement for values below ``threshold``.
    threshold : float, default 1e-15
        Values strictly below this are replaced.

    Returns
    -------
    list
        Same length and order as the input. Values that do not compare below
        ``threshold`` (NaN included) are returned unchanged.
    """
    return [penalty if value < threshold else value for value in df]
        

In [107]:
# Rows with no bus / subway stations nearby receive the penalty value.
df['bus'] = no_transport(df=df['bus'])
df['subway_station'] = no_transport(df=df['subway_station'])

In [108]:
#Location score: weighted bars, bus stations and subway stations.
df['score'] = final_score(
    df,
    col1='bar',
    col2='bus',
    col3='subway_station',
)

In [109]:
#Overall score: location terms (bar, bus, subway_station) plus the company final_score.
# Computed from the source columns rather than from 'score' itself, so running
# this cell again gives the same result instead of adding final_score twice.
df['score'] = df[['bar', 'bus', 'subway_station', 'final_score']].sum(axis = 1).round(2)

In [110]:
# First rows with the combined score.
df.head(n=5)

Unnamed: 0,index,name,lat,lng,geopoint,number_of_employees,amount_raised_k$,category_code,wealth,news_agencies,offices_near,final_score,bar,bus,subway_station,score
0,881,Wishabi,40.744549,-73.988071,"{'type': 'Point', 'coordinates': [-73.988071, ...",50,16000.0,ecommerce,0.821025,-0.0,0.764706,1.585731,-0.2,0.325,0.325,2.04
1,1119,ReachForce,40.752539,-73.987871,"{'type': 'Point', 'coordinates': [-73.987871, ...",40,14500.0,enterprise,0.650141,-0.0,0.823529,1.47367,-0.2,0.175,0.4,1.85
2,1428,SpaBooker,37.778991,-122.401803,"{'type': 'Point', 'coordinates': [-122.401803,...",42,14700.0,software,0.683624,-0.0,0.666667,1.350291,-0.2,0.125,0.025,1.3
3,209,Replay Solutions,40.752143,-73.990675,"{'type': 'Point', 'coordinates': [-73.990675, ...",30,15200.0,software,0.490005,-0.0,0.745098,1.235103,-0.2,0.175,0.4,1.61
4,1327,BASH Gaming,40.74828,-73.989276,"{'type': 'Point', 'coordinates': [-73.989276, ...",25,1000.0,games_video,0.292936,-0.0,0.901961,1.194897,-0.2,0.2,0.45,1.64


In [111]:
# Keep every column except the intermediate 'final_score'.
df_final = df.loc[:, df.columns != 'final_score']

In [112]:
# Highest score first.
df_final = df_final.sort_values(by='score', ascending=False)

In [113]:
# Best-scoring rows: the first 10 and the first 50 of the sorted frame.
top10 = df_final.iloc[:10]
top50 = df_final.iloc[:50]

In [114]:
#Cluster map top50
# Clustered markers for the 50 best-scoring offices.
m50 = folium.Map(zoom_start=15)
latlng = top50.loc[:, ['lat', 'lng']]
FastMarkerCluster(data=latlng).add_to(m50)
m50

In [115]:
#Cluster map top10
# Clustered markers for the 10 best-scoring offices.
m10 = folium.Map(zoom_start=15)
latlng = top10.loc[:, ['lat', 'lng']]
FastMarkerCluster(data=latlng).add_to(m10)
m10

In [116]:
# Final result: the ten rows of df_final with the highest score.
top10

Unnamed: 0,index,name,lat,lng,geopoint,number_of_employees,amount_raised_k$,category_code,wealth,news_agencies,offices_near,bar,bus,subway_station,score
0,881,Wishabi,40.744549,-73.988071,"{'type': 'Point', 'coordinates': [-73.988071, ...",50,16000.0,ecommerce,0.821025,-0.0,0.764706,-0.2,0.325,0.325,2.04
1,1119,ReachForce,40.752539,-73.987871,"{'type': 'Point', 'coordinates': [-73.987871, ...",40,14500.0,enterprise,0.650141,-0.0,0.823529,-0.2,0.175,0.4,1.85
4,1327,BASH Gaming,40.74828,-73.989276,"{'type': 'Point', 'coordinates': [-73.989276, ...",25,1000.0,games_video,0.292936,-0.0,0.901961,-0.2,0.2,0.45,1.64
3,209,Replay Solutions,40.752143,-73.990675,"{'type': 'Point', 'coordinates': [-73.990675, ...",30,15200.0,software,0.490005,-0.0,0.745098,-0.2,0.175,0.4,1.61
11,1408,GoHome,40.750131,-73.981242,"{'type': 'Point', 'coordinates': [-73.9812417,...",19,942.97,search,0.220739,-0.0,0.843137,-0.2,0.375,0.325,1.56
9,420,Crowd Science,40.745064,-73.992637,"{'type': 'Point', 'coordinates': [-73.992637, ...",25,2000.0,web,0.32233,-0.0,0.764706,-0.2,0.25,0.4,1.54
6,50,Echo,37.786942,-122.401245,"{'type': 'Point', 'coordinates': [-122.401245,...",50,4800.0,enterprise,0.718912,-0.4,0.843137,-0.2,0.5,0.075,1.54
12,853,kooaba,40.744201,-73.989505,"{'type': 'Point', 'coordinates': [-73.989505, ...",20,2920.0,mobile,0.270703,-0.0,0.784314,-0.2,0.325,0.35,1.53
5,1,Scribd,37.789634,-122.404052,"{'type': 'Point', 'coordinates': [-122.404052,...",50,25800.0,news,0.861548,-0.4,0.72549,-0.2,0.425,0.075,1.49
13,1299,Conformiq,40.743836,-73.989015,"{'type': 'Point', 'coordinates': [-73.9890149,...",20,4200.0,software,0.283035,-0.0,0.764706,-0.2,0.3,0.325,1.47
