In [1]:
from pymongo import MongoClient
import pandas as pd
import pymongo
from pandas.io.json import json_normalize
import re
import numpy as np
import requests


First query using mongo

<img src="images/mongoquery.png" width="800" />

In [2]:
#Connecting the database with the queried data (data_companies_clean)

client = MongoClient('mongodb://localhost:27017/')
db = client.DBcompanies_cb
data = db.companies_cb

In [3]:
#Query using Pymongo to receive all the required data for my analysis. (I know we should query everything at the same place (pymongo or mongodb compass), but was interested in trying both to learn.).


one_office = db.companies_cb.find({'$and':[
    {'offices':{'$exists':True}},
    {'offices':{'$ne': None}},
#     {'number_of_employees':{'$exists':True}}
    {'number_of_employees':{'$gte': 10}},
    {'offices.latitude':{'$ne':None}},
    {'offices.longitude':{'$ne':None}},
    {'offices.latitude':{'$exists':True}},
    {'offices.longitude':{'$exists':True}}
    
]})


In [4]:
#Create first dataframe and show the columns we have.

one_office = pd.DataFrame(one_office)


In [6]:
#Merge deadpool related columns into 1 and fill blanks with NaN values.

one_office['deadpooled'] = one_office[one_office.columns[10:13]].apply(lambda x: ','.join(x.dropna().astype(str)), axis=1).replace(r'^\s*$', np.nan, regex=True)


In [7]:
#Select relevant columns for the project.

data = pd.DataFrame(one_office[['name', 'category_code', 'founded_year', 'number_of_employees', 'offices','total_money_raised', 'deadpooled']])


In [8]:
#Select alive companies. If they have 'deadpoled' data I understand they are dead. 

data = data[pd.isnull(data['deadpooled'])]


In [9]:
def columns_drop(df, col):
    return df[[x for x in df.columns if x != col]]

In [10]:
#Dropping columns we no longer need.

data = columns_drop(data, 'deadpooled')
data = columns_drop(data, 'founded_year')


In [11]:
#Finding out the different types of currencies we can find along the dataset.

currencies_types=[]
for i in data['total_money_raised']:
    if i[0:2] not in currencies_types:
        currencies_types.append(i[0:2])



In [12]:
#Converting symbols into string values for future uses.

currency_type = {'C$': 'CAD',
                '$': 'USD',
                '€': 'EUR',
                '£': 'GBP',
                '¥': 'JPY',
                'kr': 'SEK'}

def currency_converter(df):
    for symb, name in currency_type.items():
        if symb in df:
            a = df.replace(symb,'')
            return name

data['currency'] = data['total_money_raised'].apply(currency_converter)



In [13]:
#Deleting currency symbols.

def symbol_deleter(df):
    for symb, name in currency_type.items():
        if symb in df:
            return df.replace(symb, "")

data['total_money_raised'] = data['total_money_raised'].apply(symbol_deleter)

In [14]:
#Converting "total_money_raised" into integers.

amount_type = dict(k='E3', M='E6', B='E9')
data['amount_raised'] = pd.to_numeric(data['total_money_raised'].replace(amount_type, regex=True)).astype(float)



In [15]:
#Create a dictionary with the needed exchange rates using an API to obtain real data.
url = 'https://api.exchangerate-api.com/v4/latest/USD'

# Requesting data...

response = requests.get(url)
api_data = response.json()

api_dataframe = pd.DataFrame(json_normalize(api_data))
api_dict = {'CAD':api_dataframe['rates.CAD'][0],
            'EUR':api_dataframe['rates.EUR'][0],
            'GBP':api_dataframe['rates.GBP'][0],
            'JPY':api_dataframe['rates.JPY'][0],
            'SEK':api_dataframe['rates.SEK'][0],
            'USD':1
    }


In [16]:
#Set the currency valuation for every company.

def currency_rate(df):
    return pd.to_numeric(df.replace(api_dict, regex=True))

data['currency'] = currency_rate(data['currency'])

In [17]:
#Standarize all valuations into one currency ($) and convert them into millions.

def normalizator(df):
    return ((df['amount_raised']/df['currency'])/1000).round(2)

data['amount_raised_k$'] = normalizator(data)

In [18]:
#Dropping more columns

data = columns_drop(data, 'total_money_raised')
data = columns_drop(data, 'currency')
data = columns_drop(data, 'amount_raised')

In [19]:
#Inspect info provided inside 'offices' column

c = pd.DataFrame(json_normalize(data['offices'][4]))
c

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code
0,1601 Willow Road,,Menlo Park,USA,Headquarters,37.41605,-122.151801,CA,94025.0
1,,,Dublin,IRL,Europe HQ,53.344104,-6.267494,,
2,340 Madison Ave,,New York,USA,New York,40.755716,-73.979247,NY,10017.0


In [20]:
#There are some companies which have >1 offices. Separate them into different rows. 

data2 = data.copy()
office_split = pd.DataFrame(data2['offices'].tolist()).stack().reset_index(level=1, drop=True).rename('office')
office_merged = data2.merge(office_split, left_index=True, right_index=True).reset_index()


In [21]:
#Checking the values are correct and belong to each company

d = pd.DataFrame(json_normalize(office_merged['offices'][5]))
d

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code
0,1601 Willow Road,,Menlo Park,USA,Headquarters,37.41605,-122.151801,CA,94025.0
1,,,Dublin,IRL,Europe HQ,53.344104,-6.267494,,
2,340 Madison Ave,,New York,USA,New York,40.755716,-73.979247,NY,10017.0


In [22]:
#Dropping column offices since now we have it splitted into different rows already.

office_merged = columns_drop(office_merged, 'offices')

In [23]:
#Check if there are duplicated values into the 'office' column

find_duplicated = json_normalize(office_merged['office'])
find_duplicated.duplicated().sum()


145

In [24]:
# Deleting duplicates

def duplicates_remover(df):
    office_merged['duplicates'] = df.astype(str)
    return office_merged.drop_duplicates('duplicates', keep = 'first')

office_merged = duplicates_remover(office_merged['office'])
office_merged = columns_drop(office_merged, 'duplicates')



In [26]:
find_duplicated1 = json_normalize(office_merged['office'])
find_duplicated1.duplicated().sum()


0

In [27]:
#I assume companies who have raised more money will pay higher income to their employees. But do not forget the number of employees is important.

office_merged['wealth'] = pd.DataFrame((np.log(office_merged['amount_raised_k$']).astype(str).replace('-inf','1').astype(float)*office_merged['number_of_employees']))


  This is separate from the ipykernel package so we can avoid doing imports until


In [28]:
#Converting into numbers we can understand. Max value = 1, min value = close to 0 = 2.73...e-06

divisor = office_merged['wealth'].max()
office_merged['wealth'] = office_merged['wealth']/divisor

In [29]:
office_merged.sort_values('wealth', ascending = False).head(10)

Unnamed: 0,index,name,category_code,number_of_employees,amount_raised_k$,office,wealth
102,96,PayPal,finance,300000,197000.0,"{'description': '', 'address1': '181 Fremont S...",1.0
564,552,Comcast,games_video,100000,725000.0,"{'description': '', 'address1': 'Tyrconnell Pa...",0.36896
2796,2542,General Dynamics,transportation,92900,84900.0,"{'description': '', 'address1': '2434 Lincoln ...",0.288286
980,933,MetaCarta,mobile,99999,10000.0,"{'description': 'Main Office', 'address1': '86...",0.251833
891,861,Xerox,hardware,57400,1100000.0,"{'description': 'Main Office', 'address1': 'Tr...",0.218326
14,11,Cisco,network_hosting,63000,2500.0,"{'description': 'Headquarters', 'address1': '1...",0.134776
454,448,IBM,software,388000,0.0,"{'description': 'HQ', 'address1': '149 New Mon...",0.10609
95,89,Sony,hardware,180500,0.0,"{'description': '', 'address1': '1919 S Bascom...",0.049354
439,432,LG,hardware,177000,0.0,"{'description': '', 'address1': '450 National ...",0.048397
764,748,VMware,software,13500,369000.0,"{'description': None, 'address1': '215 First S...",0.047317


In [30]:
#Function to convert the info within offices into a geopoint.

def geopoint(data):
    data = data['office']
    principal = None
    if data['latitude'] and data['longitude']:                   #Make sure there is data
        principal = {
            "type":"Point",
            "coordinates":[data['longitude'], data['latitude']]
        }

    return {
        "lat": data['latitude'],
        "lng": data['longitude'],
        "geopoint": principal
    }

geopoint = office_merged.apply(geopoint, result_type="expand", axis=1).dropna()


In [31]:
#Concatenating data with geopoints

offices_geo = pd.concat([office_merged, geopoint], axis=1)


In [32]:
#Dropping columns we no longer need

offices_geo = columns_drop(offices_geo, 'office')
offices_geo = columns_drop(offices_geo, 'index')

In [33]:
offices_geo.head()

Unnamed: 0,name,category_code,number_of_employees,amount_raised_k$,wealth,geopoint,lat,lng
0,Wetpaint,web,47,39800.0,0.000136,"{'type': 'Point', 'coordinates': [-122.333253,...",47.603122,-122.333253
1,Wetpaint,web,47,39800.0,0.000136,"{'type': 'Point', 'coordinates': [-73.9964312,...",40.723731,-73.996431
2,AdventNet,enterprise,600,0.0,0.000164,"{'type': 'Point', 'coordinates': [-121.904945,...",37.692934,-121.904945
3,Zoho,software,1600,0.0,0.000437,"{'type': 'Point', 'coordinates': [-121.904945,...",37.692934,-121.904945
4,Digg,news,60,45000.0,0.000176,"{'type': 'Point', 'coordinates': [-122.394523,...",37.764726,-122.394523


In [39]:
offices_geo.to_json('./data/geoffices.json', orient="records")


Create the new collection in mongodb compass (geo_offices in my case) and import the geoffices.json writting the following comand into the terminal:

**mongoimport --db DBcompanies_cb --collection geo_offices --file geoffices.json --jsonArray**

Now move into the indexes area inside mongodb compass and create an index selecting the 'geopoint' column and 2dsphere. The result should look like this:

<img src="images/index.png" width='800' />