In [1]:
from pymongo import MongoClient
import pandas as pd
import pymongo
from pandas.io.json import json_normalize
import re
import numpy as np
import requests


First query using mongo

<img src="../images/mongoquery.png">

In [2]:
#Connecting the database with the queried data (companies_cb)

def mongo_connect(host):             #mongodb://localhost:27017/
    client = MongoClient(host)
    db = client.DBcompanies_cb
    data = db.companies_cb
    return data


In [3]:
data = mongo_connect('mongodb://localhost:27017/')

In [4]:
#Query to receive all the required data for my analysis. (I know we should query everything at the same place (pymongo or mongodb compass), but was interested in trying both to learn.).

def mongo_query(data, min_employee, max_employee):
    one_office =  data.find({'$and': [
        {'offices': {'$exists': True}},
        {'offices': {'$ne': None}},
        #     {'number_of_employees':{'$exists':True}}
        {'number_of_employees': {'$gte': min_employee}},        #10
        {'number_of_employees': {'$lte': max_employee}},        #51
        {'offices.latitude': {'$ne': None}},
        {'offices.longitude': {'$ne': None}},
        {'offices.latitude': {'$exists': True}},
        {'offices.longitude': {'$exists': True}}

    ]})

    return pd.DataFrame(one_office)


In [5]:
one_office = mongo_query(data, 10, 51)

In [6]:
#Merge deadpool related columns into 1 and fill blanks with NaN values.
def deadpooled_finder(df):
    df['deadpooled'] = df[df.columns[10:13]].apply(lambda x: ','.join(x.dropna().astype(str)),
                                                                           axis=1).replace(r'^\s*$', np.nan, regex=True)
    return df['deadpooled']


In [7]:
one_office['deadpooled'] = deadpooled_finder(one_office)

In [8]:
#Select alive companies. If they have 'deadpoled' data I understand they are dead. 
def alives_finder(df):
    return df[pd.isnull(df['deadpooled'])]


In [9]:
one_office = alives_finder(one_office)

In [10]:
def columns_drop(df, col):
    return df[[x for x in df.columns if x != col]]

# Beaware when pipelines


In [11]:
one_office.columns

Index(['_id', 'acquisition', 'acquisitions', 'alias_list', 'blog_feed_url',
       'blog_url', 'category_code', 'competitions', 'created_at',
       'crunchbase_url', 'deadpooled_day', 'deadpooled_month',
       'deadpooled_url', 'deadpooled_year', 'description', 'email_address',
       'external_links', 'founded_day', 'founded_month', 'founded_year',
       'funding_rounds', 'homepage_url', 'image', 'investments', 'ipo',
       'milestones', 'name', 'number_of_employees', 'offices', 'overview',
       'partners', 'permalink', 'phone_number', 'products', 'providerships',
       'relationships', 'screenshots', 'tag_list', 'total_money_raised',
       'twitter_username', 'updated_at', 'video_embeds', 'deadpooled'],
      dtype='object')

In [12]:
def relevant_columns(df):
        return pd.DataFrame(df[['name', 'category_code', 'number_of_employees', 'offices','total_money_raised']])

    


In [13]:
# data = pd.DataFrame(one_office[['name', 'category_code', 'founded_year', 'number_of_employees', 'offices','total_money_raised', 'deadpooled']])
data = relevant_columns(one_office)


In [14]:
#Dropping columns we no longer need.

# data = columns_drop(data, 'deadpooled')
# data = columns_drop(data, 'founded_year')


In [15]:
#Finding out the different types of currencies we can find along the dataset.

currencies_types=[]
for i in data['total_money_raised']:
    if i[0:2] not in currencies_types:
        currencies_types.append(i[0:2])



In [16]:
#Converting symbols into string values for future uses.
def currency_converter(df):
    currency_type = {'C$': 'CAD',
                '$': 'USD',
                '€': 'EUR',
                '£': 'GBP',
                '¥': 'JPY',
                'kr': 'SEK'}
    for symb, name in currency_type.items():
        if symb in df:
#             a = df.replace(symb,'')
            return name

data['currency'] = data['total_money_raised'].apply(currency_converter)



In [17]:
#Deleting currency symbols.

def symbol_deleter(df):
    currency_type = {'C$': 'CAD',
                '$': 'USD',
                '€': 'EUR',
                '£': 'GBP',
                '¥': 'JPY',
                'kr': 'SEK'}
    for symb, name in currency_type.items():
        if symb in df:
            return df.replace(symb, "")

data['total_money_raised'] = data['total_money_raised'].apply(symbol_deleter)

In [18]:
#Converting "total_money_raised" into integers.
def money_converter(df):
    amount_type = dict(k='E3', M='E6', B='E9')
    return pd.to_numeric(df.replace(amount_type, regex=True)).astype(float)
    


In [19]:
data['amount_raised'] = money_converter(data['total_money_raised'])

In [20]:
#Create a dictionary with the needed exchange rates using an API to obtain real data.
def api_rates(url):
    response = requests.get(url)
    api_data = response.json()
    api_dataframe = pd.DataFrame(json_normalize(api_data))
    api_dict = {
            'CAD':api_dataframe['rates.CAD'][0],
            'EUR':api_dataframe['rates.EUR'][0],
            'GBP':api_dataframe['rates.GBP'][0],
            'JPY':api_dataframe['rates.JPY'][0],
            'SEK':api_dataframe['rates.SEK'][0],
            'USD':1
            }
    return api_dict

In [21]:
api_dict = api_rates('https://api.exchangerate-api.com/v4/latest/USD')

In [22]:
#Set the currency valuation for every company.

def currency_rate(df):
    return pd.to_numeric(df.replace(api_dict, regex=True))

data['currency'] = currency_rate(data['currency'])

In [23]:
#Standarize all valuations into one currency ($) and convert them into millions.

def normalizator(df):
    return ((df['amount_raised']/df['currency'])/1000).round(2)

data['amount_raised_k$'] = normalizator(data)

In [24]:
#Dropping more columns

data = columns_drop(data, 'total_money_raised')
data = columns_drop(data, 'currency')
data = columns_drop(data, 'amount_raised')

In [25]:
#There are some companies which have >1 offices. Separate them into different rows. 

def office_splitter(df):
    office_split = pd.DataFrame(df['offices'].tolist()).stack().reset_index(level=1, drop=True).rename('office')
    return df.merge(office_split, left_index=True, right_index=True).reset_index()
office_merged = office_splitter(data)

In [26]:
#Dropping column offices since now we have it splitted into different rows already.

office_merged = columns_drop(office_merged, 'offices')

In [27]:
#Check if there are duplicated values into the 'office' column

find_duplicated = json_normalize(office_merged['office'])
find_duplicated.duplicated().sum()


100

In [28]:
office_merged.head()

Unnamed: 0,index,name,category_code,number_of_employees,amount_raised_k$,office
0,0,Wetpaint,web,47,39800.0,"{'description': '', 'address1': '710 - 2nd Ave..."
1,0,Wetpaint,web,47,39800.0,"{'description': '', 'address1': '270 Lafayette..."
2,1,Geni,web,18,16500.0,"{'description': 'Headquarters', 'address1': '9..."
3,2,Scribd,news,50,25800.0,"{'description': 'HQ', 'address1': '539 Bryant ..."
4,3,MeetMoi,social,15,5580.0,"{'description': None, 'address1': '', 'address..."


In [29]:
# Deleting duplicates

def duplicates_remover(df):
    df['duplicates'] = df['office'].astype(str)
    return df.drop_duplicates('duplicates', keep = 'first')

office_merged = duplicates_remover(office_merged)
office_merged = columns_drop(office_merged, 'duplicates')



In [30]:
find_duplicated1 = json_normalize(office_merged['office'])
find_duplicated1.duplicated().sum()


0

In [31]:
#I assume companies who have raised more money will pay higher income to their employees. But do not forget the number of employees is important.
def wealthy(df):
    wealth = pd.DataFrame((np.log(df['amount_raised_k$']).astype(str).replace('-inf','1').astype(float)*df['number_of_employees']))
    divisor = wealth.max()
    return wealth/divisor


In [32]:
office_merged['wealth'] = wealthy(office_merged)

  This is separate from the ipykernel package so we can avoid doing imports until


In [33]:
office_merged.sort_values('wealth', ascending = False).head(10)

Unnamed: 0,index,name,category_code,number_of_employees,amount_raised_k$,office,wealth
1226,1209,Green Apple Media,web,50,132000.0,"{'description': 'Emporis GmbH', 'address1': 'A...",1.0
237,259,CLEAR,travel,50,116000.0,"{'description': None, 'address1': 'PO Box 4831...",0.989041
1230,1213,CouponCabin,web,50,54000.0,"{'description': 'HQ', 'address1': '80 Airport ...",0.924192
380,405,Aster Data Systems,analytics,50,53000.0,"{'description': None, 'address1': '1021 E. 7th...",0.922607
1260,1240,Mantara,software,50,50600.0,"{'description': 'ClerkDogs', 'address1': '', '...",0.918676
28,32,Socialtext,enterprise,50,46800.0,"{'description': 'Corporate Headquarters', 'add...",0.912055
977,983,Movidius,semiconductor,50,46500.0,"{'description': 'Headquarters', 'address1': '1...",0.91151
26,30,ooma,hardware,47,81300.0,"{'description': None, 'address1': '855 Oak Gro...",0.901361
1164,1149,Apprenda,enterprise,51,31000.0,"{'description': 'Headquarters', 'address1': '3...",0.894663
1165,1149,Apprenda,enterprise,51,31000.0,"{'description': 'Regional Office', 'address1':...",0.894663


In [34]:
#Analysing data. Which kind of companies do we have:

a = pd.get_dummies(office_merged['category_code'])
a.sum().sort_values(ascending = False)

software            396
web                 299
advertising         144
other               105
mobile              104
enterprise           93
consulting           90
games_video          87
ecommerce            76
public_relations     66
network_hosting      59
search               57
hardware             40
analytics            17
security             11
biotech              10
social                8
messaging             8
education             7
music                 6
finance               6
news                  6
cleantech             6
real_estate           5
health                3
travel                3
photo_video           2
sports                2
medical               1
fashion               1
nanotech              1
semiconductor         1
dtype: int64

In [35]:
def hot_encoder(df, category):
    lst = []
    for i in df:
        if i == category:
            lst.append(1)
        else:
            lst.append(0)
    return lst

office_merged['news_agencies'] = hot_encoder(office_merged['category_code'], 'news')

In [36]:
#Function to convert the info within offices into a geopoint.

def geopoint(data):
    data = data['office']
    principal = None
    if data['latitude'] and data['longitude']:                   #Make sure there is data
        principal = {
            "type":"Point",
            "coordinates":[data['longitude'], data['latitude']]
        }

    return {
        "lat": data['latitude'],
        "lng": data['longitude'],
        "geopoint": principal
    }

geopoint = office_merged.apply(geopoint, result_type="expand", axis=1).dropna()


In [37]:
#Concatenating data with geopoints
def concatenator(df1, df2):
    return pd.concat([df1, df2], axis=1)


In [38]:
offices_geo = concatenator(office_merged, geopoint)

In [39]:
offices_geo.head()

Unnamed: 0,index,name,category_code,number_of_employees,amount_raised_k$,office,wealth,news_agencies,geopoint,lat,lng
0,0,Wetpaint,web,47,39800.0,"{'description': '', 'address1': '710 - 2nd Ave...",0.844415,0,"{'type': 'Point', 'coordinates': [-122.333253,...",47.603122,-122.333253
1,0,Wetpaint,web,47,39800.0,"{'description': '', 'address1': '270 Lafayette...",0.844415,0,"{'type': 'Point', 'coordinates': [-73.9964312,...",40.723731,-73.996431
2,1,Geni,web,18,16500.0,"{'description': 'Headquarters', 'address1': '9...",0.296509,0,"{'type': 'Point', 'coordinates': [-118.393064,...",34.090368,-118.393064
3,2,Scribd,news,50,25800.0,"{'description': 'HQ', 'address1': '539 Bryant ...",0.861548,1,"{'type': 'Point', 'coordinates': [-122.404052,...",37.789634,-122.404052
4,3,MeetMoi,social,15,5580.0,"{'description': None, 'address1': '', 'address...",0.219505,0,"{'type': 'Point', 'coordinates': [-73.985506, ...",40.757929,-73.985506


In [40]:
#Dropping columns we no longer need

offices_geo = columns_drop(offices_geo, 'office')
offices_geo = columns_drop(offices_geo, 'index')

In [41]:
offices_geo.head()

Unnamed: 0,name,category_code,number_of_employees,amount_raised_k$,wealth,news_agencies,geopoint,lat,lng
0,Wetpaint,web,47,39800.0,0.844415,0,"{'type': 'Point', 'coordinates': [-122.333253,...",47.603122,-122.333253
1,Wetpaint,web,47,39800.0,0.844415,0,"{'type': 'Point', 'coordinates': [-73.9964312,...",40.723731,-73.996431
2,Geni,web,18,16500.0,0.296509,0,"{'type': 'Point', 'coordinates': [-118.393064,...",34.090368,-118.393064
3,Scribd,news,50,25800.0,0.861548,1,"{'type': 'Point', 'coordinates': [-122.404052,...",37.789634,-122.404052
4,MeetMoi,social,15,5580.0,0.219505,0,"{'type': 'Point', 'coordinates': [-73.985506, ...",40.757929,-73.985506


In [42]:
offices_geo.shape

(1726, 9)

In [45]:
def json_creator(df, name):
    return df.to_json(f'../data/{name}.json', orient="records")


In [46]:
json_creator(offices_geo, 'geoffices')

Create the new collection in mongodb compass (geo_offices in my case) and import the geoffices.json writting the following comand into the terminal:

**mongoimport --db DBcompanies_cb --collection companies_clean --file geoffices.json --jsonArray**

Now move into the indexes area inside mongodb compass and create an index selecting the 'geopoint' column and 2dsphere. The result should look like this:

![geoindex](../images/index.png)