In [1]:
from pymongo import MongoClient
import pandas as pd
import pymongo
from pandas.io.json import json_normalize
import re
import numpy as np
import requests


In [2]:
# #Connecting the database with the queried data (data_companies_clean)
# client = MongoClient('mongodb://localhost:27017/')
# db = client.companies
# data = db.data_companies_clean


In [3]:
#Connecting the database with the queried data (data_companies_clean)
client = MongoClient('mongodb://localhost:27017/')
db = client.DBcompanies_cb
data = db.companies_cb

In [4]:
#Query to receive all the essential data for my analysis. 
one_office = db.companies_cb.find({'$and':[
    {'offices':{'$exists':True}},
    {'offices':{'$ne': None}},
#     {'number_of_employees':{'$exists':True}}
    {'number_of_employees':{'$gte': 10}}
]})


In [5]:
#Create first dataframe and show the columns we have
one_office = pd.DataFrame(one_office)

In [6]:
#Merge deadpool related columns into 1 and fill blanks with NaN values.
one_office['deadpooled'] = one_office[one_office.columns[10:13]].apply(lambda x: ','.join(x.dropna().astype(str)), axis=1).replace(r'^\s*$', np.nan, regex=True)


In [7]:
#Select relevant columns for the project
data = pd.DataFrame(one_office[['name', 'category_code', 'founded_year', 'number_of_employees', 'offices','total_money_raised', 'deadpooled']])


In [8]:
#Select alive companies. If they have 'deadpoled' data I understand they are dead. 
data = data[pd.isnull(data['deadpooled'])]


In [9]:
def columns_drop(df, col):
    return df[[x for x in df.columns if x != col]]

In [10]:
#Dropping columns we no longer need.
data = columns_drop(data, 'deadpooled')
data = columns_drop(data, 'founded_year')
data.head()

Unnamed: 0,name,category_code,number_of_employees,offices,total_money_raised
0,Wetpaint,web,47,"[{'description': '', 'address1': '710 - 2nd Av...",$39.8M
1,AdventNet,enterprise,600,"[{'description': 'Headquarters', 'address1': '...",$0
2,Zoho,software,1600,"[{'description': 'Headquarters', 'address1': '...",$0
3,Digg,news,60,"[{'description': None, 'address1': '135 Missis...",$45M
4,Facebook,social,5299,"[{'description': 'Headquarters', 'address1': '...",$2.43B


In [11]:
#Finding out the different types of currencies we can find along the dataset.
currencies_types=[]
for i in data['total_money_raised']:
    if i[0:2] not in currencies_types:
        currencies_types.append(i[0:2])

print(currencies_types)

['$3', '$0', '$4', '$2', '$1', '$6', '$5', '$7', '$8', '$9', '€1', '€3', 'C$', '€4', '£2', '€8', '€5', '€9', '€2', '£3', '£1', '£4', '€6', '€7', '£7', '¥2', 'kr', '¥4', '¥1', '£5', '£6', '£9']


In [12]:
#Converting symbols into string values for future uses
currency_type = {'C$': 'CAD',
                '$': 'USD',
                '€': 'EUR',
                '£': 'GBP',
                '¥': 'JPY',
                'kr': 'SEK'}

def currency_converter(df):
    for symb, name in currency_type.items():
        if symb in df:
            a = df.replace(symb,'')
            return name

data['currency'] = data['total_money_raised'].apply(currency_converter)



In [13]:
#Deleting currency symbols
def symbol_deleter(df):
    for symb, name in currency_type.items():
        if symb in df:
            return df.replace(symb, "")

data['total_money_raised'] = data['total_money_raised'].apply(symbol_deleter)

In [14]:
#Converting "total_money_raised" into integers
amount_type = dict(k='E3', M='E6', B='E9')
data['amount_raised'] = pd.to_numeric(data['total_money_raised'].replace(amount_type, regex=True)).astype(float)



In [15]:
#Create a dictionary with the needed exchange rates using an API to obtain real data.
url = 'https://api.exchangerate-api.com/v4/latest/USD'

# Making our request
response = requests.get(url)
api_data = response.json()

api_dataframe = pd.DataFrame(json_normalize(api_data))
api_dict = {'CAD':api_dataframe['rates.CAD'][0],
            'EUR':api_dataframe['rates.EUR'][0],
            'GBP':api_dataframe['rates.GBP'][0],
            'JPY':api_dataframe['rates.JPY'][0],
            'SEK':api_dataframe['rates.SEK'][0],
            'USD':1
    }
api_dict

{'CAD': 1.329868,
 'EUR': 0.901207,
 'GBP': 0.82387,
 'JPY': 106.45378,
 'SEK': 9.652291,
 'USD': 1}

In [16]:
#Set the currency valuation for every company
def currency_rate(df):
    return pd.to_numeric(df.replace(api_dict, regex=True))
data['currency'] = currency_rate(data['currency'])

In [17]:
#Standarize all valuations into one currency ($) and convert them into millions
def normalizator(df):
    return ((df['amount_raised']/df['currency'])/1000000).round(2)

data['amount_raised_M$'] = normalizator(data)

In [18]:
#Dropping more columns
data = columns_drop(data, 'total_money_raised')
data = columns_drop(data, 'currency')
data = columns_drop(data, 'amount_raised')

In [19]:
#Inspect info provided inside 'offices' column
c = pd.DataFrame(json_normalize(data['offices'][4]))
c

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code
0,1601 Willow Road,,Menlo Park,USA,Headquarters,37.41605,-122.151801,CA,94025.0
1,,,Dublin,IRL,Europe HQ,53.344104,-6.267494,,
2,340 Madison Ave,,New York,USA,New York,40.755716,-73.979247,NY,10017.0


In [20]:
#There are some companies which have >1 offices. Separate them into different rows. 
data2 = data.copy()
office_split = pd.DataFrame(data2['offices'].tolist()).stack().reset_index(level=1, drop=True).rename('office')
office_merged = data2.merge(office_split, left_index=True, right_index=True).reset_index()


In [21]:
#Checking the values are correct and belong to each company
d = pd.DataFrame(json_normalize(office_merged['offices'][5]))
d

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code
0,1601 Willow Road,,Menlo Park,USA,Headquarters,37.41605,-122.151801,CA,94025.0
1,,,Dublin,IRL,Europe HQ,53.344104,-6.267494,,
2,340 Madison Ave,,New York,USA,New York,40.755716,-73.979247,NY,10017.0


In [22]:
print(len(data))
print(len(office_merged))

4406
5856


In [23]:
#Count the number of offices every company has
offices_number = office_merged['name'].groupby(office_merged['name'], sort=False).count()


In [24]:
office_merged.head(9)

Unnamed: 0,index,name,category_code,number_of_employees,offices,amount_raised_M$,office
0,0,Wetpaint,web,47,"[{'description': '', 'address1': '710 - 2nd Av...",39.8,"{'description': '', 'address1': '710 - 2nd Ave..."
1,0,Wetpaint,web,47,"[{'description': '', 'address1': '710 - 2nd Av...",39.8,"{'description': '', 'address1': '270 Lafayette..."
2,1,AdventNet,enterprise,600,"[{'description': 'Headquarters', 'address1': '...",0.0,"{'description': 'Headquarters', 'address1': '4..."
3,2,Zoho,software,1600,"[{'description': 'Headquarters', 'address1': '...",0.0,"{'description': 'Headquarters', 'address1': '4..."
4,3,Digg,news,60,"[{'description': None, 'address1': '135 Missis...",45.0,"{'description': None, 'address1': '135 Mississ..."
5,4,Facebook,social,5299,"[{'description': 'Headquarters', 'address1': '...",2430.0,"{'description': 'Headquarters', 'address1': '1..."
6,4,Facebook,social,5299,"[{'description': 'Headquarters', 'address1': '...",2430.0,"{'description': 'Europe HQ', 'address1': '', '..."
7,4,Facebook,social,5299,"[{'description': 'Headquarters', 'address1': '...",2430.0,"{'description': 'New York', 'address1': '340 M..."
8,5,Geni,web,18,"[{'description': 'Headquarters', 'address1': '...",16.5,"{'description': 'Headquarters', 'address1': '9..."


In [25]:
#Dropping column offices since now we have it splitted into different rows already.
office_merged = columns_drop(office_merged, 'offices')

In [26]:
j = json_normalize(office_merged['office'])
j.head()

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code
0,710 - 2nd Avenue,Suite 1100,Seattle,USA,,47.603122,-122.333253,WA,98104
1,270 Lafayette Street,Suite 505,New York,USA,,40.723731,-73.996431,NY,10012
2,4900 Hopyard Rd.,Suite 310,Pleasanton,USA,Headquarters,37.692934,-121.904945,CA,94588
3,4900 Hopyard Rd,Suite 310,Pleasanton,USA,Headquarters,37.692934,-121.904945,CA,94588
4,135 Mississippi St,,San Francisco,USA,,37.764726,-122.394523,CA,94107


In [27]:
office_merged.head(9)

Unnamed: 0,index,name,category_code,number_of_employees,amount_raised_M$,office
0,0,Wetpaint,web,47,39.8,"{'description': '', 'address1': '710 - 2nd Ave..."
1,0,Wetpaint,web,47,39.8,"{'description': '', 'address1': '270 Lafayette..."
2,1,AdventNet,enterprise,600,0.0,"{'description': 'Headquarters', 'address1': '4..."
3,2,Zoho,software,1600,0.0,"{'description': 'Headquarters', 'address1': '4..."
4,3,Digg,news,60,45.0,"{'description': None, 'address1': '135 Mississ..."
5,4,Facebook,social,5299,2430.0,"{'description': 'Headquarters', 'address1': '1..."
6,4,Facebook,social,5299,2430.0,"{'description': 'Europe HQ', 'address1': '', '..."
7,4,Facebook,social,5299,2430.0,"{'description': 'New York', 'address1': '340 M..."
8,5,Geni,web,18,16.5,"{'description': 'Headquarters', 'address1': '9..."


In [28]:
#Check if there are duplicated values into the 'office' column
find_duplicated = json_normalize(office_merged['office'])
find_duplicated.duplicated().sum()

328

In [29]:
# Deleting duplicates 

def duplicates_remover(df):
    office_merged['duplicates'] = df.astype(str)
    return office_merged.drop_duplicates('duplicates', keep = 'first')

office_merged = duplicates_remover(office_merged['office'])

office_merged = columns_drop(office_merged, 'duplicates')



In [31]:
len(office_merged)

5528

In [34]:
# data
find_duplicated1 = json_normalize(office_merged['office'])
find_duplicated1.duplicated().sum()
len(office_merged)

5528

In [82]:
# len(data['data_string'])

In [112]:
data12['data_string'] = data12['office'].astype(str)
aj = data12.drop_duplicates('data_string')

In [113]:
find_duplicated1 = json_normalize(aj['office'])
find_duplicated1.duplicated().sum()

0

In [98]:
len(aj)

5528

In [99]:
aj

Unnamed: 0,index,name,category_code,number_of_employees,amount_raised_M$,office,data_string
0,0,Wetpaint,web,47,39.80,"{'description': '', 'address1': '710 - 2nd Ave...","{'description': '', 'address1': '710 - 2nd Ave..."
1,0,Wetpaint,web,47,39.80,"{'description': '', 'address1': '270 Lafayette...","{'description': '', 'address1': '270 Lafayette..."
2,1,AdventNet,enterprise,600,0.00,"{'description': 'Headquarters', 'address1': '4...","{'description': 'Headquarters', 'address1': '4..."
3,2,Zoho,software,1600,0.00,"{'description': 'Headquarters', 'address1': '4...","{'description': 'Headquarters', 'address1': '4..."
4,3,Digg,news,60,45.00,"{'description': None, 'address1': '135 Mississ...","{'description': None, 'address1': '135 Mississ..."
5,4,Facebook,social,5299,2430.00,"{'description': 'Headquarters', 'address1': '1...","{'description': 'Headquarters', 'address1': '1..."
6,4,Facebook,social,5299,2430.00,"{'description': 'Europe HQ', 'address1': '', '...","{'description': 'Europe HQ', 'address1': '', '..."
7,4,Facebook,social,5299,2430.00,"{'description': 'New York', 'address1': '340 M...","{'description': 'New York', 'address1': '340 M..."
8,5,Geni,web,18,16.50,"{'description': 'Headquarters', 'address1': '9...","{'description': 'Headquarters', 'address1': '9..."
9,6,Twitter,social,1300,1160.00,"{'description': '', 'address1': '1355 Market S...","{'description': '', 'address1': '1355 Market S..."


In [28]:
# #Function to convert the info within offices into columns.
# def latlong(data):
#     data = data['offices']
#     principal = None
#     if data[0]['latitude'] and data[0]['longitude']:                   #Check there is data
#         principal = {
#             "type":"Point",
#             "coordinates":[data[0]['longitude'], data[0]['latitude']]
#         }

#     return {
#         "totalOffices": len(data),
#         "lat": data[0]['latitude'],
#         "lng": data[0]['longitude'],
#         "main_office (geoquery)": principal
#     }

# data_latlong = data[["offices"]].apply(latlong, result_type="expand", axis=1).dropna()


In [33]:
office_merged2 = office_merged.copy()

In [32]:
e = pd.DataFrame(json_normalize(office_merged['office']))
len(e)

5856

In [35]:
asdf = e.duplicated()

In [38]:
asdf.value_counts()

False    5528
True      328
dtype: int64

In [None]:
#Concatenate all data
data = pd.concat([data, data_latlong], axis = 1)


In [None]:
data = data[[x for x in data.columns if x !='deadpooled' and x != 'offices']]

In [None]:
data.head()

In [None]:
a = json_normalize(data['main_office (geoquery)'][0])
a

In [None]:
data.dtypes

In [None]:
data.fillna(0).head()

In [None]:
data['founded_year'].astype(int)

In [None]:
def fixints(col):
    data[col].fillna(0, inplace = True)
    data[col] = data[col].astype(int)
    return data[col]

data = fixints ('founded_year')

In [None]:
# data_cols = data_companies_1office[['alias_list', 'category_code','description', 'total_money_raised','founded_year','offices', 'deadpooled_year', 'deadpooled_day', 'deadpooled_month',
#        'deadpooled_url' ]]

In [None]:
def latlong(data):
    data = data['offices']
#    return (len(data),data[0]['latitude'],data[0]['longitude'])

    # Only create the geoJSON object if all geodata is available
    principal = None
    if data[0]['latitude'] and data[0]['longitude']:
        principal = {
            "type":"Point",
            "coordinates":[data[0]['longitude'], data[0]['latitude']]
        }

    return {
        "totalOffices": len(data),
        "lat": data[0]['latitude'],
        "lng": data[0]['longitude'],
        "main_office": principal
    }


data_latlong = data_cols[["offices"]].apply(latlong, result_type="expand", axis=1)

In [None]:
data_latlong.dropna(inplace = True)

In [None]:
display(data_latlong.head())


In [None]:
# def latlong(df):
#     df = df['offices']
#     for l in df:
#         if l['latitude'] and l['longitude']:
#             return {
#                     "lat": l['latitude'],
#                     "long": l['longitude']
                    
#                 }

# first_office = data_cols[['offices']].apply(latlong, result_type = "expand", axis=1)
# display(first_office.head())

In [None]:
data_office = pd.concat([data_cols, data_latlong],axis=1)
display(data_office.head())
data_office.shape

In [None]:
data_office.dropna(subset=['lat','lng'], inplace = True)
data_office.shape


In [None]:
# data_companies = data_latlong[['alias_list', 'category_code', 'description', 'total_money_raised', 'founded_year', 'deadpooled_year', 'deadpooled_day', 'deadpooled_month',
#        'deadpooled_url', 'lat', 'long']]
# data_companies.shape

In [None]:
#drop notnull values in deadpool = if it has a value it means the startup is dead.
data2 = data_office[pd.isnull(data_office['deadpooled_year'])]
data3 = data2[pd.isnull(data_office['deadpooled_day'])]
data4 = data3[pd.isnull(data_office['deadpooled_month'])]
data_deads = data4[pd.isnull(data_office['deadpooled_url'])]


data2.shape

In [None]:
display(data_deads.head())

In [None]:
data_companies_clean = data_deads[['alias_list', 'category_code', 'description', 'total_money_raised', 'founded_year', 'main_office', 'lat','lng']]
display(data_companies_clean.head())
data_companies_clean.shape

In [None]:
data_year = data_companies_clean[data_companies_clean['founded_year']>2006]
data_year.shape

In [None]:
data_web = data_year[data_year['category_code'] == 'web']
data_web.shape

In [None]:
data_final = data_web[data_web['total_money_raised'] != '$0']
data_final.shape

In [None]:
#The idea is to develop elaborated backend products to be sold to other web startup. other startups. Because of that, I only select young startups
#which have been funded and are not big enough to develop these products by their own. 

In [None]:
display(data_final.head())

In [None]:
data_final['main_office'][0]

In [None]:
data_final.to_json('/Users/alejandroiborralucas/Desktop/Git-iron/Project crunchbase/visualizing-real-world-data-project/oficinas.json', orient="records")

In [None]:
# def splitMoney(df): 
#     x = re.findall('([^\d+])', df)
#     return x
# data_companies_clean['Currency type'] = data_companies_clean['total_money_raised'].apply(splitMoney)

# #display(data_companies_clean['Currency type'].head())
# display(data_companies_clean.head())

In [None]:
# data_companies_clean['Currency type'].value_counts()

In [None]:
# values=[]
# a = ['$', '€', '£', 'C', 'k']
# for i in data_companies_clean['Currency type']:
#     if i[0] not in values:
#         values.append(i[0])
# print(values)

In [None]:
# #VOY POR AQUI. MONTAR FUNCION USANDO REGEX QUE BUSQUE M Y MULTIPLIQUE POR 1X10^6, ETC. + API PARA TIPOS DE CAMBIO

# def valueMultiplier(df):
#     x = re.findall('(\d+)', df)
    
#     return x

# data_companies_clean['Value Gross'] = data_companies_clean['total_money_raised'].apply(valueMultiplier)
    

In [None]:
#def findCurrency (m):
#    currency_dict = {
#        '$': 1
#        '€': 1.12
#        '£': 1.25
#        'C$': 0.76
#        'kr': 0.11
#    }
#for currency, value in findMonth.items():
#    if currency in m:
#        return int(value)
#    return 0 
#data_companies_clean['Currency type'] = data_companies_clean['total_money_raised'].apply(findCurrency)

In [None]:
# data_companies_clean['a'] = [data_companies_clean['total_money_raised'].str.split('$', expand=True)]

# display(data_companies_clean.head())

In [None]:
# def classifyCurrencies(df):
#     money_type = []
    
#     for i in splitCurrencies(df):
#         if i in currencies:
#             money_type.append(i)
#     return money_type

In [None]:
# #tipos de monedas en el dataset
# currencies_types=[]
# for i in data_companies_clean['total_money_raised']:
#     if i[0:2] not in currencies_types:
#         currencies_types.append(i[0:2])

# print(currencies_types)

In [None]:
# currency=[]
# for i in data_companies_clean['total_money_raised']:
#     currency.append(i[0:2])



In [None]:
# currency_type = pd.DataFrame({'Currency':currency})
# display(currency_type.head())

In [None]:
# def splitCurrencies(df):
#     split = df.str.split('')
#     return split




In [None]:
# def classifyCurrencies(df):
#     money_type = []
#     for i in splitCurrencies(df):
#         if i in currencies:
#             money_type.append(i)
#     return money_type

# classifyCurrencies(data_companies_clean[['total_money_raised']])

In [None]:
# data_companies_clean['currency']= data_companies_clean[['total_money_raised']].apply(splitCurrencies, result_type = 'expand', axis=1)


In [None]:
# display(data_companies_clean.head())