In [1]:
from pymongo import MongoClient
import pandas as pd
import pymongo
from pandas.io.json import json_normalize
import re
import numpy as np


In [2]:
#Connecting the database with the queried data (data_companies_clean)
client = MongoClient('mongodb://localhost:27017/')
db = client.companies
data = db.data_companies_clean


In [3]:
#Query to receive all the data filtering for companies that have at least 1 office.
one_office = db.companies.find({
    "offices":{
    "$not":{
    "$size":0
        }
    }
})



In [4]:
#Create first dataframe and show the columns we have
one_office = pd.DataFrame(one_office)
one_office.columns

Index(['_id', 'acquisition', 'acquisitions', 'alias_list', 'blog_feed_url',
       'blog_url', 'category_code', 'competitions', 'created_at',
       'crunchbase_url', 'deadpooled_day', 'deadpooled_month',
       'deadpooled_url', 'deadpooled_year', 'description', 'email_address',
       'external_links', 'founded_day', 'founded_month', 'founded_year',
       'funding_rounds', 'homepage_url', 'image', 'investments', 'ipo',
       'milestones', 'name', 'number_of_employees', 'offices', 'overview',
       'partners', 'permalink', 'phone_number', 'products', 'providerships',
       'relationships', 'screenshots', 'tag_list', 'total_money_raised',
       'twitter_username', 'updated_at', 'video_embeds'],
      dtype='object')

In [5]:
#Merge deadpool related columns into 1 and fill blanks with NaN values.
one_office['deadpooled'] = one_office[one_office.columns[10:13]].apply(lambda x: ','.join(x.dropna().astype(str)), axis=1).replace(r'^\s*$', np.nan, regex=True)


In [6]:
#Select relevant columns for the project
data = pd.DataFrame(one_office[['name', 'category_code', 'founded_year', 'number_of_employees', 'offices','total_money_raised', 'deadpooled']])


In [63]:
#Select alive companies. If they have 'deadpoled' data I understand they are dead. 
data = data[pd.isnull(data['deadpooled'])]
data.head()
 


Unnamed: 0,name,category_code,founded_year,number_of_employees,offices,total_money_raised,deadpooled
0,AdventNet,enterprise,1996.0,600.0,"[{'description': 'Headquarters', 'address1': '...",$0,
1,Zoho,software,2005.0,1600.0,"[{'description': 'Headquarters', 'address1': '...",$0,
2,Wetpaint,web,2005.0,47.0,"[{'description': '', 'address1': '710 - 2nd Av...",$39.8M,
4,Postini,web,1999.0,,"[{'description': None, 'address1': '959 Skyway...",$0,
5,Digg,news,2004.0,60.0,"[{'description': None, 'address1': '135 Missis...",$45M,


In [98]:
#Check info provided inside offices column
c = pd.DataFrame(json_normalize(data['offices'][4]))
c


Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code
0,"959 Skyway Road, Suite 200",,San Carlos,USA,,37.506885,-122.247573,CA,94070


In [86]:
data2 = data.copy()

In [103]:
len(data2)

13072

In [108]:

office_split = pd.DataFrame(data2.offices.tolist()).stack().reset_index(level=1, drop=True).rename('office')
office_merged = data2.merge(office_split, left_index=True, right_index=True)


In [109]:
office_merged

Unnamed: 0,name,category_code,founded_year,number_of_employees,offices,total_money_raised,deadpooled,office
0,AdventNet,enterprise,1996.0,600.0,"[{'description': 'Headquarters', 'address1': '...",$0,,"{'description': 'Headquarters', 'address1': '4..."
1,Zoho,software,2005.0,1600.0,"[{'description': 'Headquarters', 'address1': '...",$0,,"{'description': 'Headquarters', 'address1': '4..."
2,Wetpaint,web,2005.0,47.0,"[{'description': '', 'address1': '710 - 2nd Av...",$39.8M,,"{'description': '', 'address1': '710 - 2nd Ave..."
2,Wetpaint,web,2005.0,47.0,"[{'description': '', 'address1': '710 - 2nd Av...",$39.8M,,"{'description': '', 'address1': '270 Lafayette..."
4,Postini,web,1999.0,,"[{'description': None, 'address1': '959 Skyway...",$0,,"{'description': None, 'address1': '135 Mississ..."
5,Digg,news,2004.0,60.0,"[{'description': None, 'address1': '135 Missis...",$45M,,"{'description': 'Headquarters', 'address1': '9..."
6,Geni,web,2006.0,18.0,"[{'description': 'Headquarters', 'address1': '...",$16.5M,,"{'description': None, 'address1': '8536 Nation..."
7,Flektor,games_video,,,"[{'description': None, 'address1': '8536 Natio...",$0,,"{'description': '', 'address1': '407 N Maple D..."
8,Fox Interactive Media,web,1979.0,0.0,"[{'description': '', 'address1': '407 N Maple ...",$0,,"{'description': None, 'address1': None, 'addre..."
9,Gizmoz,web,2003.0,,"[{'description': None, 'address1': None, 'addr...",$18.1M,,"{'description': 'HQ', 'address1': '539 Bryant ..."


In [107]:
office_merged.reset_index()

Unnamed: 0,index,name,category_code,founded_year,number_of_employees,offices,total_money_raised,deadpooled,office
0,0,AdventNet,enterprise,1996.0,600.0,"[{'description': 'Headquarters', 'address1': '...",$0,,"{'description': 'Headquarters', 'address1': '4..."
1,1,Zoho,software,2005.0,1600.0,"[{'description': 'Headquarters', 'address1': '...",$0,,"{'description': 'Headquarters', 'address1': '4..."
2,2,Wetpaint,web,2005.0,47.0,"[{'description': '', 'address1': '710 - 2nd Av...",$39.8M,,"{'description': '', 'address1': '710 - 2nd Ave..."
3,2,Wetpaint,web,2005.0,47.0,"[{'description': '', 'address1': '710 - 2nd Av...",$39.8M,,"{'description': '', 'address1': '270 Lafayette..."
4,4,Postini,web,1999.0,,"[{'description': None, 'address1': '959 Skyway...",$0,,"{'description': None, 'address1': '135 Mississ..."
5,5,Digg,news,2004.0,60.0,"[{'description': None, 'address1': '135 Missis...",$45M,,"{'description': 'Headquarters', 'address1': '9..."
6,6,Geni,web,2006.0,18.0,"[{'description': 'Headquarters', 'address1': '...",$16.5M,,"{'description': None, 'address1': '8536 Nation..."
7,7,Flektor,games_video,,,"[{'description': None, 'address1': '8536 Natio...",$0,,"{'description': '', 'address1': '407 N Maple D..."
8,8,Fox Interactive Media,web,1979.0,0.0,"[{'description': '', 'address1': '407 N Maple ...",$0,,"{'description': None, 'address1': None, 'addre..."
9,9,Gizmoz,web,2003.0,,"[{'description': None, 'address1': None, 'addr...",$18.1M,,"{'description': 'HQ', 'address1': '539 Bryant ..."


In [101]:
len(office_merged)

15147

In [100]:
da = pd.DataFrame(json_normalize(ndf['office'][4]))
da

Unnamed: 0,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code
0,135 Mississippi St,,San Francisco,USA,,37.764726,-122.394523,CA,94107


In [9]:
#Function to convert the info within offices into columns.
def latlong(data):
    data = data['offices']
    principal = None
    if data[0]['latitude'] and data[0]['longitude']:                   #Check there is data
        principal = {
            "type":"Point",
            "coordinates":[data[0]['longitude'], data[0]['latitude']]
        }

    return {
        "totalOffices": len(data),
        "lat": data[0]['latitude'],
        "lng": data[0]['longitude'],
        "main_office (geoquery)": principal
    }

data_latlong = data[["offices"]].apply(latlong, result_type="expand", axis=1).dropna()


In [10]:
#Concatenate all data
data = pd.concat([data, data_latlong], axis = 1)


In [11]:
data = data[[x for x in data.columns if x !='deadpooled' and x != 'offices']]

In [12]:
data.head()

Unnamed: 0,name,category_code,founded_year,number_of_employees,total_money_raised,lat,lng,main_office (geoquery),totalOffices
0,AdventNet,enterprise,1996.0,600.0,$0,37.692934,-121.904945,"{'type': 'Point', 'coordinates': [-121.904945,...",1.0
1,Zoho,software,2005.0,1600.0,$0,37.692934,-121.904945,"{'type': 'Point', 'coordinates': [-121.904945,...",1.0
2,Wetpaint,web,2005.0,47.0,$39.8M,47.603122,-122.333253,"{'type': 'Point', 'coordinates': [-122.333253,...",2.0
4,Postini,web,1999.0,,$0,37.506885,-122.247573,"{'type': 'Point', 'coordinates': [-122.247573,...",1.0
5,Digg,news,2004.0,60.0,$45M,37.764726,-122.394523,"{'type': 'Point', 'coordinates': [-122.394523,...",1.0


In [20]:
a = json_normalize(data['main_office (geoquery)'][0])
a

Unnamed: 0,coordinates,type
0,"[-121.904945, 37.692934]",Point


In [17]:
data.dtypes

name                       object
category_code              object
founded_year              float64
number_of_employees       float64
total_money_raised         object
lat                       float64
lng                       float64
main_office (geoquery)     object
totalOffices              float64
dtype: object

In [20]:
data.fillna(0).head()

Unnamed: 0,name,category_code,founded_year,number_of_employees,total_money_raised,lat,lng,main_office (geoquery),totalOffices
0,AdventNet,enterprise,1996.0,600.0,$0,37.692934,-121.904945,"{'type': 'Point', 'coordinates': [-121.904945,...",1.0
1,Zoho,software,2005.0,1600.0,$0,37.692934,-121.904945,"{'type': 'Point', 'coordinates': [-121.904945,...",1.0
2,Wetpaint,web,2005.0,47.0,$39.8M,47.603122,-122.333253,"{'type': 'Point', 'coordinates': [-122.333253,...",2.0
4,Postini,web,1999.0,0.0,$0,37.506885,-122.247573,"{'type': 'Point', 'coordinates': [-122.247573,...",1.0
5,Digg,news,2004.0,60.0,$45M,37.764726,-122.394523,"{'type': 'Point', 'coordinates': [-122.394523,...",1.0


In [21]:
data['founded_year'].astype(int)

ValueError: Cannot convert non-finite values (NA or inf) to integer

In [9]:
def fixints(col):
    data[col].fillna(0, inplace = True)
    data[col] = data[col].astype(int)
    return data[col]

data = fixints ('founded_year')

0    1996
1    2005
2    2005
3    2005
4    1999
Name: founded_year, dtype: int64

In [10]:
# data_cols = data_companies_1office[['alias_list', 'category_code','description', 'total_money_raised','founded_year','offices', 'deadpooled_year', 'deadpooled_day', 'deadpooled_month',
#        'deadpooled_url' ]]

In [10]:
def latlong(data):
    data = data['offices']
#    return (len(data),data[0]['latitude'],data[0]['longitude'])

    # Only create the geoJSON object if all geodata is available
    principal = None
    if data[0]['latitude'] and data[0]['longitude']:
        principal = {
            "type":"Point",
            "coordinates":[data[0]['longitude'], data[0]['latitude']]
        }

    return {
        "totalOffices": len(data),
        "lat": data[0]['latitude'],
        "lng": data[0]['longitude'],
        "main_office": principal
    }


data_latlong = data_cols[["offices"]].apply(latlong, result_type="expand", axis=1)

In [11]:
data_latlong.dropna(inplace = True)

In [12]:
display(data_latlong.head())


Unnamed: 0,lat,lng,main_office,totalOffices
0,37.692934,-121.904945,"{'type': 'Point', 'coordinates': [-121.904945,...",1.0
1,37.692934,-121.904945,"{'type': 'Point', 'coordinates': [-121.904945,...",1.0
2,47.603122,-122.333253,"{'type': 'Point', 'coordinates': [-122.333253,...",2.0
4,37.506885,-122.247573,"{'type': 'Point', 'coordinates': [-122.247573,...",1.0
5,37.764726,-122.394523,"{'type': 'Point', 'coordinates': [-122.394523,...",1.0


In [13]:
# def latlong(df):
#     df = df['offices']
#     for l in df:
#         if l['latitude'] and l['longitude']:
#             return {
#                     "lat": l['latitude'],
#                     "long": l['longitude']
                    
#                 }

# first_office = data_cols[['offices']].apply(latlong, result_type = "expand", axis=1)
# display(first_office.head())

In [14]:
data_office = pd.concat([data_cols, data_latlong],axis=1)
display(data_office.head())
data_office.shape

Unnamed: 0,alias_list,category_code,description,total_money_raised,founded_year,offices,deadpooled_year,deadpooled_day,deadpooled_month,deadpooled_url,lat,lng,main_office,totalOffices
0,Zoho ManageEngine,enterprise,Server Management Software,$0,1996,"[{'description': 'Headquarters', 'address1': '...",2.0,,,,37.692934,-121.904945,"{'type': 'Point', 'coordinates': [-121.904945,...",1.0
1,,software,Online Business Apps Suite,$0,2005,"[{'description': 'Headquarters', 'address1': '...",3.0,,,,37.692934,-121.904945,"{'type': 'Point', 'coordinates': [-121.904945,...",1.0
2,,web,Technology Platform Company,$39.8M,2005,"[{'description': '', 'address1': '710 - 2nd Av...",1.0,,,,47.603122,-122.333253,"{'type': 'Point', 'coordinates': [-122.333253,...",2.0
3,,network_hosting,,$800k,2005,"[{'description': '', 'address1': 'Suite 200', ...",2008.0,15.0,9.0,,,,,
4,,web,,$0,1999,"[{'description': None, 'address1': '959 Skyway...",,,,,37.506885,-122.247573,"{'type': 'Point', 'coordinates': [-122.247573,...",1.0


(13744, 14)

In [15]:
data_office.dropna(subset=['lat','lng'], inplace = True)
data_office.shape


(9618, 14)

In [16]:
# data_companies = data_latlong[['alias_list', 'category_code', 'description', 'total_money_raised', 'founded_year', 'deadpooled_year', 'deadpooled_day', 'deadpooled_month',
#        'deadpooled_url', 'lat', 'long']]
# data_companies.shape

In [17]:
#drop notnull values in deadpool = if it has a value it means the startup is dead.
data2 = data_office[pd.isnull(data_office['deadpooled_year'])]
data3 = data2[pd.isnull(data_office['deadpooled_day'])]
data4 = data3[pd.isnull(data_office['deadpooled_month'])]
data_deads = data4[pd.isnull(data_office['deadpooled_url'])]


data2.shape

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


(8983, 14)

In [18]:
display(data_deads.head())

Unnamed: 0,alias_list,category_code,description,total_money_raised,founded_year,offices,deadpooled_year,deadpooled_day,deadpooled_month,deadpooled_url,lat,lng,main_office,totalOffices
4,,web,,$0,1999,"[{'description': None, 'address1': '959 Skyway...",,,,,37.506885,-122.247573,"{'type': 'Point', 'coordinates': [-122.247573,...",1.0
5,,news,user driven social content website,$45M,2004,"[{'description': None, 'address1': '135 Missis...",,,,,37.764726,-122.394523,"{'type': 'Point', 'coordinates': [-122.394523,...",1.0
6,,web,Geneology social network site,$16.5M,2006,"[{'description': 'Headquarters', 'address1': '...",,,,,34.090368,-118.393064,"{'type': 'Point', 'coordinates': [-118.393064,...",1.0
7,,games_video,,$0,0,"[{'description': None, 'address1': '8536 Natio...",,,,,34.025958,-118.379768,"{'type': 'Point', 'coordinates': [-118.379768,...",1.0
9,,web,Photo animation,$18.1M,2003,"[{'description': None, 'address1': None, 'addr...",,,,,37.48413,-122.169472,"{'type': 'Point', 'coordinates': [-122.169472,...",1.0


In [19]:
data_companies_clean = data_deads[['alias_list', 'category_code', 'description', 'total_money_raised', 'founded_year', 'main_office', 'lat','lng']]
display(data_companies_clean.head())
data_companies_clean.shape

Unnamed: 0,alias_list,category_code,description,total_money_raised,founded_year,main_office,lat,lng
4,,web,,$0,1999,"{'type': 'Point', 'coordinates': [-122.247573,...",37.506885,-122.247573
5,,news,user driven social content website,$45M,2004,"{'type': 'Point', 'coordinates': [-122.394523,...",37.764726,-122.394523
6,,web,Geneology social network site,$16.5M,2006,"{'type': 'Point', 'coordinates': [-118.393064,...",34.090368,-118.393064
7,,games_video,,$0,0,"{'type': 'Point', 'coordinates': [-118.379768,...",34.025958,-118.379768
9,,web,Photo animation,$18.1M,2003,"{'type': 'Point', 'coordinates': [-122.169472,...",37.48413,-122.169472


(8859, 8)

In [20]:
data_year = data_companies_clean[data_companies_clean['founded_year']>2006]
data_year.shape

(2917, 8)

In [21]:
data_web = data_year[data_year['category_code'] == 'web']
data_web.shape

(813, 8)

In [22]:
data_final = data_web[data_web['total_money_raised'] != '$0']
data_final.shape

(154, 8)

In [23]:
#The idea is to develop elaborated backend products to be sold to other web startup. other startups. Because of that, I only select young startups
#which have been funded and are not big enough to develop these products by their own. 

In [24]:
display(data_final.head())

Unnamed: 0,alias_list,category_code,description,total_money_raised,founded_year,main_office,lat,lng
28,,web,Life long learning content,$21M,2007,"{'type': 'Point', 'coordinates': [-118.487267,...",34.017606,-118.487267
64,,web,,$100k,2007,"{'type': 'Point', 'coordinates': [-119.306607,...",37.269175,-119.306607
69,,web,Online Community and Discussion,$12.7M,2007,"{'type': 'Point', 'coordinates': [-119.306607,...",37.269175,-119.306607
92,,web,Social network for desktop software,$1M,2007,"{'type': 'Point', 'coordinates': [4.8948623, 5...",52.374523,4.894862
122,,web,Video guide for hotels,$13.8M,2007,"{'type': 'Point', 'coordinates': [2.350987, 48...",48.856667,2.350987


In [25]:
data_final['main_office'][0]

KeyError: 0

In [68]:
data_final.to_json('/Users/alejandroiborralucas/Desktop/Git-iron/Project crunchbase/visualizing-real-world-data-project/oficinas.json', orient="records")

In [19]:
# def splitMoney(df): 
#     x = re.findall('([^\d+])', df)
#     return x
# data_companies_clean['Currency type'] = data_companies_clean['total_money_raised'].apply(splitMoney)

# #display(data_companies_clean['Currency type'].head())
# display(data_companies_clean.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,alias_list,category_code,description,total_money_raised,founded_year,lat,lng,Currency type
4,,web,,$0,1999,37.506885,-122.247573,[$]
5,,news,user driven social content website,$45M,2004,37.764726,-122.394523,"[$, M]"
6,,web,Geneology social network site,$16.5M,2006,34.090368,-118.393064,"[$, ., M]"
7,,games_video,,$0,0,34.025958,-118.379768,[$]
9,,web,Photo animation,$18.1M,2003,37.48413,-122.169472,"[$, ., M]"


In [20]:
# data_companies_clean['Currency type'].value_counts()

[$]             5927
[$, ., M]       1421
[$, M]           991
[$, k]           328
[€, k]            50
[€, ., M]         44
[€, M]            36
[£, ., M]         16
[£, k]            14
[£, M]            10
[C, $, ., M]       6
[$, ., B]          5
[C, $, k]          4
[C, $, M]          4
[$, B]             1
[$, ., k]          1
[k, r, M]          1
Name: Currency type, dtype: int64

In [21]:
# values=[]
# a = ['$', '€', '£', 'C', 'k']
# for i in data_companies_clean['Currency type']:
#     if i[0] not in values:
#         values.append(i[0])
# print(values)

['$', '€', '£', 'C', 'k']


In [37]:
# #VOY POR AQUI. MONTAR FUNCION USANDO REGEX QUE BUSQUE M Y MULTIPLIQUE POR 1X10^6, ETC. + API PARA TIPOS DE CAMBIO

# def valueMultiplier(df):
#     x = re.findall('(\d+)', df)
    
#     return x

# data_companies_clean['Value Gross'] = data_companies_clean['total_money_raised'].apply(valueMultiplier)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [50]:
#def findCurrency (m):
#    currency_dict = {
#        '$': 1
#        '€': 1.12
#        '£': 1.25
#        'C$': 0.76
#        'kr': 0.11
#    }
#for currency, value in findMonth.items():
#    if currency in m:
#        return int(value)
#    return 0 
#data_companies_clean['Currency type'] = data_companies_clean['total_money_raised'].apply(findCurrency)

In [69]:
# data_companies_clean['a'] = [data_companies_clean['total_money_raised'].str.split('$', expand=True)]

# display(data_companies_clean.head())

In [None]:
# def classifyCurrencies(df):
#     money_type = []
    
#     for i in splitCurrencies(df):
#         if i in currencies:
#             money_type.append(i)
#     return money_type

In [28]:
# #tipos de monedas en el dataset
# currencies_types=[]
# for i in data_companies_clean['total_money_raised']:
#     if i[0:2] not in currencies_types:
#         currencies_types.append(i[0:2])

# print(currencies_types)

['$0', '$4', '$1', '$2', '$7', '$5', '$6', '$3', '$8', '$9', '€1', '£3', '€3', '€5', '€4', '£2', 'C$', '€8', '€2', '€9', '£1', '£4', '€6', '€7', '£5', '£7', 'kr', '£9', '£6', '£8']


In [49]:
# currency=[]
# for i in data_companies_clean['total_money_raised']:
#     currency.append(i[0:2])



In [48]:
# currency_type = pd.DataFrame({'Currency':currency})
# display(currency_type.head())

Unnamed: 0,Currency
0,$0
1,$4
2,$1
3,$0
4,$0


In [30]:
# def splitCurrencies(df):
#     split = df.str.split('')
#     return split




In [31]:
# def classifyCurrencies(df):
#     money_type = []
#     for i in splitCurrencies(df):
#         if i in currencies:
#             money_type.append(i)
#     return money_type

# classifyCurrencies(data_companies_clean[['total_money_raised']])

AttributeError: 'DataFrame' object has no attribute 'str'

In [32]:
# data_companies_clean['currency']= data_companies_clean[['total_money_raised']].apply(splitCurrencies, result_type = 'expand', axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [33]:
# display(data_companies_clean.head())

Unnamed: 0,alias_list,category_code,description,total_money_raised,founded_year,lat,long,currency
4,,web,,$0,1999,37.506885,-122.247573,"[, $, 0, ]"
5,,news,user driven social content website,$45M,2004,37.764726,-122.394523,"[, $, 4, 5, M, ]"
6,,web,Geneology social network site,$16.5M,2006,34.090368,-118.393064,"[, $, 1, 6, ., 5, M, ]"
7,,games_video,,$0,0,34.025958,-118.379768,"[, $, 0, ]"
8,,web,,$0,1979,34.076179,-118.39417,"[, $, 0, ]"


[]
