# Primary Claim, Part I: Company Count

## Importing Dependencies

In [None]:
'''
pandas will be our data manipulation module
'''
import pandas as pd
pd.set_option('display.max_columns', None, 'display.max_rows', 200)

'''
numpy will be our array computing module
'''
import numpy as np

'''
tqdm allows us to easily add progress bars to our processes
'''
from tqdm import tqdm

'''
display will allow us to easily display custom data types like dataframes
'''
from IPython.display import display

'''
built-in python modules
'''
import os
import string

## Loading in Data

In [3]:
organizations = pd.read_csv('./organizations/organizations_filtered_cl.csv')
organizations = organizations.drop(columns=['country_code', 'state_code', 'category_groups_list', 'primary_role'])
organizations.dropna(subset=['founded_on'])
organizations

Unnamed: 0,uuid,name,status,category_list,num_funding_rounds,total_funding_usd,founded_on,last_funding_on
0,74a20af3-f4dd-6188-de60-c4ee6cd0ca4a,Ant Group,operating,"Banking,Financial Services,FinTech,Payments",4.0,2.200000e+10,2014-10-01,2018-06-08
1,ba2dfa91-ce6d-2347-3b74-4ffdffa1b7ee,Gazprom,ipo,"Business Intelligence,Energy,Energy Efficiency...",3.0,1.476479e+10,1989-01-01,2019-12-24
2,47f9688f-00a9-23f9-3a64-179bc6dd31d4,ByteDance,operating,"Artificial Intelligence,Content,Data Mining,Ma...",12.0,9.405000e+09,2012-03-01,2020-12-11
3,416156d2-f0de-7c42-8303-6eaeab697c26,Robinhood,operating,"Cryptocurrency,Financial Services,FinTech,Stoc...",23.0,5.574325e+09,2013-01-01,2021-02-01
4,fc254a10-f558-2813-9435-45c34bd6ec3b,Epic Games,operating,"Developer Platform,Gaming,Software,Video Games",8.0,5.110000e+09,1991-01-01,2021-04-13
...,...,...,...,...,...,...,...,...
37251,3947e7b5-8a6b-4fb1-bd7f-d8f5507401d4,Get Help Tax and Bookkeeping,operating,"Accounting,Consulting,Finance,Financial Servic...",,,,
37252,ee15f0de-46a3-49a7-9ddd-d9738bb92f84,ottomon,operating,"Augmented Reality,Internet of Things",,,2017-12-27,
37253,2708485a-8af1-4f77-a244-597603ace22d,Mitivate,operating,"Analytics,Health Care,Machine Learning",,,,
37254,1773734f-75b9-448f-8e25-25ac9011ee98,Futura VR Studio,operating,"Electronics,Information Technology,Video,Video...",,,2016-01-01,


## Planning Data Structures

### Gauging Number of Distinct Years

In [4]:
last_funded_list = organizations.founded_on.tolist()
distinct_years = []

for i in last_funded_list:
    year = str(i)[:4]
    if year not in distinct_years and year != 'nan':
        distinct_years.append(year)

distinct_years.sort()

print(f'There are {len(distinct_years)} distinct years.')
print(f'The min year is {min(distinct_years)}.')
print(f'The max year is {max(distinct_years)}.')

There are 181 distinct years.
The min year is 1472.
The max year is 2021.


### Filtering Out Irrelevant Years

Despite the large range of years in the founded_on column, we will only use 2016-2020, since the growth in the years prior is irrelevant in predicting the future success of a vertical. We will not be using new companies in 2021, since the year is not over and therefore the data is incomplete.

In [5]:
years = [*range(2016, 2021, 1)]
organizations = organizations.loc[ (organizations['founded_on'] > '2015-12-31') & (organizations['founded_on'] < '2021-01-01') ]
organizations = organizations.reset_index(drop=True)
organizations.index.size

8258

### Setting Up Data Structure with Appropriate Years and Verticals

We will create a dictionary in the format {'vertical': {1983: company_count, 1984: company_count...}}

In [6]:
verticals = ['Machine Learning', 'FinTech', 'Artificial Intelligence', 'Video Games',
            'Renewable Energy', 'Cyber Security', 'Social Network', 'Virtual Reality',
            'Cryptocurrency', 'Augmented Reality']

# verticals = ['Machine Learning', 'FinTech', 'Artificial Intelligence', 'Video Games',
#             'Renewable Energy', 'Cyber Security', 'Social Network', 'Virtual Reality',
#             'Cryptocurrency', 'Augmented Reality', 'Information Technology', 'Biotechnology',
#             'Social Media', 'Telecommunications', 'Cloud Computing', 'EdTech',
#             'Network Security', 'Cloud Data Services', 'Semiconductor', 'Predictive Analytics']

vertical_growth = {}

for i in verticals:
    vertical_growth[i] = {}

for i in vertical_growth:
    for j in years:
        vertical_growth[i][j] = 0

vertical_growth['Machine Learning']

{2016: 0, 2017: 0, 2018: 0, 2019: 0, 2020: 0}

Now, we just need to iterate through the dataframe and update the year counts.

## Updating Year Counts per Vertical

Process:
- iterate through every entry in the dataframe
    - iterate through every vertical in verticals
        - check if category_list contains vertical
            - grab the year off of founded_on and increment the count (vertical_growth[vertical][year] += 1)


In [7]:
for i in tqdm(organizations.index.values, desc='Updating Counts', unit=' entries'):
    for j in verticals:
        if j in str( organizations.iloc[i]['category_list'] ).split(','):
            if str( organizations.iloc[i]['founded_on'] )[:4] != 'nan':
                vertical_growth[ j ][ int( str( organizations.iloc[i]['founded_on'] )[:4] ) ] += 1

Updating Counts: 100%|██████████| 8258/8258 [00:35<00:00, 229.45 entries/s]


### Visualizing our Results as a Dataframe

In [8]:
pd.DataFrame.from_dict(vertical_growth)

Unnamed: 0,Machine Learning,FinTech,Artificial Intelligence,Video Games,Renewable Energy,Cyber Security,Social Network,Virtual Reality,Cryptocurrency,Augmented Reality
2016,967,1116,1549,181,136,357,42,418,168,239
2017,833,923,1376,114,84,265,20,211,515,156
2018,190,244,311,34,20,63,6,41,168,37
2019,22,34,42,6,3,8,1,4,6,3
2020,5,12,10,1,2,9,0,2,0,1


# Primary Claim, Part II: Ratio Computing

## Loading in Funding Rounds Data

In [9]:
rounds = pd.read_csv('./funding_rounds/merged_orgfr_cl.csv')
# rounds = pd.read_csv('./funding_rounds/merged_orgfr_clext.csv')
rounds = rounds.drop(columns=['Unnamed: 0', 'country_code', 'state_code', 'status', 'category_groups_list',
                            'primary_role'])
rounds = rounds.reset_index(drop=True)
rounds

Unnamed: 0,uuid,name,category_list,num_funding_rounds,total_funding_usd,founded_on,investor_count,last_funding_on,investment_type,announced_on,raised_amount_usd
0,416156d2-f0de-7c42-8303-6eaeab697c26,Robinhood,"Cryptocurrency,Financial Services,FinTech,Stoc...",23.0,5.574325e+09,2013-01-01,15.0,2021-02-01,seed,2013-12-01,3000000
1,fc254a10-f558-2813-9435-45c34bd6ec3b,Epic Games,"Developer Platform,Gaming,Software,Video Games",8.0,5.110000e+09,1991-01-01,16.0,2021-04-13,seed,2021-04-13,1000000000
2,2cc3a5de-2303-aa00-cd1a-50bd96420392,Klarna,"E-Commerce,Finance,FinTech,Payments",25.0,3.090713e+09,2005-02-01,1.0,2021-05-20,pre_seed,2005-02-01,78212
3,6f83ddd7-d637-61f8-06b2-438a0037605f,Stripe,"Finance,FinTech,Mobile Payments,SaaS",15.0,2.235000e+09,2010-01-01,6.0,2021-03-14,seed,2011-03-28,2000000
4,900e276f-746c-3883-9535-2501c38db939,Preferred Networks,"Artificial Intelligence,Biotechnology,Internet...",7.0,2.052376e+09,2014-03-26,1.0,2019-06-25,seed,2014-10-01,1800000
...,...,...,...,...,...,...,...,...,...,...,...
7041,b6240739-c30a-4a54-8808-b546665a401d,Crypto Cashout,"Bitcoin,Blockchain,Cryptocurrency,Ethereum,Fin...",1.0,1.000000e+04,2018-01-01,1.0,2018-05-22,pre_seed,2018-05-22,10000
7042,d3a2b15f-c828-92cd-bbab-5e8423173ddd,Pin Your Client,"Analytics,Artificial Intelligence,Information ...",1.0,9.801000e+03,2014-01-01,2.0,2014-07-19,seed,2014-07-19,9801
7043,1b15d1b2-035a-058a-c968-04d71f974407,RuangLaptop,"Computer,Internet,Mobile,Video Games",2.0,8.014000e+03,2015-10-01,1.0,2019-10-01,seed,2019-10-01,2823
7044,fe4a843e-7c86-599a-abf5-45a61f557827,Coyno,"Accounting,Bitcoin,FinTech,Software",1.0,3.116000e+03,2014-09-19,1.0,2014-11-27,seed,2014-11-27,3116


## Filtering out Irrelevant Founded-On and Announced-On Dates

In [10]:
rounds = rounds.loc[ (rounds['founded_on'] > '2015-12-31') & (rounds['founded_on'] < '2021-01-01') ]
rounds = rounds.loc[ (rounds['announced_on'] > '2015-12-31') & (rounds['announced_on'] < '2021-01-01') ]
rounds = rounds.reset_index(drop=True)
rounds.index.size

2502

## Setting up Data Structure by Vertical, Year, and Investment Type

In [11]:
vertical_investment = {}

for i in verticals:
    vertical_investment[i] = {}

for i in verticals:
    for j in years:
        vertical_investment[i][j] = {}

for i in verticals:
    for j in years:
        vertical_investment[i][j]['seed'] = 0
        vertical_investment[i][j]['pre_seed'] = 0

vertical_investment['Machine Learning']

{2016: {'seed': 0, 'pre_seed': 0},
 2017: {'seed': 0, 'pre_seed': 0},
 2018: {'seed': 0, 'pre_seed': 0},
 2019: {'seed': 0, 'pre_seed': 0},
 2020: {'seed': 0, 'pre_seed': 0}}

## Updating Seed/Pre-Seed Funding Rounds Count

In [12]:
for i in tqdm(rounds.index.values, desc='Updating Counts', unit=' entries'):
    for j in verticals:
        if j in str( rounds.iloc[i]['category_list'] ).split(','):
            vertical_investment[j][ int(str(rounds.iloc[i]['announced_on'])[:4]) ][str( rounds.iloc[i]['investment_type'] )] += 1

Updating Counts: 100%|██████████| 2502/2502 [00:10<00:00, 229.06 entries/s]


### Visualizing our Results as a Dataframe

In [13]:
pd.DataFrame.from_dict({(i,j): vertical_investment[i][j] for i in vertical_investment.keys() 
                        for j in vertical_investment[i].keys()}).fillna(0)

Unnamed: 0_level_0,Machine Learning,Machine Learning,Machine Learning,Machine Learning,Machine Learning,FinTech,FinTech,FinTech,FinTech,FinTech,Artificial Intelligence,Artificial Intelligence,Artificial Intelligence,Artificial Intelligence,Artificial Intelligence,Video Games,Video Games,Video Games,Video Games,Video Games,Renewable Energy,Renewable Energy,Renewable Energy,Renewable Energy,Renewable Energy,Cyber Security,Cyber Security,Cyber Security,Cyber Security,Cyber Security,Social Network,Social Network,Social Network,Social Network,Social Network,Virtual Reality,Virtual Reality,Virtual Reality,Virtual Reality,Virtual Reality,Cryptocurrency,Cryptocurrency,Cryptocurrency,Cryptocurrency,Cryptocurrency,Augmented Reality,Augmented Reality,Augmented Reality,Augmented Reality,Augmented Reality
Unnamed: 0_level_1,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020
seed,87,190,181,125,74,65,166,167,116,67,124,276,280,170,109,11,16,17,11,8,5,15,12,11,1,7,34,51,31,13,0,3,3,2,1,22,30,26,21,12,7,22,45,13,7,14,29,25,30,14
pre_seed,13,49,54,20,15,15,37,59,32,7,30,75,83,44,20,0,6,9,1,0,1,5,2,1,0,6,8,13,6,1,1,1,2,1,0,9,9,12,0,1,1,4,13,9,1,6,9,11,2,2


## Setting up Data Structure for Funding Rounds:New Companies, and Computing Ratios

In [14]:
vertical_ratios = {}

for i in verticals:
    vertical_ratios[i] = {}

for i in verticals:
    for j in years:
        vertical_ratios[i][j] = {}

for i in verticals:
    for j in years:
        if vertical_growth[i][j] != 0:
            vertical_ratios[i][j]['Seed Ratio'] = round(vertical_investment[i][j]['seed']/vertical_growth[i][j], 2)
            vertical_ratios[i][j]['Pre-Seed Ratio'] = round(vertical_investment[i][j]['pre_seed']/vertical_growth[i][j], 2)
        else:
            vertical_ratios[i][j]['Seed Ratio'] = 0.00
            vertical_ratios[i][j]['Pre-Seed Ratio'] = 0.00

pd.DataFrame.from_dict({(i,j): vertical_ratios[i][j] for i in vertical_ratios.keys() 
                        for j in vertical_ratios[i].keys()}).fillna(0)

Unnamed: 0_level_0,Machine Learning,Machine Learning,Machine Learning,Machine Learning,Machine Learning,FinTech,FinTech,FinTech,FinTech,FinTech,Artificial Intelligence,Artificial Intelligence,Artificial Intelligence,Artificial Intelligence,Artificial Intelligence,Video Games,Video Games,Video Games,Video Games,Video Games,Renewable Energy,Renewable Energy,Renewable Energy,Renewable Energy,Renewable Energy,Cyber Security,Cyber Security,Cyber Security,Cyber Security,Cyber Security,Social Network,Social Network,Social Network,Social Network,Social Network,Virtual Reality,Virtual Reality,Virtual Reality,Virtual Reality,Virtual Reality,Cryptocurrency,Cryptocurrency,Cryptocurrency,Cryptocurrency,Cryptocurrency,Augmented Reality,Augmented Reality,Augmented Reality,Augmented Reality,Augmented Reality
Unnamed: 0_level_1,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020
Seed Ratio,0.09,0.23,0.95,5.68,14.8,0.06,0.18,0.68,3.41,5.58,0.08,0.2,0.9,4.05,10.9,0.06,0.14,0.5,1.83,8.0,0.04,0.18,0.6,3.67,0.5,0.02,0.13,0.81,3.88,1.44,0.0,0.15,0.5,2.0,0.0,0.05,0.14,0.63,5.25,6.0,0.04,0.04,0.27,2.17,0.0,0.06,0.19,0.68,10.0,14.0
Pre-Seed Ratio,0.01,0.06,0.28,0.91,3.0,0.01,0.04,0.24,0.94,0.58,0.02,0.05,0.27,1.05,2.0,0.0,0.05,0.26,0.17,0.0,0.01,0.06,0.1,0.33,0.0,0.02,0.03,0.21,0.75,0.11,0.02,0.05,0.33,1.0,0.0,0.02,0.04,0.29,0.0,0.5,0.01,0.01,0.08,1.5,0.0,0.03,0.06,0.3,0.67,2.0
