# Claim 1 V2

In [2]:
'''
pandas will be our data manipulation module
'''
import pandas as pd
pd.set_option('display.max_columns', None, 'display.max_rows', 200)

'''
numpy will be our array computing module
'''
import numpy as np

'''
tqdm allows us to easily add progress bars to our processes
'''
from tqdm import tqdm

'''
display will allow us to easily display custom data types like dataframes
'''
from IPython.display import display

'''
built-in python modules
'''
import os
import string

# Loading Dataframes

In [4]:
# Load the filtered organizations export and drop the columns this analysis never reads.
organizations = pd.read_csv('./Orgs/organizations_filtered_v2.csv')
organizations = organizations.drop(columns=['country_code', 'state_code', 'status',
                                            'num_funding_rounds', 'primary_role'])
# dropna returns a new frame instead of modifying the caller, so the result has to be
# assigned back; otherwise rows without a founding date are silently kept.
organizations = organizations.dropna(subset=['founded_on'])
organizations

Unnamed: 0,uuid,name,category_list,category_groups_list,total_funding_usd,founded_on,last_funding_on
0,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,"Cloud Computing,Collaboration,CRM,Developer To...","Administrative Services,Information Technology...",,1996-09-15,
1,f4d5ab44-058b-298b-ea81-380e6e9a8eec,Omidyar Network,"Enterprise Software,Financial Services,Venture...","Financial Services,Lending and Investments,Sof...",,2004-01-01,
2,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,"Mobile Apps,Photo Sharing,Social Media,Social ...","Apps,Content and Publishing,Internet Services,...",1.612282e+10,2004-02-04,2014-10-20
3,7ca12f7a-2f8e-48b4-a8d1-1a33a0e275b9,Trinity Ventures,"Finance,SaaS,Venture Capital","Financial Services,Lending and Investments,Sof...",,1986-01-01,
4,b08efc27-da40-505a-6f9d-c9e14247bf36,Accel,"Finance,Financial Services,Venture Capital","Financial Services,Lending and Investments",,1983-01-01,
...,...,...,...,...,...,...,...
487727,8cbdf36e-82ad-43d2-8f68-2aee9d52de0e,Silvercore Partners,"Real Estate,Venture Capital","Financial Services,Lending and Investments,Rea...",,2018-01-02,
487728,fed1a94f-cec1-42eb-a2e1-906fc199dbf3,FoodsPass,"FinTech,Food and Beverage,Personal Finance","Financial Services,Food and Beverage",,2021-04-05,
487729,9b977e53-f9b5-49f5-8f73-732acdbdd21f,Jenova,"Artificial Intelligence,Financial Services,Mac...","Artificial Intelligence,Data and Analytics,Fin...",,2019-01-01,2020-03-01
487730,10bde81f-c45c-4c7d-911d-db8876f55798,Velocitee Labs,"Manufacturing,Security,Transportation","Manufacturing,Privacy and Security,Transportation",,2020-01-01,


# Planning Data Structures

In [5]:
# Collect the distinct founding years (as 4-character strings) found in founded_on.
# NOTE: despite its name, last_funded_list holds founded_on values.
last_funded_list = organizations.founded_on.tolist()

# De-duplicate through a set so membership is not re-scanned for every row.
# Missing dates stringify to 'nan', which is removed before sorting.
distinct_years = sorted({str(i)[:4] for i in last_funded_list} - {'nan'})

# Years are strings here, so min/max compare lexicographically.
print(f'There are {len(distinct_years)} distinct years.')
print(f'The min year is {min(distinct_years)}.')
print(f'The max year is {max(distinct_years)}.')

There are 320 distinct years.
The min year is 1066.
The max year is 2021.


Despite the large range of years in the founded_on column, we will only use 2016-2020, since the growth in the years prior is irrelevant in predicting the future success of a vertical. We will not be using new companies in 2021, since the year is not over and therefore the data is incomplete.

In [6]:
# Years tracked by the analysis: 2016 through 2020 inclusive.
years = list(range(2016, 2021))

# Keep only companies whose founded_on string falls strictly between the two bounds,
# then renumber the rows from zero.
founded_on = organizations['founded_on']
in_window = founded_on.gt('2015-12-31') & founded_on.lt('2021-01-01')
organizations = organizations.loc[in_window].reset_index(drop=True)

We will create a dictionary in the format {'vertical': {2016: company_count, 2017: company_count, ...}}, with one entry per year from 2016 to 2020.

In [7]:
# Verticals (category groups) whose yearly company counts we want to track.
verticals = [
    'Privacy and Security',
    'Mobile',
    'Financial Services',
    'Software',
    'Artificial Intelligence',
    'Information Technology',
    'Navigation and Mapping',
]

# vertical -> {year -> company count}; every tracked year starts at zero.
vertical_growth = {vertical: dict.fromkeys(years, 0) for vertical in verticals}

Now, we just need to iterate through the dataframe and update the year counts.

# Updating Year Counts per Vertical

Process:
- iterate through every entry in the dataframe
    - iterate through every vertical in verticals
        - check if category_groups_list contains vertical
            - grab the year off of founded_on and increment the count (vertical_growth[vertical][year] += 1)


In [8]:
# Tally, per vertical, how many companies were founded in each year.
# Walking the two needed columns directly avoids building a full row Series with
# .iloc for every (row, vertical) pair, and no longer relies on the index labels
# happening to equal row positions.
# NOTE: counts are added to whatever vertical_growth already holds, so re-running
# this cell without re-initialising the dictionary double-counts.
for groups, founded in zip(organizations['category_groups_list'],
                           organizations['founded_on']):
    year_text = str(founded)[:4]
    if year_text == 'nan':
        continue  # missing founding date: nothing to count
    # Split once per row; a missing group list stringifies to 'nan' and matches nothing.
    group_names = set(str(groups).split(','))
    for vertical in verticals:
        if vertical in group_names:
            vertical_growth[vertical][int(year_text)] += 1

Next, we get rid of years with count 0.

In [9]:
# Remove, in place, every year whose count is still zero for a given vertical.
for counts in vertical_growth.values():
    # Collect the keys first so the dict is not resized while being iterated.
    empty_years = [year for year, total in counts.items() if total == 0]
    for year in empty_years:
        counts.pop(year)

In [10]:
# Show the nested {vertical: {year: count}} dictionary after zero-count years were removed.
vertical_growth

{'Privacy and Security': {2016: 1242,
  2017: 1189,
  2018: 992,
  2019: 588,
  2020: 364},
 'Mobile': {2016: 4083, 2017: 3261, 2018: 2478, 2019: 1392, 2020: 783},
 'Financial Services': {2016: 6257,
  2017: 6818,
  2018: 5764,
  2019: 3466,
  2020: 2168},
 'Software': {2016: 18152, 2017: 17701, 2018: 14585, 2019: 9446, 2020: 5758},
 'Artificial Intelligence': {2016: 3110,
  2017: 3583,
  2018: 2977,
  2019: 1925,
  2020: 965},
 'Information Technology': {2016: 10575,
  2017: 10027,
  2018: 8268,
  2019: 5242,
  2020: 2889},
 'Navigation and Mapping': {2016: 458,
  2017: 379,
  2018: 265,
  2019: 147,
  2020: 39}}

In [11]:
# Tabulate the counts: outer keys (verticals) become columns, inner keys (years) become rows.
pd.DataFrame.from_dict(vertical_growth)

Unnamed: 0,Privacy and Security,Mobile,Financial Services,Software,Artificial Intelligence,Information Technology,Navigation and Mapping
2016,1242,4083,6257,18152,3110,10575,458
2017,1189,3261,6818,17701,3583,10027,379
2018,992,2478,5764,14585,2977,8268,265
2019,588,1392,3466,9446,1925,5242,147
2020,364,783,2168,5758,965,2889,39
