# Raising Series A Fund Prediction
InReach Venture's technical interview task: predict if a company will raise series A, with a stretch to predict how much.

Here I will take the necessary steps to make this happen. I will start by importing the necessary packages. Next, I will load the data and conduct exploratory data analysis (EDA); this is critical for deciding which features are fed into my machine learning model. I would then like to do a quick performance check using an ROC curve to see how well the chosen machine learning algorithm performs.

I will then do a quick summary of my findings.

In [1]:
#importing necessary packages
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import statsmodels.api as sm
from sklearn.model_selection import train_test_split,KFold,cross_validate
from sklearn.linear_model import LinearRegression

  import pandas.util.testing as tm


In [2]:
# Load the two CSV files used throughout the notebook: the funding-round
# records and the objects table. Both are comma-separated, use double-quoted
# fields, and use a backslash as the escape character.
funding_rounds = pd.read_csv(
    '../venturecapitalproject/data/funding_rounds.csv',
    sep=',',
    quotechar='"',
    escapechar='\\',
)
objects = pd.read_csv(
    '../venturecapitalproject/data/objects.csv',
    sep=',',
    quotechar='"',
    escapechar='\\',
)

In [3]:
funding_rounds.head()

Unnamed: 0,id,funding_round_id,object_id,funded_at,funding_round_type,funding_round_code,raised_amount_usd,raised_amount,raised_currency_code,pre_money_valuation_usd,...,post_money_valuation,post_money_currency_code,participants,is_first_round,is_last_round,source_url,source_description,created_by,created_at,updated_at
0,1,1,c:4,2006-12-01,series-b,b,8500000,8500000,USD,N,...,N,N,2,0,0,http://www.marketingvox.com/archives/2006/12/2...,N,initial-importer,2007-07-04 04:52:57,2008-02-27 23:14:29
1,2,2,c:5,2004-09-01,angel,angel,500000,500000,USD,N,...,N,USD,2,0,1,N,N,initial-importer,2007-05-27 06:08:18,2013-06-28 20:07:23
2,3,3,c:5,2005-05-01,series-a,a,12700000,12700000,USD,115000000,...,N,USD,3,0,0,http://www.techcrunch.com/2007/11/02/jim-breye...,Jim Breyer: Extra $500 Million Round For Faceb...,initial-importer,2007-05-27 06:09:10,2013-06-28 20:07:23
3,4,4,c:5,2006-04-01,series-b,b,27500000,27500000,USD,525000000,...,N,USD,4,0,0,http://www.facebook.com/press/info.php?factsheet,Facebook Funding,initial-importer,2007-05-27 06:09:36,2013-06-28 20:07:24
4,5,5,c:7299,2006-05-01,series-b,b,10500000,10500000,USD,N,...,N,N,2,0,0,http://www.techcrunch.com/2006/05/14/photobuck...,PhotoBucket Closes $10.5M From Trinity Ventures,initial-importer,2007-05-29 11:05:59,2008-04-16 17:09:12


In [4]:
funding_rounds.shape

(52928, 23)

In [5]:
funding_rounds.keys()

Index(['id', 'funding_round_id', 'object_id', 'funded_at',
       'funding_round_type', 'funding_round_code', 'raised_amount_usd',
       'raised_amount', 'raised_currency_code', 'pre_money_valuation_usd',
       'pre_money_valuation', 'pre_money_currency_code',
       'post_money_valuation_usd', 'post_money_valuation',
       'post_money_currency_code', 'participants', 'is_first_round',
       'is_last_round', 'source_url', 'source_description', 'created_by',
       'created_at', 'updated_at '],
      dtype='object')

In [6]:
funding_rounds.dtypes

id                           int64
funding_round_id             int64
object_id                   object
funded_at                   object
funding_round_type          object
funding_round_code          object
raised_amount_usd           object
raised_amount               object
raised_currency_code        object
pre_money_valuation_usd     object
pre_money_valuation         object
pre_money_currency_code     object
post_money_valuation_usd    object
post_money_valuation        object
post_money_currency_code    object
participants                 int64
is_first_round               int64
is_last_round                int64
source_url                  object
source_description          object
created_by                  object
created_at                  object
updated_at                  object
dtype: object

In [7]:
funding_rounds.describe()

Unnamed: 0,id,funding_round_id,participants,is_first_round,is_last_round
count,52928.0,52928.0,52928.0,52928.0,52928.0
mean,28962.894536,28962.894536,1.528567,0.604576,0.604538
std,16821.871803,16821.871803,2.060192,0.488946,0.488954
min,1.0,1.0,0.0,0.0,0.0
25%,14343.75,14343.75,0.0,0.0,0.0
50%,28885.5,28885.5,1.0,1.0,1.0
75%,43561.25,43561.25,2.0,1.0,1.0
max,57952.0,57952.0,36.0,1.0,1.0


In [8]:
objects.head()

Unnamed: 0,id,entity_type,entity_id,parent_id,name,normalized_name,permalink,category_code,status,founded_at,...,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones,relationships,created_by,created_at,updated_at
0,c:1,Company,1,N,Wetpaint,wetpaint,/company/wetpaint,web,operating,2005-10-17,...,2008-05-19,3,39750000,2010-09-05,2013-09-18,5,17,initial-importer,2007-05-25 06:51:27,2013-04-13 03:29:00
1,c:10,Company,10,N,Flektor,flektor,/company/flektor,games_video,acquired,N,...,N,N,N,N,N,N,6,initial-importer,2007-05-31 21:11:51,2008-05-23 23:23:14
2,c:100,Company,100,N,There,there,/company/there,games_video,acquired,N,...,N,N,N,2003-02-01,2011-09-23,4,12,initial-importer,2007-08-06 23:52:45,2013-11-04 02:09:48
3,c:10000,Company,10000,N,MYWEBBO,mywebbo,/company/mywebbo,network_hosting,operating,2008-07-26,...,N,N,N,N,N,N,N,N,2008-08-24 16:51:57,2008-09-06 14:19:18
4,c:10001,Company,10001,N,THE Movie Streamer,the movie streamer,/company/the-movie-streamer,games_video,operating,2008-07-26,...,N,N,N,N,N,N,N,N,2008-08-24 17:10:34,2008-09-06 14:19:18


In [9]:
objects.shape

(462651, 40)

In [10]:
objects.keys()

Index(['id', 'entity_type', 'entity_id', 'parent_id', 'name',
       'normalized_name', 'permalink', 'category_code', 'status', 'founded_at',
       'closed_at', 'domain', 'homepage_url', 'twitter_username', 'logo_url',
       'logo_width', 'logo_height', 'short_description', 'description',
       'overview', 'tag_list', 'country_code', 'state_code', 'city', 'region',
       'first_investment_at', 'last_investment_at', 'investment_rounds',
       'invested_companies', 'first_funding_at', 'last_funding_at',
       'funding_rounds', 'funding_total_usd', 'first_milestone_at',
       'last_milestone_at', 'milestones', 'relationships', 'created_by',
       'created_at', 'updated_at'],
      dtype='object')

In [11]:
objects.dtypes

id                     object
entity_type            object
entity_id               int64
parent_id              object
name                   object
normalized_name        object
permalink              object
category_code          object
status                 object
founded_at             object
closed_at              object
domain                 object
homepage_url           object
twitter_username       object
logo_url               object
logo_width             object
logo_height            object
short_description      object
description            object
overview               object
tag_list               object
country_code           object
state_code             object
city                   object
region                 object
first_investment_at    object
last_investment_at     object
investment_rounds      object
invested_companies     object
first_funding_at       object
last_funding_at        object
funding_rounds         object
funding_total_usd      object
first_mile

## One common occurrence within the datasets is the `N` listed in various columns. I am going to opt out of removing any columns with this entry. On closer inspection, the `N` appears in fields where companies in the Crunchbase dataset chose not to disclose certain information.

# Cleaning & Preprocessing

In [12]:
funding_rounds.head()

Unnamed: 0,id,funding_round_id,object_id,funded_at,funding_round_type,funding_round_code,raised_amount_usd,raised_amount,raised_currency_code,pre_money_valuation_usd,...,post_money_valuation,post_money_currency_code,participants,is_first_round,is_last_round,source_url,source_description,created_by,created_at,updated_at
0,1,1,c:4,2006-12-01,series-b,b,8500000,8500000,USD,N,...,N,N,2,0,0,http://www.marketingvox.com/archives/2006/12/2...,N,initial-importer,2007-07-04 04:52:57,2008-02-27 23:14:29
1,2,2,c:5,2004-09-01,angel,angel,500000,500000,USD,N,...,N,USD,2,0,1,N,N,initial-importer,2007-05-27 06:08:18,2013-06-28 20:07:23
2,3,3,c:5,2005-05-01,series-a,a,12700000,12700000,USD,115000000,...,N,USD,3,0,0,http://www.techcrunch.com/2007/11/02/jim-breye...,Jim Breyer: Extra $500 Million Round For Faceb...,initial-importer,2007-05-27 06:09:10,2013-06-28 20:07:23
3,4,4,c:5,2006-04-01,series-b,b,27500000,27500000,USD,525000000,...,N,USD,4,0,0,http://www.facebook.com/press/info.php?factsheet,Facebook Funding,initial-importer,2007-05-27 06:09:36,2013-06-28 20:07:24
4,5,5,c:7299,2006-05-01,series-b,b,10500000,10500000,USD,N,...,N,N,2,0,0,http://www.techcrunch.com/2006/05/14/photobuck...,PhotoBucket Closes $10.5M From Trinity Ventures,initial-importer,2007-05-29 11:05:59,2008-04-16 17:09:12


In [13]:
funding_rounds.dtypes

id                           int64
funding_round_id             int64
object_id                   object
funded_at                   object
funding_round_type          object
funding_round_code          object
raised_amount_usd           object
raised_amount               object
raised_currency_code        object
pre_money_valuation_usd     object
pre_money_valuation         object
pre_money_currency_code     object
post_money_valuation_usd    object
post_money_valuation        object
post_money_currency_code    object
participants                 int64
is_first_round               int64
is_last_round                int64
source_url                  object
source_description          object
created_by                  object
created_at                  object
updated_at                  object
dtype: object

My first step is to remove the currency-related columns that are not in USD. Most companies, even outside of the U.S., choose to raise funds in dollars. Another common occurrence that I see within the dataset is that most of the companies listed here are based in the U.S. This is important to InReach, given that InReach aims to change the market and make venture capital more global and standardized across the UK & EU.

In [14]:
# Keep only the `_usd` amount/valuation columns: drop their non-`_usd`
# counterparts and the matching currency-code columns, plus the source and
# update-timestamp metadata.
# NOTE: 'updated_at ' is written with a trailing space on purpose -- that is
# how the column is named in the loaded frame (see funding_rounds.keys()).
funding_rounds = funding_rounds.drop(
    columns=[
        'raised_amount',
        'raised_currency_code',
        'pre_money_valuation',
        'pre_money_currency_code',
        'post_money_valuation',
        'post_money_currency_code',
        'source_url',
        'source_description',
        'updated_at ',
    ]
)

In [15]:
funding_rounds.head()

Unnamed: 0,id,funding_round_id,object_id,funded_at,funding_round_type,funding_round_code,raised_amount_usd,pre_money_valuation_usd,post_money_valuation_usd,participants,is_first_round,is_last_round,created_by,created_at
0,1,1,c:4,2006-12-01,series-b,b,8500000,N,N,2,0,0,initial-importer,2007-07-04 04:52:57
1,2,2,c:5,2004-09-01,angel,angel,500000,N,N,2,0,1,initial-importer,2007-05-27 06:08:18
2,3,3,c:5,2005-05-01,series-a,a,12700000,115000000,N,3,0,0,initial-importer,2007-05-27 06:09:10
3,4,4,c:5,2006-04-01,series-b,b,27500000,525000000,N,4,0,0,initial-importer,2007-05-27 06:09:36
4,5,5,c:7299,2006-05-01,series-b,b,10500000,N,N,2,0,0,initial-importer,2007-05-29 11:05:59


In [16]:
print(funding_rounds.shape)

(52928, 14)


In [20]:
# Count the distinct object_id values in funding_rounds. Comparing this number
# with the total row count shows how many rows are additional rounds for an
# object_id that already appears in this crunchbase dataset.
funding_rounds['object_id'].nunique()

31939

This shows that there are 31,939 distinct `object_id` values. My assumption here is that the remaining data entries come from the same companies across different funding rounds. Now that I have seen the number of unique `object_id` values, I would next like to query which `object_id`s have made it to series A and obtain the sum.

In [31]:
# Restrict to the rows whose round type is series-a, then count, for each
# object_id, the non-null entries in every remaining column.
obtained_series_a = (
    funding_rounds
    .loc[funding_rounds['funding_round_type'] == 'series-a']
    .groupby('object_id')
    .count()
)
obtained_series_a


Unnamed: 0_level_0,id,funding_round_id,funded_at,funding_round_type,funding_round_code,raised_amount_usd,pre_money_valuation_usd,post_money_valuation_usd,participants,is_first_round,is_last_round,created_by,created_at
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
c:1,1,1,1,1,1,1,1,1,1,1,1,1,1
c:1001,1,1,1,1,1,1,1,1,1,1,1,1,1
c:10015,1,1,1,1,1,1,1,1,1,1,1,1,1
c:100271,1,1,1,1,1,1,1,1,1,1,1,1,1
c:1003,1,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
c:993,1,1,1,1,1,1,1,1,1,1,1,1,1
c:994,1,1,1,1,1,1,1,1,1,1,1,1,1
c:99685,1,1,1,1,1,1,1,1,1,1,1,1,1
c:9972,1,1,1,1,1,1,1,1,1,1,1,1,1


In [25]:
# Column-wise total of the per-object_id counts, i.e. the number of non-null
# series-a entries in each column. This is a count of series-a rows, not of
# distinct object_ids (that would be len(obtained_series_a)).
obtained_series_a.sum()

id                          9873
funding_round_id            9873
funded_at                   9873
funding_round_type          9873
funding_round_code          9873
raised_amount_usd           9873
pre_money_valuation_usd     9873
post_money_valuation_usd    9873
participants                9873
is_first_round              9873
is_last_round               9873
created_by                  9873
created_at                  9873
dtype: int64

In [37]:
# Row-level flag: 1 when this funding round is one of the pre-series-A types
# (angel, convertible or seed), 0 otherwise.
funding_rounds['raised_before_a'] = np.where(
    funding_rounds['funding_round_type'].isin(['angel', 'convertible', 'seed']),
    1,
    0,
)

In [40]:
# Row-level flag: 1 when this funding round is itself a series-a round, else 0.
funding_rounds['raised_series_a'] = np.where(
    funding_rounds['funding_round_type'].eq('series-a'),
    1,
    0,
)

In [38]:
funding_rounds.head()

Unnamed: 0,id,funding_round_id,object_id,funded_at,funding_round_type,funding_round_code,raised_amount_usd,pre_money_valuation_usd,post_money_valuation_usd,participants,is_first_round,is_last_round,created_by,created_at,raised_a,raised_before_a
0,1,1,c:4,2006-12-01,series-b,b,8500000,N,N,2,0,0,initial-importer,2007-07-04 04:52:57,0,0
1,2,2,c:5,2004-09-01,angel,angel,500000,N,N,2,0,1,initial-importer,2007-05-27 06:08:18,1,1
2,3,3,c:5,2005-05-01,series-a,a,12700000,115000000,N,3,0,0,initial-importer,2007-05-27 06:09:10,0,0
3,4,4,c:5,2006-04-01,series-b,b,27500000,525000000,N,4,0,0,initial-importer,2007-05-27 06:09:36,0,0
4,5,5,c:7299,2006-05-01,series-b,b,10500000,N,N,2,0,0,initial-importer,2007-05-29 11:05:59,0,0
