In [2]:
import pandas as pd
import numpy as np

import base64
from io import BytesIO

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.decomposition import PCA

import datetime
from datetime import datetime
from datetime import timedelta
from dateutil import rrule
import time

from matplotlib import font_manager as fm


from pandas.tseries.offsets import DateOffset


sns.set_style("whitegrid")
sns.set_theme(style="whitegrid")
sns.color_palette("gist_rainbow", as_cmap=True)
plt.rcParams["font.family"] = "Verdana"

custom_params = {"axes.spines.right": False, "axes.spines.top": False}

### Disclaimer & Sources

This data was extracted from CrunchBase on December 02, 2014.  
Read more at:
- http://info.crunchbase.com/about/crunchbase-data-exports/

By using this data, you agree to follow the CrunchBase Terms of Service and Licensing Policy:
- http://info.crunchbase.com/docs/terms-of-service/
- http://info.crunchbase.com/docs/privacy-policy/

In [12]:
companies = pd.read_csv('companies.csv')
investments = pd.read_csv('investments.csv')
acquisitions = pd.read_csv('acquisitions.csv')


  acquisitions = pd.read_csv('acquisitions.csv')


### "Companies" Dataset

In [18]:
companies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54294 entries, 0 to 54293
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   permalink            49438 non-null  object 
 1   name                 49437 non-null  object 
 2   homepage_url         45989 non-null  object 
 3   category_list        45477 non-null  object 
 4    market              45470 non-null  object 
 5    funding_total_usd   49438 non-null  object 
 6   status               48124 non-null  object 
 7   country_code         44165 non-null  object 
 8   state_code           30161 non-null  object 
 9   region               44165 non-null  object 
 10  city                 43322 non-null  object 
 11  funding_rounds       49438 non-null  float64
 12  founded_at           38554 non-null  object 
 13  founded_month        38482 non-null  object 
 14  founded_quarter      38482 non-null  object 
 15  founded_year         38482 non-null 

In [19]:
companies.head(5)

Unnamed: 0,permalink,name,homepage_url,category_list,market,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,founded_month,founded_quarter,founded_year,first_funding_at,last_funding_at
0,/organization/waywire,#waywire,http://www.waywire.com,|Entertainment|Politics|Social Media|News|,News,1750000,acquired,USA,NY,New York City,New York,1.0,2012-06-01,2012-06,2012-Q2,2012.0,2012-06-30,2012-06-30
1,/organization/tv-communications,&TV Communications,http://enjoyandtv.com,|Games|,Games,4000000,operating,USA,CA,Los Angeles,Los Angeles,2.0,,,,,2010-06-04,2010-09-23
2,/organization/rock-your-paper,'Rock' Your Paper,http://www.rockyourpaper.org,|Publishing|Education|,Publishing,40000,operating,EST,,Tallinn,Tallinn,1.0,2012-10-26,2012-10,2012-Q4,2012.0,2012-08-09,2012-08-09
3,/organization/in-touch-network,(In)Touch Network,http://www.InTouchNetwork.com,|Electronics|Guides|Coffee|Restaurants|Music|i...,Electronics,1500000,operating,GBR,,London,London,1.0,2011-04-01,2011-04,2011-Q2,2011.0,2011-04-01,2011-04-01
4,/organization/r-ranch-and-mine,-R- Ranch and Mine,,|Tourism|Entertainment|Games|,Tourism,60000,operating,USA,TX,Dallas,Fort Worth,2.0,2014-01-01,2014-01,2014-Q1,2014.0,2014-08-17,2014-09-26


### "Investments" Dataset

In [13]:
investments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114506 entries, 0 to 114505
Data columns (total 24 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   company_permalink        114506 non-null  object
 1   company_name             114505 non-null  object
 2   company_category_list    111242 non-null  object
 3   company_market           111240 non-null  object
 4   company_country_code     107147 non-null  object
 5   company_state_code       79158 non-null   object
 6   company_region           107147 non-null  object
 7   company_city             105801 non-null  object
 8   investor_permalink       114440 non-null  object
 9   investor_name            114440 non-null  object
 10  investor_category_list   30507 non-null   object
 11  investor_market          30455 non-null   object
 12  investor_country_code    86521 non-null   object
 13  investor_state_code      62274 non-null   object
 14  investor_region     

In [22]:
investments.head(5)

Unnamed: 0,company_permalink,company_name,company_category_list,company_market,company_country_code,company_state_code,company_region,company_city,investor_permalink,investor_name,...,investor_region,investor_city,funding_round_permalink,funding_round_type,funding_round_code,funded_at,funded_month,funded_quarter,funded_year,raised_amount_usd
0,/organization/test-company-3,test company,,,ARE,,Dubai,Dubai,/person/jamessss-bondddd,jamessss bondddd,...,,,/funding-round/c308019016ead7afb2a1d117018eb6fc,seed,,1921-09-01,1921-09,1921-Q3,1921,1000.0
1,/organization/andrewburnett-com-ltd,AndrewBurnett.com Ltd,|Internet|SEO|Services|Public Relations|Social...,Internet,GBR,,Edinburgh,Edinburgh,/organization/ekaf,Ekaf,...,,,/funding-round/14fe2864e02d0f15ddc3ec8eacdc8e1b,seed,,1974-01-01,1974-01,1974-Q1,1974,
2,/organization/abo-data,ABO Data,|Enterprise Software|,Enterprise Software,USA,TX,TX - Other,Italy,/person/antonio-murroni,ANTONIO MURRONI,...,,,/funding-round/809e211b969c3f66440fc15ffcd29385,seed,,1979-01-01,1979-01,1979-Q1,1979,1000000.0
3,/organization/abo-data,ABO Data,|Enterprise Software|,Enterprise Software,USA,TX,TX - Other,Italy,/person/filippo-murroni,FILIPPO Murroni,...,,,/funding-round/809e211b969c3f66440fc15ffcd29385,seed,,1979-01-01,1979-01,1979-Q1,1979,1000000.0
4,/organization/ikro,Ikro,,,BRA,,BRA - Other,Canoas,/organization/crp-companhia-de-participacoes,CRP Companhia de Participações,...,,,/funding-round/46c353a8249170cc4b6ab89a522fefdc,venture,A,1982-06-01,1982-06,1982-Q2,1982,724000.0


### "Acquisitions" Dataset

In [20]:
acquisitions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55240 entries, 0 to 55239
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   company_permalink       13070 non-null  object 
 1   company_name            13070 non-null  object 
 2   company_category_list   9787 non-null   object 
 3   company_market          9785 non-null   object 
 4   company_country_code    9783 non-null   object 
 5   company_state_code      7519 non-null   object 
 6   company_region          9784 non-null   object 
 7   company_city            9585 non-null   object 
 8   acquirer_permalink      13070 non-null  object 
 9   acquirer_name           13070 non-null  object 
 10  acquirer_category_list  11565 non-null  object 
 11  acquirer_market         11559 non-null  object 
 12  acquirer_country_code   12229 non-null  object 
 13  acquirer_state_code     9782 non-null   object 
 14  acquirer_region         12229 non-null

In [23]:
acquisitions.head(5)

Unnamed: 0,company_permalink,company_name,company_category_list,company_market,company_country_code,company_state_code,company_region,company_city,acquirer_permalink,acquirer_name,...,acquirer_country_code,acquirer_state_code,acquirer_region,acquirer_city,acquired_at,acquired_month,acquired_quarter,acquired_year,price_amount,price_currency_code
0,/organization/waywire,#waywire,|Entertainment|Politics|Social Media|News|,News,USA,NY,New York City,New York,/organization/magnify,Waywire Networks,...,USA,NY,New York City,New York,2013-10-17,2013-10,2013-Q4,2013.0,,USD
1,/organization/fluff-friends,(fluff)Friends,,,,,,,/organization/social-gaming-network,SGN (Social Gaming Network),...,USA,CA,Los Angeles,Beverly Hills,2008-09-16,2008-09,2008-Q3,2008.0,,USD
2,/organization/red,(RED),|Nonprofits|,Nonprofits,USA,NY,New York City,New York,/organization/nationstar-mortgage-holdings,Nationstar Mortgage Holdings,...,USA,TX,Dallas,Lewisville,2014-05-08,2014-05,2014-Q2,2014.0,18000000.0,USD
3,/organization/vandaele-holdings,.,,,,,,,/organization/hi7e,HI7E,...,USA,FL,Palm Beaches,West Palm Beach,2011-01-01,2011-01,2011-Q1,2011.0,,USD
4,/organization/co-internet,.CO,|Registrars|Domains|Curated Web|,Registrars,USA,FL,Miami,Miami,/organization/neustar,Neustar,...,USA,VA,"Washington, D.C.",Sterling,2014-03-20,2014-03,2014-Q1,2014.0,109000000.0,USD
