In [1]:
# Supress Warnings

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import the numpy and pandas packages

import numpy as np
import pandas as pd

In [3]:
# Displaying float upto 2 decimal points

pd.set_option('display.float_format', '{:.2f}'.format)

## Task 1: Loading and cleaning Data

-  ### Subtask 1.1: Import and read companies data.

Import and read the companies data. Store it in a dataframe called `companies`.

In [4]:
companies = pd.read_csv('companies.txt', sep='\t', encoding='ISO-8859-1')

#Inspecting companies dataframe

print(companies.shape)

print(companies.info())

companies.describe()

(66368, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66368 entries, 0 to 66367
Data columns (total 10 columns):
permalink        66368 non-null object
name             66367 non-null object
homepage_url     61310 non-null object
category_list    63220 non-null object
status           66368 non-null object
country_code     59410 non-null object
state_code       57821 non-null object
region           58338 non-null object
city             58340 non-null object
founded_at       51147 non-null object
dtypes: object(10)
memory usage: 5.1+ MB
None


Unnamed: 0,permalink,name,homepage_url,category_list,status,country_code,state_code,region,city,founded_at
count,66368,66367,61310,63220,66368,59410,57821,58338,58340,51147
unique,66368,66102,61191,27296,4,137,311,1092,5111,3978
top,/Organization/Comic-Rocket,Blink,http://www.askforoffer.com,Software,operating,USA,CA,SF Bay Area,San Francisco,01-01-2012
freq,1,4,5,3995,53034,37601,12900,8804,3526,2730


In [5]:
# On inspecting the data from the above info and describe we can see the permalink column
# is having same number of rows as the number of elements in the dataframe. Hence it can 
# be used as key.This finding can be used later for drawing further inferences. 

companies.head(10) 


Unnamed: 0,permalink,name,homepage_url,category_list,status,country_code,state_code,region,city,founded_at
0,/Organization/-Fame,#fame,http://livfame.com,Media,operating,IND,16,Mumbai,Mumbai,
1,/Organization/-Qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,operating,USA,DE,DE - Other,Delaware City,04-09-2014
2,/Organization/-The-One-Of-Them-Inc-,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,operating,,,,,
3,/Organization/0-6-Com,0-6.com,http://www.0-6.com,Curated Web,operating,CHN,22,Beijing,Beijing,01-01-2007
4,/Organization/004-Technologies,004 Technologies,http://004gmbh.de/en/004-interact,Software,operating,USA,IL,"Springfield, Illinois",Champaign,01-01-2010
5,/Organization/01Games-Technology,01Games Technology,http://www.01games.hk/,Games,operating,HKG,,Hong Kong,Hong Kong,
6,/Organization/0Ndine-Biomedical-Inc,Ondine Biomedical Inc.,http://ondinebio.com,Biotechnology,operating,CAN,BC,Vancouver,Vancouver,01-01-1997
7,/Organization/0Xdata,H2O.ai,http://h2o.ai/,Analytics,operating,USA,CA,SF Bay Area,Mountain View,01-01-2011
8,/Organization/1,One Inc.,http://whatis1.com,Mobile,operating,USA,CA,SF Bay Area,San Francisco,01-08-2011
9,/Organization/1-2-3-Listo,"1,2,3 Listo",http://www.123listo.com,E-Commerce,operating,CHL,12,Santiago,Las Condes,01-01-2012


In [6]:
# The above inspection shows that there are still some special charecters included. Hence,
# for non-erroneous results , the encoding problem needs to be handled. 

companies['permalink'] = companies['permalink'].str.encode('utf-8').str.decode('ascii', 'ignore')

companies.head(10)

Unnamed: 0,permalink,name,homepage_url,category_list,status,country_code,state_code,region,city,founded_at
0,/Organization/-Fame,#fame,http://livfame.com,Media,operating,IND,16,Mumbai,Mumbai,
1,/Organization/-Qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,operating,USA,DE,DE - Other,Delaware City,04-09-2014
2,/Organization/-The-One-Of-Them-Inc-,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,operating,,,,,
3,/Organization/0-6-Com,0-6.com,http://www.0-6.com,Curated Web,operating,CHN,22,Beijing,Beijing,01-01-2007
4,/Organization/004-Technologies,004 Technologies,http://004gmbh.de/en/004-interact,Software,operating,USA,IL,"Springfield, Illinois",Champaign,01-01-2010
5,/Organization/01Games-Technology,01Games Technology,http://www.01games.hk/,Games,operating,HKG,,Hong Kong,Hong Kong,
6,/Organization/0Ndine-Biomedical-Inc,Ondine Biomedical Inc.,http://ondinebio.com,Biotechnology,operating,CAN,BC,Vancouver,Vancouver,01-01-1997
7,/Organization/0Xdata,H2O.ai,http://h2o.ai/,Analytics,operating,USA,CA,SF Bay Area,Mountain View,01-01-2011
8,/Organization/1,One Inc.,http://whatis1.com,Mobile,operating,USA,CA,SF Bay Area,San Francisco,01-08-2011
9,/Organization/1-2-3-Listo,"1,2,3 Listo",http://www.123listo.com,E-Commerce,operating,CHL,12,Santiago,Las Condes,01-01-2012




-  ### Subtask 1.2: Import and read rounds2 data.

Import and read the round_2 data. Store it in a dataframe called `rounds2`.

In [7]:
rounds2 = pd.read_csv('rounds2.csv', sep=',', encoding='ISO-8859-1')

# Inspecting rounds2 dataframe

print(rounds2.shape)

print(rounds2.info())

rounds2.describe()


(114949, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114949 entries, 0 to 114948
Data columns (total 6 columns):
company_permalink          114949 non-null object
funding_round_permalink    114949 non-null object
funding_round_type         114949 non-null object
funding_round_code         31140 non-null object
funded_at                  114949 non-null object
raised_amount_usd          94959 non-null float64
dtypes: float64(1), object(5)
memory usage: 5.3+ MB
None


Unnamed: 0,raised_amount_usd
count,94959.0
mean,10426869.33
std,114821247.98
min,0.0
25%,322500.0
50%,1680511.0
75%,7000000.0
max,21271935000.0


In [8]:
rounds2.head(10)

Unnamed: 0,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd
0,/organization/-fame,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,B,05-01-2015,10000000.0
1,/ORGANIZATION/-QOUNTER,/funding-round/22dacff496eb7acb2b901dec1dfe5633,venture,A,14-10-2014,
2,/organization/-qounter,/funding-round/b44fbb94153f6cdef13083530bb48030,seed,,01-03-2014,700000.0
3,/ORGANIZATION/-THE-ONE-OF-THEM-INC-,/funding-round/650b8f704416801069bb178a1418776b,venture,B,30-01-2014,3406878.0
4,/organization/0-6-com,/funding-round/5727accaeaa57461bd22a9bdd945382d,venture,A,19-03-2008,2000000.0
5,/ORGANIZATION/004-TECHNOLOGIES,/funding-round/1278dd4e6a37fa4b7d7e06c21b3c1830,venture,,24-07-2014,
6,/organization/01games-technology,/funding-round/7d53696f2b4f607a2f2a8cbb83d01839,undisclosed,,01-07-2014,41250.0
7,/ORGANIZATION/0NDINE-BIOMEDICAL-INC,/funding-round/2b9d3ac293d5cdccbecff5c8cb0f327d,seed,,11-09-2009,43360.0
8,/organization/0ndine-biomedical-inc,/funding-round/954b9499724b946ad8c396a57a5f3b72,venture,,21-12-2009,719491.0
9,/ORGANIZATION/0XDATA,/funding-round/383a9bd2c04f7038bb543ccef5ba3eae,seed,,22-05-2013,3000000.0


In [9]:
# The above inspection shows that there are still some special charecters included. 
# Hence, for non-erroneous results, the encoding problem needs to be handled. 

rounds2['company_permalink'] = rounds2['company_permalink'].str.encode('utf-8').str.decode('ascii', 'ignore')

rounds2.head(10)

Unnamed: 0,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd
0,/organization/-fame,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,B,05-01-2015,10000000.0
1,/ORGANIZATION/-QOUNTER,/funding-round/22dacff496eb7acb2b901dec1dfe5633,venture,A,14-10-2014,
2,/organization/-qounter,/funding-round/b44fbb94153f6cdef13083530bb48030,seed,,01-03-2014,700000.0
3,/ORGANIZATION/-THE-ONE-OF-THEM-INC-,/funding-round/650b8f704416801069bb178a1418776b,venture,B,30-01-2014,3406878.0
4,/organization/0-6-com,/funding-round/5727accaeaa57461bd22a9bdd945382d,venture,A,19-03-2008,2000000.0
5,/ORGANIZATION/004-TECHNOLOGIES,/funding-round/1278dd4e6a37fa4b7d7e06c21b3c1830,venture,,24-07-2014,
6,/organization/01games-technology,/funding-round/7d53696f2b4f607a2f2a8cbb83d01839,undisclosed,,01-07-2014,41250.0
7,/ORGANIZATION/0NDINE-BIOMEDICAL-INC,/funding-round/2b9d3ac293d5cdccbecff5c8cb0f327d,seed,,11-09-2009,43360.0
8,/organization/0ndine-biomedical-inc,/funding-round/954b9499724b946ad8c396a57a5f3b72,venture,,21-12-2009,719491.0
9,/ORGANIZATION/0XDATA,/funding-round/383a9bd2c04f7038bb543ccef5ba3eae,seed,,22-05-2013,3000000.0


In [10]:
# From the above 2 dataframes we can see that permalink and company permalink can be 
# used for joining. However, first it needs to be formatted properly.

rounds2['company_permalink'] = rounds2['company_permalink'].str.lower()

companies['permalink'] = companies['permalink'].str.lower()

-  ### Subtask 1.3: Print unique records present in both the dataframes and identify the key column for companies.

Print count of unique records in `companies` and `rounds2`.

In [11]:
# Finding the key column. Both row count and unique values should be same.

print(companies.shape[0])

print(companies['permalink'].nunique())

print(companies['permalink'].is_unique) #if true than can be used as unique key column. 
# From here we conculde this as unique key column.

print(rounds2['company_permalink'].nunique())

66368
66368
True
66368


-  ### Subtask 1.4: Find if there are companies in the rounds2 file which are not present in companies.

Get the difference of `rounds2` and `companies`.

In [12]:
# From above we can observe that both dataframes have same count of records. 
# But these might be different. Hence we need to check if all companies present 
# in 'rounds2' are in 'companies'or not.


# To compare the values of permalink column of 'companies' with the 
# company_permalink column of 'rounds2'
unique_comp_companies = set(companies['permalink']) 
unique_comp_rounds2 = set(rounds2['company_permalink']) 


# printing the total number of elements after dropping the duplicates.
print(len(unique_comp_companies))
print(len(unique_comp_rounds2))


print(len(unique_comp_rounds2.difference(unique_comp_companies))) #if this is 0 then it means 
# that there is no company in the rounds2 file which is not present in companies

66368
66368
0


-  ### Subtask 1.5: Create master_frame by joining companies and rounds2.

Joining `companies` and `rounds2` dataframe to create `master_frame`.

In [13]:
master_frame = pd.merge(rounds2, companies, left_on=['company_permalink'], right_on =['permalink'] , how='inner')

# Inspecting master_frame

print(master_frame.shape)

print(companies.info())

master_frame.head(10)

(114949, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66368 entries, 0 to 66367
Data columns (total 10 columns):
permalink        66368 non-null object
name             66367 non-null object
homepage_url     61310 non-null object
category_list    63220 non-null object
status           66368 non-null object
country_code     59410 non-null object
state_code       57821 non-null object
region           58338 non-null object
city             58340 non-null object
founded_at       51147 non-null object
dtypes: object(10)
memory usage: 5.1+ MB
None


Unnamed: 0,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd,permalink,name,homepage_url,category_list,status,country_code,state_code,region,city,founded_at
0,/organization/-fame,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,B,05-01-2015,10000000.0,/organization/-fame,#fame,http://livfame.com,Media,operating,IND,16,Mumbai,Mumbai,
1,/organization/-qounter,/funding-round/22dacff496eb7acb2b901dec1dfe5633,venture,A,14-10-2014,,/organization/-qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,operating,USA,DE,DE - Other,Delaware City,04-09-2014
2,/organization/-qounter,/funding-round/b44fbb94153f6cdef13083530bb48030,seed,,01-03-2014,700000.0,/organization/-qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,operating,USA,DE,DE - Other,Delaware City,04-09-2014
3,/organization/-the-one-of-them-inc-,/funding-round/650b8f704416801069bb178a1418776b,venture,B,30-01-2014,3406878.0,/organization/-the-one-of-them-inc-,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,operating,,,,,
4,/organization/0-6-com,/funding-round/5727accaeaa57461bd22a9bdd945382d,venture,A,19-03-2008,2000000.0,/organization/0-6-com,0-6.com,http://www.0-6.com,Curated Web,operating,CHN,22,Beijing,Beijing,01-01-2007
5,/organization/004-technologies,/funding-round/1278dd4e6a37fa4b7d7e06c21b3c1830,venture,,24-07-2014,,/organization/004-technologies,004 Technologies,http://004gmbh.de/en/004-interact,Software,operating,USA,IL,"Springfield, Illinois",Champaign,01-01-2010
6,/organization/01games-technology,/funding-round/7d53696f2b4f607a2f2a8cbb83d01839,undisclosed,,01-07-2014,41250.0,/organization/01games-technology,01Games Technology,http://www.01games.hk/,Games,operating,HKG,,Hong Kong,Hong Kong,
7,/organization/0ndine-biomedical-inc,/funding-round/2b9d3ac293d5cdccbecff5c8cb0f327d,seed,,11-09-2009,43360.0,/organization/0ndine-biomedical-inc,Ondine Biomedical Inc.,http://ondinebio.com,Biotechnology,operating,CAN,BC,Vancouver,Vancouver,01-01-1997
8,/organization/0ndine-biomedical-inc,/funding-round/954b9499724b946ad8c396a57a5f3b72,venture,,21-12-2009,719491.0,/organization/0ndine-biomedical-inc,Ondine Biomedical Inc.,http://ondinebio.com,Biotechnology,operating,CAN,BC,Vancouver,Vancouver,01-01-1997
9,/organization/0xdata,/funding-round/383a9bd2c04f7038bb543ccef5ba3eae,seed,,22-05-2013,3000000.0,/organization/0xdata,H2O.ai,http://h2o.ai/,Analytics,operating,USA,CA,SF Bay Area,Mountain View,01-01-2011


In [14]:
# To find the total number of null values for each column in master_frame.

print(master_frame.isnull().sum())


company_permalink              0
funding_round_permalink        0
funding_round_type             0
funding_round_code         83809
funded_at                      0
raised_amount_usd          19990
permalink                      0
name                           1
homepage_url                6134
category_list               3410
status                         0
country_code                8678
state_code                 10946
region                     10167
city                       10164
founded_at                 20521
dtype: int64


In [15]:
# Getting the column-wise null percentage in master_frame

print(round(100*(master_frame.isnull().sum()/len(master_frame.index)),2)) 


# The raised_amount_usd column has 17.4% of the records missing. 
# The country_code column has 7.55% of the records missing.
# The category_list column has 2.97% of the records missing.


company_permalink          0.00
funding_round_permalink    0.00
funding_round_type         0.00
funding_round_code        72.91
funded_at                  0.00
raised_amount_usd         17.39
permalink                  0.00
name                       0.00
homepage_url               5.34
category_list              2.97
status                     0.00
country_code               7.55
state_code                 9.52
region                     8.84
city                       8.84
founded_at                17.85
dtype: float64


## Task 2: Finding the Average values of investment for funding types

-  ### Subtask 2.1: Cleaning master_frame.

Cleaning master_frame by removing rows with missing vales in raised_amount_usd column as raised_amount_usd column is having 17.39% missing values

In [16]:
# We have observed earlier that raised_amount_usd column has 17.4% of the records missing. 
# Hence, removing rows with missing values in raised_amount_usd column.

master_frame = master_frame[~np.isnan(master_frame['raised_amount_usd'])]   


# Getting the column-wise null percentage
round(100*(master_frame.isnull().sum()/len(master_frame.index)),2) 

company_permalink          0.00
funding_round_permalink    0.00
funding_round_type         0.00
funding_round_code        70.34
funded_at                  0.00
raised_amount_usd          0.00
permalink                  0.00
name                       0.00
homepage_url               4.56
category_list              1.10
status                     0.00
country_code               6.16
state_code                 8.01
region                     7.42
city                       7.42
founded_at                16.81
dtype: float64

-  ### Subtask 2.2: Find the average amount raised based on funding round type.

Using master_frame to groupby on funding_round_type and calculating average raised fund amount.

In [17]:
# Finding average investment for each funding round type
avg_invest_by_funding_round = pd.DataFrame(master_frame.groupby('funding_round_type').raised_amount_usd.mean())

# Finding median investment for each funding round type
invest_by_funding_round_median = pd.DataFrame(master_frame.groupby('funding_round_type').raised_amount_usd.median())

print(avg_invest_by_funding_round)

print(invest_by_funding_round_median)


                       raised_amount_usd
funding_round_type                      
angel                          958694.47
convertible_note              1453438.54
debt_financing               17043526.02
equity_crowdfunding            538368.21
grant                         4300576.34
non_equity_assistance          411203.05
post_ipo_debt               168704571.82
post_ipo_equity              82182493.87
private_equity               73308593.03
product_crowdfunding          1363131.07
secondary_market             79649630.10
seed                           719818.00
undisclosed                  19242370.23
venture                      11748949.13
                       raised_amount_usd
funding_round_type                      
angel                          400000.00
convertible_note               272000.00
debt_financing                1100000.00
equity_crowdfunding            100000.00
grant                          201684.00
non_equity_assistance           60000.00
post_ipo_debt   

-  ### Subtask 2.3: Find the funding round type where average investment per round is between 5 to 15 million USD per investment round.

Using avg_invest_by_funding_round to find funding round type where average investment per round is between 5 to 15 million USD per investment round.

In [18]:
avg_invest_by_funding_round.loc[(avg_invest_by_funding_round.raised_amount_usd>=5000000) & (avg_invest_by_funding_round.raised_amount_usd<=15000000)]

# Range specified is 5 to 15 million USD. So from above we can take that only venture funding round 
# is suitable for Spark funds.
# Also, we can see the median value for venture also lies between 5 to 15 million USD.

Unnamed: 0_level_0,raised_amount_usd
funding_round_type,Unnamed: 1_level_1
venture,11748949.13


## Task 3: Finding the Countries where most investment are occuring for venture funding type

-  ### Subtask 3.1: Cleaning and Filtering master_frame for venture funding type



In [19]:
# We have observed earlier that country_code column has 7.55% missing values.
# Hence, Removing rows with missing values in country_code column.
master_frame = master_frame[~master_frame['country_code'].isnull()]   


# Getting the column-wise null percentage
print(round(100*(master_frame.isnull().sum()/len(master_frame.index)),2))

company_permalink          0.00
funding_round_permalink    0.00
funding_round_type         0.00
funding_round_code        69.77
funded_at                  0.00
raised_amount_usd          0.00
permalink                  0.00
name                       0.00
homepage_url               3.79
category_list              0.65
status                     0.00
country_code               0.00
state_code                 1.97
region                     1.34
city                       1.34
founded_at                15.33
dtype: float64


In [20]:
# Filtering master_frame to get those investments only where funding round type is venture
venture_funding = master_frame.loc[(master_frame['funding_round_type']=='venture')]


# Grouping the venture_funding dataframe using the country_code column and 
# getting total raised_amount_usd per country
venture_funding_by_country = venture_funding.groupby('country_code').raised_amount_usd.sum()


# Taking top nine countries which have received the highest total funding 
# (across ALL sectors for the venture investment type)
top9 = pd.DataFrame(venture_funding_by_country).sort_values('raised_amount_usd', ascending=False).head(9)
top9

Unnamed: 0_level_0,raised_amount_usd
country_code,Unnamed: 1_level_1
USA,422510842796.0
CHN,39835418773.0
GBR,20245627416.0
IND,14391858718.0
CAN,9583332317.0
FRA,7259536732.0
ISR,6907514579.0
DEU,6346959822.0
JPN,3363676611.0


In [21]:
# We can inspect the codes in the above list manually with data available over internet
# and map them to the country corresponding to it.

# We can observe that :-
# USA is at the top and is an English Speaking Country. 
# CHN is second but is not an English Speaking Country based on the list above. 
# GBR will be taken as Second as it is an English Speaking Country.
# IND will be taken as Third as it is an English Speaking Country.

# Finding the top 3 english speaking countries.
top3_english = top9.loc[['USA','GBR','IND']]
top3_english

Unnamed: 0_level_0,raised_amount_usd
country_code,Unnamed: 1_level_1
USA,422510842796.0
GBR,20245627416.0
IND,14391858718.0


## Task 4: Sector Analysis

-  ### Subtask 4.1: Cleaning and Filtering master_frame
Create new columns primary_sector and other_sectors from category_list



In [22]:
# We observed earlier that category_list column has 2.97% values missing 

# Removing rows where category_list column has missing values
master_frame = master_frame[~master_frame['category_list'].isnull()]   

# Getting the column-wise null percentage
print(round(100*(master_frame.isnull().sum()/len(master_frame.index)),2))


company_permalink          0.00
funding_round_permalink    0.00
funding_round_type         0.00
funding_round_code        69.75
funded_at                  0.00
raised_amount_usd          0.00
permalink                  0.00
name                       0.00
homepage_url               3.67
category_list              0.00
status                     0.00
country_code               0.00
state_code                 1.96
region                     1.34
city                       1.33
founded_at                15.10
dtype: float64


In [23]:
# extracting the primary sector from category_list

master_frame[['primary_sector', 'other_sectors']]= master_frame['category_list'].str.split('|', n=1, expand=True)

master_frame.head(10)

Unnamed: 0,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd,permalink,name,homepage_url,category_list,status,country_code,state_code,region,city,founded_at,primary_sector,other_sectors
0,/organization/-fame,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,B,05-01-2015,10000000.0,/organization/-fame,#fame,http://livfame.com,Media,operating,IND,16,Mumbai,Mumbai,,Media,
2,/organization/-qounter,/funding-round/b44fbb94153f6cdef13083530bb48030,seed,,01-03-2014,700000.0,/organization/-qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,operating,USA,DE,DE - Other,Delaware City,04-09-2014,Application Platforms,Real Time|Social Network Media
4,/organization/0-6-com,/funding-round/5727accaeaa57461bd22a9bdd945382d,venture,A,19-03-2008,2000000.0,/organization/0-6-com,0-6.com,http://www.0-6.com,Curated Web,operating,CHN,22,Beijing,Beijing,01-01-2007,Curated Web,
6,/organization/01games-technology,/funding-round/7d53696f2b4f607a2f2a8cbb83d01839,undisclosed,,01-07-2014,41250.0,/organization/01games-technology,01Games Technology,http://www.01games.hk/,Games,operating,HKG,,Hong Kong,Hong Kong,,Games,
7,/organization/0ndine-biomedical-inc,/funding-round/2b9d3ac293d5cdccbecff5c8cb0f327d,seed,,11-09-2009,43360.0,/organization/0ndine-biomedical-inc,Ondine Biomedical Inc.,http://ondinebio.com,Biotechnology,operating,CAN,BC,Vancouver,Vancouver,01-01-1997,Biotechnology,
8,/organization/0ndine-biomedical-inc,/funding-round/954b9499724b946ad8c396a57a5f3b72,venture,,21-12-2009,719491.0,/organization/0ndine-biomedical-inc,Ondine Biomedical Inc.,http://ondinebio.com,Biotechnology,operating,CAN,BC,Vancouver,Vancouver,01-01-1997,Biotechnology,
9,/organization/0xdata,/funding-round/383a9bd2c04f7038bb543ccef5ba3eae,seed,,22-05-2013,3000000.0,/organization/0xdata,H2O.ai,http://h2o.ai/,Analytics,operating,USA,CA,SF Bay Area,Mountain View,01-01-2011,Analytics,
10,/organization/0xdata,/funding-round/3bb2ee4a2d89251a10aaa735b1180e44,venture,B,09-11-2015,20000000.0,/organization/0xdata,H2O.ai,http://h2o.ai/,Analytics,operating,USA,CA,SF Bay Area,Mountain View,01-01-2011,Analytics,
11,/organization/0xdata,/funding-round/ae2a174c06517c2394aed45006322a7e,venture,,03-01-2013,1700000.0,/organization/0xdata,H2O.ai,http://h2o.ai/,Analytics,operating,USA,CA,SF Bay Area,Mountain View,01-01-2011,Analytics,
12,/organization/0xdata,/funding-round/e1cfcbe1bdf4c70277c5f29a3482f24e,venture,A,19-07-2014,8900000.0,/organization/0xdata,H2O.ai,http://h2o.ai/,Analytics,operating,USA,CA,SF Bay Area,Mountain View,01-01-2011,Analytics,


-  ### Subtask 4.2: Reading mapping.csv
Read mapping.csv and transform the data as required.

In [24]:
# importing the csv file into sector_mapping dataframe

sector_mapping = pd.read_csv('mapping.csv', encoding = "ISO-8859-1")  

sector_mapping.head(15)

Unnamed: 0,category_list,Automotive & Sports,Blanks,Cleantech / Semiconductors,Entertainment,Health,Manufacturing,"News, Search and Messaging",Others,"Social, Finance, Analytics, Advertising"
0,,0,1,0,0,0,0,0,0,0
1,3D,0,0,0,0,0,1,0,0,0
2,3D Printing,0,0,0,0,0,1,0,0,0
3,3D Technology,0,0,0,0,0,1,0,0,0
4,Accounting,0,0,0,0,0,0,0,0,1
5,Active Lifestyle,0,0,0,0,1,0,0,0,0
6,Ad Targeting,0,0,0,0,0,0,0,0,1
7,Advanced Materials,0,0,0,0,0,1,0,0,0
8,Adventure Travel,1,0,0,0,0,0,0,0,0
9,Advertising,0,0,0,0,0,0,0,0,1


In [25]:
# Getting the column-wise null percentage in sector_mapping dataframe

print(round(100*(sector_mapping.isnull().sum()/len(sector_mapping.index)),2))

category_list                             0.15
Automotive & Sports                       0.00
Blanks                                    0.00
Cleantech / Semiconductors                0.00
Entertainment                             0.00
Health                                    0.00
Manufacturing                             0.00
News, Search and Messaging                0.00
Others                                    0.00
Social, Finance, Analytics, Advertising   0.00
dtype: float64


In [26]:
# As category_list has 0.15 % missing values,
# Hence, removing rows with missing values in category_list column.
sector_mapping = sector_mapping[~sector_mapping['category_list'].isnull()]   


# Getting the column-wise null percentage
print(round(100*(sector_mapping.isnull().sum()/len(sector_mapping.index)),2) )

category_list                             0.00
Automotive & Sports                       0.00
Blanks                                    0.00
Cleantech / Semiconductors                0.00
Entertainment                             0.00
Health                                    0.00
Manufacturing                             0.00
News, Search and Messaging                0.00
Others                                    0.00
Social, Finance, Analytics, Advertising   0.00
dtype: float64


In [27]:
# on looking into csv, we need to replace '0's with 'na' in the category_list.
# e.g. A0lytics should become Analytics
sector_mapping['category_list'] = sector_mapping['category_list'].str.replace('0','na')

# We can see that there are a total of 8 primary_sectors including others.

In [28]:
# Transforming the sector_mapping to get only the category_list and its corresponding main sector

# In sector_mapping, finding the main sector(where column value=1) for each category_list 
# and putting in main_sector column
sector_mapping['main_sector'] = sector_mapping[sector_mapping.columns.difference(['category_list'])].idxmax(axis=1)

sector_mapping.head(10)

Unnamed: 0,category_list,Automotive & Sports,Blanks,Cleantech / Semiconductors,Entertainment,Health,Manufacturing,"News, Search and Messaging",Others,"Social, Finance, Analytics, Advertising",main_sector
1,3D,0,0,0,0,0,1,0,0,0,Manufacturing
2,3D Printing,0,0,0,0,0,1,0,0,0,Manufacturing
3,3D Technology,0,0,0,0,0,1,0,0,0,Manufacturing
4,Accounting,0,0,0,0,0,0,0,0,1,"Social, Finance, Analytics, Advertising"
5,Active Lifestyle,0,0,0,0,1,0,0,0,0,Health
6,Ad Targeting,0,0,0,0,0,0,0,0,1,"Social, Finance, Analytics, Advertising"
7,Advanced Materials,0,0,0,0,0,1,0,0,0,Manufacturing
8,Adventure Travel,1,0,0,0,0,0,0,0,0,Automotive & Sports
9,Advertising,0,0,0,0,0,0,0,0,1,"Social, Finance, Analytics, Advertising"
10,Advertising Exchanges,0,0,0,0,0,0,0,0,1,"Social, Finance, Analytics, Advertising"


In [29]:
# formatting the sector_mapping further to retain only category_list and its main_sector

sector_mapping = sector_mapping[['category_list','main_sector']]

sector_mapping.head(10)


Unnamed: 0,category_list,main_sector
1,3D,Manufacturing
2,3D Printing,Manufacturing
3,3D Technology,Manufacturing
4,Accounting,"Social, Finance, Analytics, Advertising"
5,Active Lifestyle,Health
6,Ad Targeting,"Social, Finance, Analytics, Advertising"
7,Advanced Materials,Manufacturing
8,Adventure Travel,Automotive & Sports
9,Advertising,"Social, Finance, Analytics, Advertising"
10,Advertising Exchanges,"Social, Finance, Analytics, Advertising"


-  ### Subtask 4.3: Joining master_frame and sector_mapping.
Joining master_frame and sector_mapping and doing the necessary formatting.

In [30]:
master_frame['primary_sector'] = master_frame['primary_sector'].str.lower()

sector_mapping['category_list'] = sector_mapping['category_list'].str.lower()


# Merging masterframe with sector_mapping so that each primary sector gets mapped to its main sector
merged_frame = pd.merge(master_frame, sector_mapping, left_on=['primary_sector'], right_on =['category_list'] , how='left', indicator='True')


# Difference is due to the fact that we did have some primary_sectors in master frame 
# for which no mapped category_list were present in mapping file.
difference_frame = merged_frame[merged_frame['True']!='both']
difference_frame

# From this we can see that there are some primary sectors which are not in the mapping file
# Hence, We can ignore these by merging these dataframes by INNER JOIN. 

Unnamed: 0,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd,permalink,name,homepage_url,category_list_x,...,country_code,state_code,region,city,founded_at,primary_sector,other_sectors,category_list_y,main_sector,True
1995,/organization/adora-inc,/funding-round/086235eb44845d9bfdde633ce4c468b1,seed,,30-08-2015,60000.00,/organization/adora-inc,Adora Inc.,http://www.getadora.com,Cause Marketing|Charity,...,USA,AZ,Phoenix,Scottsdale,01-02-2014,cause marketing,Charity,,,left_only
4947,/organization/apollo-commercial-real-estate-fi...,/funding-round/c2cb230111ce9a4acf6fd87b97b8741e,debt_financing,,30-06-2014,50000000.00,/organization/apollo-commercial-real-estate-fi...,Apollo Commercial Real Estate Finance,http://apolloreit.com,Real Estate Investors,...,USA,NY,New York City,New York,,real estate investors,,,,left_only
6358,/organization/asia-translate,/funding-round/0e445561d485e211ccc409be15562dd9,venture,,01-09-2009,20000.00,/organization/asia-translate,Asia Translate,http://www.asiatranslate.net,English-Speaking|Translation,...,SGP,,Singapore,Singapore,03-09-2009,english-speaking,Translation,,,left_only
9758,/organization/bioserie,/funding-round/0bff83a7d549f51caece02a8b8330bc7,seed,,01-07-2014,40000.00,/organization/bioserie,Bioserie,http://www.bioserie.com,Toys,...,HKG,,Hong Kong,Hong Kong,01-11-2009,toys,,,,left_only
9759,/organization/bioserie,/funding-round/202cf002e7587e8142d8ede6bd529722,seed,,01-12-2009,100000.00,/organization/bioserie,Bioserie,http://www.bioserie.com,Toys,...,HKG,,Hong Kong,Hong Kong,01-11-2009,toys,,,,left_only
9760,/organization/bioserie,/funding-round/612627b2ad15df609019e7532e36ea86,seed,,01-07-2010,240000.00,/organization/bioserie,Bioserie,http://www.bioserie.com,Toys,...,HKG,,Hong Kong,Hong Kong,01-11-2009,toys,,,,left_only
10772,/organization/blurr-llc,/funding-round/43462148211db3ca3bc51993bdc0fa7a,seed,,01-01-2013,2370000.00,/organization/blurr-llc,Blurr,http://blurr.com/,Generation Y-Z|Photo Sharing|Social Media,...,USA,CA,Los Angeles,Los Angeles,01-01-2013,generation y-z,Photo Sharing|Social Media,,,left_only
13745,/organization/cartpay-co-,/funding-round/b4b7d1ac0a70d66f147c41b88eb1ab6d,angel,,01-12-2014,30000.00,/organization/cartpay-co-,CartPay Co.,http://cartpay.co,Enterprise Hardware|Retail,...,RUS,66,St. Petersburg,Saint Petersburg,01-06-2014,enterprise hardware,Retail,,,left_only
13746,/organization/cartpay-co-,/funding-round/fbe5ea27c9c23ca0075a0f2bd1be5fba,venture,,01-03-2015,50000.00,/organization/cartpay-co-,CartPay Co.,http://cartpay.co,Enterprise Hardware|Retail,...,RUS,66,St. Petersburg,Saint Petersburg,01-06-2014,enterprise hardware,Retail,,,left_only
13751,/organization/carusele,/funding-round/500d43468b8afcf9753f39243c97e93b,seed,,13-03-2015,250000.00,/organization/carusele,Carusele,http://carusele.com,Social Media Advertising|Social Media Marketing,...,USA,NC,Raleigh,Raleigh,01-01-2015,social media advertising,Social Media Marketing,,,left_only


In [31]:
# Merging master_frame and sector_mapping by INNER JOIN
merged_frame = pd.merge(master_frame, sector_mapping, left_on=['primary_sector'], right_on =['category_list'] , how='inner')

# dropping few unnecessary columns that came after merge
merged_frame = merged_frame.drop(['category_list_y','other_sectors'], axis=1)

# Getting the column-wise null percentage
print(round(100*(merged_frame.isnull().sum()/len(merged_frame.index)),2) )


company_permalink          0.00
funding_round_permalink    0.00
funding_round_type         0.00
funding_round_code        69.73
funded_at                  0.00
raised_amount_usd          0.00
permalink                  0.00
name                       0.00
homepage_url               3.67
category_list_x            0.00
status                     0.00
country_code               0.00
state_code                 1.95
region                     1.34
city                       1.33
founded_at                15.10
primary_sector             0.00
main_sector                0.00
dtype: float64


In [32]:
merged_frame.head(10)

Unnamed: 0,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd,permalink,name,homepage_url,category_list_x,status,country_code,state_code,region,city,founded_at,primary_sector,main_sector
0,/organization/-fame,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,B,05-01-2015,10000000.0,/organization/-fame,#fame,http://livfame.com,Media,operating,IND,16,Mumbai,Mumbai,,media,Entertainment
1,/organization/90min,/funding-round/21a2cbf6f2fb2a1c2a61e04bf930dfe6,venture,,06-10-2015,15000000.0,/organization/90min,90min,http://www.90min.com,Media|News|Publishing|Soccer|Sports,operating,GBR,H9,London,London,01-01-2011,media,Entertainment
2,/organization/90min,/funding-round/bd626ed022f5c66574b1afe234f3c90d,venture,,07-05-2013,5800000.0,/organization/90min,90min,http://www.90min.com,Media|News|Publishing|Soccer|Sports,operating,GBR,H9,London,London,01-01-2011,media,Entertainment
3,/organization/90min,/funding-round/fd4b15e8c97ee2ffc0acccdbe1a98810,venture,,26-03-2014,18000000.0,/organization/90min,90min,http://www.90min.com,Media|News|Publishing|Soccer|Sports,operating,GBR,H9,London,London,01-01-2011,media,Entertainment
4,/organization/a-dance-for-me,/funding-round/9ab9dbd17bf010c79d8415b2c22be6fa,equity_crowdfunding,,26-03-2014,1090000.0,/organization/a-dance-for-me,A Dance for Me,http://www.adanceforme.com/,Media|News|Photo Sharing|Video,operating,USA,MT,Missoula,Missoula,31-07-2011,media,Entertainment
5,/organization/akira-mobile,/funding-round/bfb170aea580e381e5b1810c87855c9e,seed,,01-08-2012,18410.0,/organization/akira-mobile,Akira Mobile,http://www.akira.lt/en,Media|Mobile|SMS|Telecommunications,operating,LTU,,,,,media,Entertainment
6,/organization/all-day-media,/funding-round/2760c0426a124b84c540bd4fd2dfe6e5,seed,,16-12-2014,2000000.0,/organization/all-day-media,ALL DAY MEDIA,http://allday.com/,Media|Social Media,operating,USA,CA,Los Angeles,Los Angeles,01-01-2013,media,Entertainment
7,/organization/all-def-digital,/funding-round/452a2342fe720285c3b92e9bd927d9ba,venture,A,06-08-2014,5000000.0,/organization/all-def-digital,All Def Digital,http://alldefdigital.com,Media,operating,USA,CA,Los Angeles,Los Angeles,,media,Entertainment
8,/organization/america-s-real-deal,/funding-round/5fbb191b97ca9789a7196de703983240,equity_crowdfunding,,19-08-2011,670000.0,/organization/america-s-real-deal,America's Real Deal,http://americasrealdeal.com/,Media|News,operating,USA,UT,Salt Lake City,Bountiful,03-04-2014,media,Entertainment
9,/organization/american-gnuity,/funding-round/07cc3eb3afd8ef5812e2b62b254b2040,equity_crowdfunding,,19-08-2011,670000.0,/organization/american-gnuity,American Gnuity,http://AmericanGnuity.com,Media|News,operating,USA,UT,Salt Lake City,Bountiful,03-04-2014,media,Entertainment


## Task 5: Sector Analysis 

-  ### Subtask 5.1: Finding out the most heavily invested main sectors in each of the three countries (for funding type FT and investments range of 5-15 M USD)
First we would create 1 common dataframe for all 3 countries then derive D1, D2, D3 from it.


In [33]:
# Filtering merged_frame for top3 countries
D = merged_frame.loc[merged_frame['country_code'].isin(['USA', 'GBR', 'IND'])]

# Filtering D for funding_round_type = venture and raised_amount_usd between 5-15 million
D = merged_frame.loc[((merged_frame['funding_round_type']=='venture') & (merged_frame.raised_amount_usd>=5000000) & (merged_frame.raised_amount_usd<=15000000))]

#The total number (or count) of investments for each main sector in a separate column based on country
D['total_number_of_investment'] = D.groupby(['country_code','main_sector'])['main_sector'].transform('count')


#The total amount invested in each main sector in a separate column
D['total_amount_invested'] = D.groupby(['country_code','main_sector'])['raised_amount_usd'].transform('sum')

# Creating specific dataframes for each country
D1 = D.loc[(D['country_code']=='USA')]
D2 = D.loc[(D['country_code']=='GBR')]
D3 = D.loc[(D['country_code']=='IND')]


-  ### Doing for country_code == USA  
      -  ### (United States)

In [34]:
# inspecting the created dataframe for USA

D1.info()

D1.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12063 entries, 7 to 88439
Data columns (total 20 columns):
company_permalink             12063 non-null object
funding_round_permalink       12063 non-null object
funding_round_type            12063 non-null object
funding_round_code            8618 non-null object
funded_at                     12063 non-null object
raised_amount_usd             12063 non-null float64
permalink                     12063 non-null object
name                          12063 non-null object
homepage_url                  11495 non-null object
category_list_x               12063 non-null object
status                        12063 non-null object
country_code                  12063 non-null object
state_code                    12061 non-null object
region                        12054 non-null object
city                          12054 non-null object
founded_at                    10390 non-null object
primary_sector                12063 non-null object
main_se

Unnamed: 0,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd,permalink,name,homepage_url,category_list_x,status,country_code,state_code,region,city,founded_at,primary_sector,main_sector,total_number_of_investment,total_amount_invested
7,/organization/all-def-digital,/funding-round/452a2342fe720285c3b92e9bd927d9ba,venture,A,06-08-2014,5000000.0,/organization/all-def-digital,All Def Digital,http://alldefdigital.com,Media,operating,USA,CA,Los Angeles,Los Angeles,,media,Entertainment,591,5099197982.0
31,/organization/chefs-feed,/funding-round/adca195749ae9ace84684723fbe75e5b,venture,A,26-02-2015,5000000.0,/organization/chefs-feed,ChefsFeed,http://www.chefsfeed.com,Media|Mobile|Restaurants|Technology,operating,USA,CA,SF Bay Area,San Francisco,01-01-2012,media,Entertainment,591,5099197982.0
61,/organization/huffingtonpost,/funding-round/7f05940c4d2dfecb8e50a0e5720e5065,venture,A,01-08-2006,5000000.0,/organization/huffingtonpost,The Huffington Post,http://www.huffingtonpost.com,Media|News|Publishing,acquired,USA,NY,New York City,New York,09-05-2005,media,Entertainment,591,5099197982.0
62,/organization/huffingtonpost,/funding-round/9241ae16e08df17ebdc064e49e23035a,venture,B,01-09-2007,5000000.0,/organization/huffingtonpost,The Huffington Post,http://www.huffingtonpost.com,Media|News|Publishing,acquired,USA,NY,New York City,New York,09-05-2005,media,Entertainment,591,5099197982.0
85,/organization/matchmine,/funding-round/41ac526630da57ad6eb9d02431b17657,venture,A,01-09-2007,10000000.0,/organization/matchmine,MatchMine,http://matchmine.com,Media|News|Reviews and Recommendations,closed,USA,MA,Boston,Needham,01-01-2007,media,Entertainment,591,5099197982.0
88,/organization/mediabong,/funding-round/9282890ca87072025dc1807f400acee6,venture,B,13-05-2015,5000000.0,/organization/mediabong,MEDIABONG,http://www.mediabong.com,Media|Semantic Search|Video,operating,USA,NY,New York City,New York,30-05-2011,media,Entertainment,591,5099197982.0
97,/organization/newscorporation,/funding-round/8f6d7c4592e43e91e8688ba342bffcb7,venture,,08-01-2010,12500000.0,/organization/newscorporation,News Corp,http://www.newscorp.com,Media|News|Publishing,ipo,USA,NY,New York City,New York,01-01-2013,media,Entertainment,591,5099197982.0
102,/organization/nokeena,/funding-round/9225f2db6b1b74892d5de6a8744b94ea,venture,A,08-08-2008,9400000.0,/organization/nokeena,Ankeena Networks,http://www.ankeena.com,Media|Software,acquired,USA,CA,SF Bay Area,Santa Clara,01-01-2008,media,Entertainment,591,5099197982.0
104,/organization/nokeena,/funding-round/fb7a617ed6d1a3203024f0f111417bfb,venture,B,25-06-2009,6500000.0,/organization/nokeena,Ankeena Networks,http://www.ankeena.com,Media|Software,acquired,USA,CA,SF Bay Area,Santa Clara,01-01-2008,media,Entertainment,591,5099197982.0
113,/organization/plumtv,/funding-round/e5109c28c1b4899b068cfa552850c424,venture,B,01-09-2009,5200000.0,/organization/plumtv,PlumTV,http://www.plumtv.com,Media|Television|Web Hosting,closed,USA,NY,New York City,New York,01-01-2002,media,Entertainment,591,5099197982.0


In [35]:
# Total number of investments  and Total amount of investment(in USD) for USA

print(D1['main_sector'].count())

print(D1['raised_amount_usd'].sum())


12063
107757097294.0


In [36]:
# Finding the top 3 sectors based on number of investments for USA

pd.DataFrame(D1.groupby('main_sector').total_number_of_investment.mean()).sort_values(by='total_number_of_investment', ascending=False)


# FOR USA:-
# We can see that 'Others' have highest total_number_of_investment 
# followed by 'Social, Finance, Analytics, Advertising' and 'Cleantech / Semiconductors'

Unnamed: 0_level_0,total_number_of_investment
main_sector,Unnamed: 1_level_1
Others,2950
"Social, Finance, Analytics, Advertising",2714
Cleantech / Semiconductors,2350
"News, Search and Messaging",1583
Health,909
Manufacturing,799
Entertainment,591
Automotive & Sports,167


In [37]:
#  For the top sector (Others) count-wise (point 3), which company received the highest investment?

D1.loc[(D1['main_sector']=='Others')].groupby('permalink')['raised_amount_usd'].sum().sort_values(ascending=False).head(10)

permalink
/organization/virtustream           64300000.00
/organization/capella               54968051.00
/organization/airtight-networks     54201907.00
/organization/decarta               52100000.00
/organization/black-duck-software   51000000.00
/organization/approva               50100000.00
/organization/five9                 49600000.00
/organization/bit9                  48433533.00
/organization/aryaka-networks       48166500.00
/organization/bti-systems           48000000.00
Name: raised_amount_usd, dtype: float64

In [55]:
# Finding the name of the company which received the highest investment in Other sector

D1.loc[(D1['permalink']=='/organization/virtustream')].name.head(1)

40935    Virtustream
Name: name, dtype: object

In [39]:
#  For the second best sector count-wise (point 3), which company received the highest investment?

D1.loc[(D1['main_sector']=='Social, Finance, Analytics, Advertising')].groupby('permalink')['raised_amount_usd'].sum().sort_values(ascending=False).head(10)

permalink
/organization/shotspotter    67933006.00
/organization/demandbase     63000000.00
/organization/intacct        61800000.00
/organization/netbase        60600000.00
/organization/lotame         59700000.00
/organization/firstrain      58344731.00
/organization/choicestream   58300000.00
/organization/optier         55600000.00
/organization/damballa       55000000.00
/organization/zoove          54800000.00
Name: raised_amount_usd, dtype: float64

In [40]:
# Finding the name of the company which received the highest investment 
# in 'Social, Finance, Analytics, Advertising' sector

D1.loc[(D1['permalink']=='/organization/shotspotter')].name.head(1)

16803    SST Inc. (Formerly ShotSpotter)
Name: name, dtype: object

-  ### Doing for country_code == GBR
     -  ### United Kingdom

In [41]:
# inspecting the created dataframe for GBR

D2.info()

D2.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 621 entries, 1 to 88247
Data columns (total 20 columns):
company_permalink             621 non-null object
funding_round_permalink       621 non-null object
funding_round_type            621 non-null object
funding_round_code            385 non-null object
funded_at                     621 non-null object
raised_amount_usd             621 non-null float64
permalink                     621 non-null object
name                          621 non-null object
homepage_url                  592 non-null object
category_list_x               621 non-null object
status                        621 non-null object
country_code                  621 non-null object
state_code                    607 non-null object
region                        592 non-null object
city                          592 non-null object
founded_at                    498 non-null object
primary_sector                621 non-null object
main_sector                   621 non-null

Unnamed: 0,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd,permalink,name,homepage_url,category_list_x,status,country_code,state_code,region,city,founded_at,primary_sector,main_sector,total_number_of_investment,total_amount_invested
1,/organization/90min,/funding-round/21a2cbf6f2fb2a1c2a61e04bf930dfe6,venture,,06-10-2015,15000000.0,/organization/90min,90min,http://www.90min.com,Media|News|Publishing|Soccer|Sports,operating,GBR,H9,London,London,01-01-2011,media,Entertainment,56,482784687.0
2,/organization/90min,/funding-round/bd626ed022f5c66574b1afe234f3c90d,venture,,07-05-2013,5800000.0,/organization/90min,90min,http://www.90min.com,Media|News|Publishing|Soccer|Sports,operating,GBR,H9,London,London,01-01-2011,media,Entertainment,56,482784687.0
225,/organization/common-interest-communities,/funding-round/8195587cbd5e51af7514ee92ef4ba6ba,venture,,09-07-2014,10000000.0,/organization/common-interest-communities,Common Interest Communities,http://commoninterestcommunities.com/,Application Platforms|Internet|Software|Startups,operating,GBR,H9,London,London,,application platforms,"News, Search and Messaging",73,615746235.0
257,/organization/geospock-ltd-,/funding-round/cf3fe3b7c86186b9f478d0ea37613f7a,venture,,01-10-2014,5460000.0,/organization/geospock-ltd-,GeoSpock Ltd.,http://www.geospock.com,Application Platforms|Databases|Real Time,operating,GBR,C3,London,Cambridge,01-01-2013,application platforms,"News, Search and Messaging",73,615746235.0
258,/organization/geospock-ltd-,/funding-round/e5e4ef4ebae63fc36ef0cd57dd20ff1c,venture,A,05-10-2015,5400000.0,/organization/geospock-ltd-,GeoSpock Ltd.,http://www.geospock.com,Application Platforms|Databases|Real Time,operating,GBR,C3,London,Cambridge,01-01-2013,application platforms,"News, Search and Messaging",73,615746235.0
367,/organization/tao-group-2,/funding-round/ad088f1deeda09f3338adfc324e32dab,venture,,06-05-2004,7000000.0,/organization/tao-group-2,Tao Group,http://tao-group.com/,Application Platforms|Consumer Electronics,operating,GBR,K7,London,Reading,01-01-1992,application platforms,"News, Search and Messaging",73,615746235.0
407,/organization/workangel,/funding-round/3ff84c41cfa24575bd6ea60b78f580a6,venture,A,19-01-2015,5000000.0,/organization/workangel,WorkAngel,http://workangel.com/,Application Platforms|Employer Benefits Progra...,operating,GBR,H9,London,London,01-01-2013,application platforms,"News, Search and Messaging",73,615746235.0
548,/organization/azimo,/funding-round/5eb768935cf9c60b402944b0f476baae,venture,A,11-03-2014,10000000.0,/organization/azimo,Azimo,http://azimo.com,Curated Web|Finance Technology|Financial Servi...,operating,GBR,H9,London,London,29-10-2012,curated web,"News, Search and Messaging",73,615746235.0
560,/organization/basekit-platform,/funding-round/8252cd70860ec66a1c7d13b6a2519dc6,venture,,13-05-2014,7000000.0,/organization/basekit-platform,BaseKit,http://www.basekit.com,Curated Web|Internet|Web Design,operating,GBR,H9,London,London,01-01-2008,curated web,"News, Search and Messaging",73,615746235.0
561,/organization/basekit-platform,/funding-round/9ceb098a822fb971db490b23c1067336,venture,,03-02-2014,6818631.0,/organization/basekit-platform,BaseKit,http://www.basekit.com,Curated Web|Internet|Web Design,operating,GBR,H9,London,London,01-01-2008,curated web,"News, Search and Messaging",73,615746235.0


In [42]:
# Total number of investments  and Total amount of investment(in USD) for GBR

print(D2['main_sector'].count())

print(D2['raised_amount_usd'].sum())

621
5379078691.0


In [43]:
# Finding the top 3 sectors based on number of investments for GBR

pd.DataFrame(D2.groupby('main_sector').total_number_of_investment.mean()).sort_values(by='total_number_of_investment', ascending=False)


# FOR GBR (United Kingdom)
# We can see that 'Others' have highest total_number_of_investment 
# followed by 'Social, Finance, Analytics, Advertising' and 'Cleantech / Semiconductors'

Unnamed: 0_level_0,total_number_of_investment
main_sector,Unnamed: 1_level_1
Others,147
"Social, Finance, Analytics, Advertising",133
Cleantech / Semiconductors,130
"News, Search and Messaging",73
Entertainment,56
Manufacturing,42
Health,24
Automotive & Sports,16


In [44]:
#  For the top sector(Others) count-wise (point 3), which company received the highest investment?

D2.loc[(D2['main_sector']=='Others')].groupby('permalink')['raised_amount_usd'].sum().sort_values(ascending=False).head(10)

permalink
/organization/electric-cloud            37000000.00
/organization/sensage                   36250000.00
/organization/enigmatic                 32500000.00
/organization/silverrail-technologies   29000000.00
/organization/opencloud                 27972766.00
/organization/myoptique-group           27694613.00
/organization/notonthehighstreet        26900000.00
/organization/acs-clothing              26239014.00
/organization/tribold                   26000000.00
/organization/ip-access                 25000000.00
Name: raised_amount_usd, dtype: float64

In [45]:
# Finding the name of the company which received the highest investment in Others sector

D2.loc[(D2['permalink']=='/organization/electric-cloud')].name.head(1)

34333    Electric Cloud
Name: name, dtype: object

In [46]:
#  For the second best sector count-wise (point 3), which company received the highest investment?

D2.loc[(D2['main_sector']=='Social, Finance, Analytics, Advertising')].groupby('permalink')['raised_amount_usd'].sum().sort_values(ascending=False).head(10)

permalink
/organization/celltick-technologies   37500000.00
/organization/mythings                34000000.00
/organization/zopa                    32900000.00
/organization/imagini                 28550000.00
/organization/marketinvoice           25553007.00
/organization/sumup                   24293649.00
/organization/amplience               24200000.00
/organization/garlik                  22350000.00
/organization/sportpursuit            22024487.00
/organization/victor                  21500000.00
Name: raised_amount_usd, dtype: float64

In [47]:
# Finding the name of the company which received the highest investment 
# in 'Social, Finance, Analytics, Advertising' sector

D2.loc[(D2['permalink']=='/organization/celltick-technologies')].name.head(1)

43677    Celltick Technologies
Name: name, dtype: object

-  ### Doing for country_code == IND
      -  ### India

In [48]:
# inspecting the created dataframe for India

D3.info()

D3.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 328 entries, 0 to 88139
Data columns (total 20 columns):
company_permalink             328 non-null object
funding_round_permalink       328 non-null object
funding_round_type            328 non-null object
funding_round_code            228 non-null object
funded_at                     328 non-null object
raised_amount_usd             328 non-null float64
permalink                     328 non-null object
name                          328 non-null object
homepage_url                  325 non-null object
category_list_x               328 non-null object
status                        328 non-null object
country_code                  328 non-null object
state_code                    327 non-null object
region                        327 non-null object
city                          327 non-null object
founded_at                    282 non-null object
primary_sector                328 non-null object
main_sector                   328 non-null

Unnamed: 0,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd,permalink,name,homepage_url,category_list_x,status,country_code,state_code,region,city,founded_at,primary_sector,main_sector,total_number_of_investment,total_amount_invested
0,/organization/-fame,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,B,05-01-2015,10000000.0,/organization/-fame,#fame,http://livfame.com,Media,operating,IND,16,Mumbai,Mumbai,,media,Entertainment,33,280830000.0
550,/organization/babajob,/funding-round/b72eaac5ea12ac0f50573ac3d6d46b8d,venture,B,28-04-2015,10000000.0,/organization/babajob,Babajob,http://www.babajob.com,Curated Web|Information Technology|Services|St...,operating,IND,19,Bangalore,Bangalore,27-08-2007,curated web,"News, Search and Messaging",52,433834545.0
593,/organization/bharat-matrimony,/funding-round/e37673bc7b0f1dfd3782f8f7abdb9ec8,venture,B,05-02-2008,11750000.0,/organization/bharat-matrimony,Bharat Matrimony,http://www.bharatmatrimony.com,Curated Web|Match-Making,operating,IND,25,Chennai,Chennai,12-03-1969,curated web,"News, Search and Messaging",52,433834545.0
640,/organization/bluestone-com,/funding-round/452a7fc1f34df2d3dcda4e28234bc671,venture,A,24-01-2012,5000000.0,/organization/bluestone-com,Bluestone.com,http://bluestone.com,Curated Web,operating,IND,19,Bangalore,Bangalore,01-01-2011,curated web,"News, Search and Messaging",52,433834545.0
642,/organization/bluestone-com,/funding-round/f5b252d6442ce231bb01586ca1821f63,venture,B,18-03-2014,10000000.0,/organization/bluestone-com,Bluestone.com,http://bluestone.com,Curated Web,operating,IND,19,Bangalore,Bangalore,01-01-2011,curated web,"News, Search and Messaging",52,433834545.0
1205,/organization/fourinteractive,/funding-round/a0ef5f6e0c69a9060c1649c076999336,venture,B,01-09-2007,10000000.0,/organization/fourinteractive,Four Interactive,http://www.fourint.com,Curated Web,operating,IND,19,Bangalore,Bangalore,01-01-2006,curated web,"News, Search and Messaging",52,433834545.0
1657,/organization/localbanya,/funding-round/087fbb346606a864c03199ec3189e67b,venture,A,09-01-2014,5000000.0,/organization/localbanya,LocalBanya,http://localbanya.com,Curated Web,operating,IND,16,Mumbai,Mumbai,01-05-2012,curated web,"News, Search and Messaging",52,433834545.0
1664,/organization/localoye,/funding-round/b56bf538e5e50ae038359a6334862a5a,venture,A,14-04-2015,5000000.0,/organization/localoye,LocalOye,http://localoye.com,Curated Web,operating,IND,16,Mumbai,Mumbai,01-01-2013,curated web,"News, Search and Messaging",52,433834545.0
1818,/organization/mobikwik,/funding-round/6cb899c717aab8ff314ca4b257124377,venture,A,01-01-2013,5000000.0,/organization/mobikwik,MobiKwik,http://www.mobikwik.com,Curated Web|Internet|Mobile|Mobile Payments|Pa...,operating,IND,10,New Delhi,Gurgaon,01-08-2009,curated web,"News, Search and Messaging",52,433834545.0
1832,/organization/money-on-mobile,/funding-round/fe6c80376b0e82118d2716049a1f411c,venture,,16-09-2013,10000000.0,/organization/money-on-mobile,Money On Mobile,http://www.money-on-mobile.net,Curated Web|Mobile,operating,IND,16,Mumbai,Mumbai,01-01-2010,curated web,"News, Search and Messaging",52,433834545.0


In [49]:
# Total number of investments  and Total amount of investment(in USD) for IND

print(D3['main_sector'].count())

print(D3['raised_amount_usd'].sum())

328
2949543602.0


In [50]:
# Finding the top 3 sectors based on number of investments for IND

pd.DataFrame(D3.groupby('main_sector').total_number_of_investment.mean()).sort_values(by='total_number_of_investment', ascending=False)


# FOR IND
# We can see that 'Others' have highest total_number_of_investment 
# followed by 'Social, Finance, Analytics, Advertising' and 'News, Search and Messaging'

Unnamed: 0_level_0,total_number_of_investment
main_sector,Unnamed: 1_level_1
Others,110
"Social, Finance, Analytics, Advertising",60
"News, Search and Messaging",52
Entertainment,33
Manufacturing,21
Cleantech / Semiconductors,20
Health,19
Automotive & Sports,13


In [51]:
#  For the top sector(Others) count-wise (point 3), which company received the highest investment?

D3.loc[(D3['main_sector']=='Others')].groupby('permalink')['raised_amount_usd'].sum().sort_values(ascending=False).head(10)

permalink
/organization/firstcry-com                                                             39000000.00
/organization/myntra                                                                   38000000.00
/organization/commonfloor                                                              32900000.00
/organization/pepperfry-com                                                            28000000.00
/organization/itzcash-card-ltd                                                         25000000.00
/organization/nxtgen-data-center-cloud-services                                        22300000.00
/organization/maharana-infrastructure-and-professional-services-private-limited-mips   21600000.00
/organization/caratlane                                                                21000000.00
/organization/comat-technologies                                                       21000000.00
/organization/limeroad                                                                 20000000.00


In [52]:
# Finding the name of the company which received the highest investment in Others sector

D3.loc[(D3['permalink']=='/organization/firstcry-com')].name.head(1)

21181    FirstCry.com
Name: name, dtype: object

In [53]:
#  For the second best sector count-wise (point 3), which company received the highest investment?

D3.loc[(D3['main_sector']=='Social, Finance, Analytics, Advertising')].groupby('permalink')['raised_amount_usd'].sum().sort_values(ascending=False).head(10)

permalink
/organization/manthan-systems                                50700000.00
/organization/komli-media                                    28000000.00
/organization/shopclues-com                                  25000000.00
/organization/intarvo                                        21900000.00
/organization/grameen-financial-services                     21556050.00
/organization/bankbazaar                                     19000000.00
/organization/microland                                      18300000.00
/organization/eka-software-solutions                         16000000.00
/organization/qyuki                                          15400000.00
/organization/financial-information-network-operations-pvt   15000000.00
Name: raised_amount_usd, dtype: float64

In [54]:
# Finding the name of the company which received the highest investment 
# in 'Social, Finance, Analytics, Advertising' sector

D3.loc[(D3['permalink']=='/organization/manthan-systems')].name.head(1)

16003    Manthan Systems
Name: name, dtype: object