## Investment Case Group Project

In [None]:
import os

import numpy as np
import pandas as pd
# Directory holding the Spark Funds input files. The default is the original
# author's local checkout; set the SPARK_FUNDS_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
dirpath = (
    os.environ.get('SPARK_FUNDS_DATA_DIR')
    or 'C:/Users/ankit.bhatia/Documents/GitHub/PythonScripts/data/spark_funds/'
)
# Later cells build paths as `dirpath + filename`, so a trailing separator is required.
if not dirpath.endswith(('/', '\\')):
    dirpath += '/'

file1 = 'companies.txt'  # tab-separated; loaded into `companies`
file2 = 'rounds2.csv'    # loaded into `rounds2`
file3 = 'mapping.csv'    # loaded into `mapping`

In [None]:
# Both files are decoded as 'iso-8859-1' because they contain accented characters.
# companies.txt is tab-separated; rounds2.csv relies on read_csv's default separator.
companies = pd.read_csv(f'{dirpath}{file1}', encoding='iso-8859-1', sep='\t')
rounds2 = pd.read_csv(f'{dirpath}{file2}', encoding='iso-8859-1')

In [None]:
# Preview the first three rows of the companies frame.
companies.head(n=3)

In [None]:
# Preview the first two rows of the rounds2 frame.
rounds2.head(n=2)

### Table 1.1
#### Q1. How many unique companies are present in rounds2?

In [None]:
# Unique company count in rounds2 (nunique() excludes NA by default).
# Permalinks are compared case-insensitively, matching the lower-casing applied
# when rounds2 is matched against companies; otherwise the same company written
# with different letter case would be counted more than once, and the answer
# would change depending on whether the merge cell has already been run.
rounds2['company_permalink'].str.lower().nunique()

#### Q2. How many unique companies are present in companies?

In [None]:
# Unique company count in companies (nunique() excludes NA by default).
# Compared case-insensitively so the figure is consistent with the lower-cased
# permalink key used when companies is matched against rounds2.
companies['permalink'].str.lower().nunique()

#### Q3. In the companies data frame, which column can be used as the unique key for each company? Write the name of the column.

In [None]:
# nunique() gives the number of distinct values per column; the column with the
# highest count is the candidate unique key.
distinct_per_column = companies.nunique()
distinct_per_column.sort_values(ascending=False).head(1)

#### Q4. Are there any companies in the rounds2 file which are not present in companies?

In [None]:
# Rows of rounds2 whose permalink (compared case-insensitively) has no match
# among the permalinks in companies.
known_permalinks = companies['permalink'].str.lower()
has_company = rounds2['company_permalink'].str.lower().isin(known_permalinks)
rounds2[~has_company]

#### Q5. Merge the two data frames so that all variables (columns) in the companies frame are added to the rounds2 data frame. Name the merged frame master_frame. How many observations are present in master_frame?

In [None]:
# Lower-case the key column of each frame (in place, so later merges on these
# keys also match) so that differences in letter case do not prevent a match.
companies['permalink'] = companies['permalink'].str.lower()
rounds2['company_permalink'] = rounds2['company_permalink'].str.lower()

# NOTE(review): an inner join keeps only rounds whose permalink exists in
# companies — confirm that dropping unmatched rounds is intended.
master_frame = companies.merge(
    rounds2,
    how='inner',
    left_on='permalink',
    right_on='company_permalink',
)
len(master_frame)

### Table 2.1 ( Average Values of Investments for Each of these Funding Types)
#### Q1/2/3/4. What is the average funding amount for each funding type?

In [None]:
# Mean raised amount (USD) for each of the four funding types of interest.
funding_type = rounds2['funding_round_type']
raised_usd = rounds2['raised_amount_usd']

avg_venture_funding_amount = raised_usd[funding_type == 'venture'].mean()
avg_angel_funding_amount = raised_usd[funding_type == 'angel'].mean()
avg_seed_funding_amount = raised_usd[funding_type == 'seed'].mean()
avg_private_equity_funding_amount = raised_usd[funding_type == 'private_equity'].mean()

# Printed in the same order as the questions: venture, angel, seed, private equity.
for average in (
    avg_venture_funding_amount,
    avg_angel_funding_amount,
    avg_seed_funding_amount,
    avg_private_equity_funding_amount,
):
    print(average)

#### Q5. Considering that Spark Funds wants to invest between 5 and 15 million USD per investment round, which investment type is the most suitable for them?

In [None]:
# transform() broadcasts each group's aggregate back onto every row of that
# group: here, the total raised amount of the row's
# (funding_round_code, funding_round_type) combination.
# The string 'sum' is passed instead of the builtin `sum`: pandas has so far
# mapped the builtin onto its own groupby sum, but that aliasing is deprecated,
# so the string pins the current behaviour.
most_suitable = (
    rounds2
    .groupby(['funding_round_code', 'funding_round_type'])['raised_amount_usd']
    .transform('sum')
)
# Add new column to the rounds2 table.
rounds2['most_suitable'] = most_suitable
# NOTE(review): this keeps rows whose *group total* lies between 5 and 15
# million USD, not rows whose own (or average) raised amount does — confirm this
# is the intended reading of the 5-15 million per-round criterion.
rounds2_filtered = rounds2.query('most_suitable >= 5000000 and most_suitable <= 15000000')
# Number of remaining rows (with a non-null funding_round_code) per funding type.
rounds2_filtered_grouping = (
    rounds2_filtered
    .groupby('funding_round_type')['funding_round_code']
    .count()
    .sort_values(ascending=False)
)
rounds2_filtered_grouping

### Table 3.1 ( Analysing the Top 3 English-Speaking Countries)
#### Q1/2/3. Which are the top, second and third English-speaking countries?

In [None]:
# Join the filtered rounds to their company record, then rank country codes by
# the summed `most_suitable` value (full ranking, highest first).
merged9 = rounds2_filtered.merge(
    companies,
    how='inner',
    left_on='company_permalink',
    right_on='permalink',
)
country_totals = merged9.groupby('country_code')['most_suitable'].sum()
top9 = country_totals.sort_values(ascending=False)
top9

### Table 3.1 ( Sector-wise Investment Analysis)
#### Q1. Total number of Investments (count)

In [None]:
# Work on a copy of companies with one extra column: the primary sector, i.e.
# the first entry of the '|'-separated category_list.
sector_wise = companies.assign(
    primary_sector=companies['category_list'].str.split('|').str[0]
)
sector_wise.head(n=3)


In [None]:
# Load the mapping file; 'iso-8859-1' is used for accented characters.
mapping = pd.read_csv(f'{dirpath}{file3}', encoding='iso-8859-1')
mapping.head(n=5)

In [None]:
def demap(df):
    """Collapse a wide 0/1 mapping frame into a two-column (key, label) table.

    For every row, the name of the column whose value equals 1 becomes that
    row's label.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide mapping frame. Its first column is carried through to the output;
        the name of the column holding a 1 is used as the row's label. The
        input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Two columns: the first column of ``df`` and ``final_value``.
        ``final_value`` is NaN for a row in which no column equals 1; when
        several columns equal 1, the left-most one is used.

    Notes
    -----
    Labels are computed per row rather than accumulated in one flat list, so a
    row with zero or several flags cannot shift the labels of the rows after
    it, and the frame may carry any index (not only the default 0..n-1).
    """
    # Every column is compared against 1; a text column simply never matches.
    flags = df.eq(1)
    # idxmax over booleans returns the first True column of each row. A row
    # without any True would get the first column's name, so blank those out.
    labels = flags.idxmax(axis=1).where(flags.any(axis=1))
    df2 = df.copy()
    df2['final_value'] = labels
    return df2.iloc[:, [0, -1]]

In [None]:
# Convert the wide mapping frame into (category_list, final_value) pairs.
mapping_new = demap(mapping)
mapping_new.head(n=5)

In [None]:
# Match each company's primary sector against the mapping's category_list, then
# drop the two suffixed category_list columns produced by the merge and rename
# the mapped label to main_sector.
companies_sector_wise = (
    sector_wise
    .merge(mapping_new, left_on='primary_sector', right_on='category_list')
    .drop(columns=['category_list_x', 'category_list_y'])
    .rename(columns={'final_value': 'main_sector'})
)

In [None]:
# Preview the sector-annotated companies frame.
companies_sector_wise.head(n=5)

In [None]:
# Combine the sector-annotated companies with the filtered funding rounds on the permalink key.
master = companies_sector_wise.merge(
    rounds2_filtered,
    left_on='permalink',
    right_on='company_permalink',
)
master.head(n=5)

Create three separate data frames, D1, D2 and D3, one for each of the three countries, containing the observations of funding type FT that fall within the 5-15 million USD range.

The three data frames should contain:

- All the columns of the master_frame along with the primary sector and the main sector
- The total number (or count) of investments for each main sector in a separate column
- The total amount invested in each main sector in a separate column

In [None]:
# One frame per country, taking the three highest-ranked country codes in top9.
# (The original comments labelled these USA, TUR and SWE, in that order.)
ranked_countries = top9.index

D1 = master.loc[master['country_code'] == ranked_countries[0]]
D2 = master.loc[master['country_code'] == ranked_countries[1]]
D3 = master.loc[master['country_code'] == ranked_countries[2]]