# Environment Setup

In [None]:
# Google Drive connection
# Tip: to access a file that was shared with you, add a shortcut to it in "My Drive".

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Libraries import

import pandas as pd
from tabulate import tabulate
from matplotlib import pyplot as plt
import seaborn as sns

# Connection test

In [None]:
# Sanity check: list the project folder to confirm the mounted Drive is readable

!ls '/content/drive/MyDrive/andresrokp/Thesis Juandavid/'

In [None]:
# Hello world: smoke-test that one Stata file can be read from the mounted Drive

file_path = '/content/drive/MyDrive/andresrokp/Thesis Juandavid/Companies.dta'
df = pd.read_stata(file_path)
print('\n---\ninfo\n---')
df.info()  # Column names, non-null counts and dtypes
print('\n---\ndescribe\n---')
print(df.describe())  # Basic summary statistics
print('\n---\nhead\n---')
df.head()  # First rows (last expression, so it is rendered as the cell output)


# Helper Functions

In [None]:
def akap_pretty_print_df(df, rows):
    """Print and return a text-table preview of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to preview.
    rows : int
        ``rows >= 0``: show the first ``rows`` and the last ``rows`` rows.
        ``rows < 0``: show only the first ``-rows * 10`` rows (no tail).

    Returns
    -------
    str
        The table rendered by ``tabulate`` (``tablefmt='pretty'``); the same
        text is also printed.

    Notes
    -----
    A row of ``'...'`` marks the rows that were left out. When the frame is
    small enough that head and tail would cover every row, the whole frame is
    shown once, without duplicated rows and without the ellipsis marker.
    """
    if rows < 0:
        rows_head, rows_tail = -rows * 10, 0
    else:
        rows_head = rows_tail = rows

    if len(df) <= rows_head + rows_tail:
        # Nothing would be omitted: head and tail overlap, so show the frame as-is.
        preview = df
    else:
        # Label the marker row '...' in the index too, so it is not mistaken for row 0.
        ellipsis_row = pd.DataFrame(
            [['...'] * len(df.columns)],
            columns=df.columns,
            index=['...'],
        )
        parts = [df.head(rows_head), ellipsis_row, df.tail(rows_tail)]
        # Skip empty slices (e.g. the tail in "head only" mode) instead of concatenating them.
        preview = pd.concat([part for part in parts if len(part)])

    payload = tabulate(preview, headers='keys', tablefmt='pretty')
    print(payload)
    return payload

def akap_txt_file_writer(df, file_name, rows,
                         output_dir='/content/drive/MyDrive/andresrokp/Thesis Juandavid'):
    """Write the preview table of ``df`` to ``<output_dir>/<file_name>.txt``.

    The table is produced by ``akap_pretty_print_df(df, rows)``, so it is also
    printed as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to preview.
    file_name : str
        Output file name, without the ``.txt`` extension.
    rows : int
        Forwarded unchanged to ``akap_pretty_print_df``.
    output_dir : str, optional
        Destination folder; defaults to the project folder on the mounted Drive.
    """
    file_path = f'{output_dir}/{file_name}.txt'
    # Render before opening the file: opening with 'w' truncates it, so a failure
    # while building the table would otherwise leave an empty file behind.
    payload = akap_pretty_print_df(df, rows)
    # Explicit encoding so non-ASCII cell values are written the same way everywhere.
    with open(file_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(payload)

# Dataset loading

In [None]:
# Data folder and file paths (single place to change if the Drive folder moves)
DATA_DIR = '/content/drive/MyDrive/andresrokp/Thesis Juandavid'
path_companies_file = f'{DATA_DIR}/Companies.dta'
path_ipo_m_a_file = f'{DATA_DIR}/IPO_M&A.dta'
path_investments_file = f'{DATA_DIR}/Investments.dta'
path_investors_file = f'{DATA_DIR}/Investors.dta'
path_vico = f'{DATA_DIR}/VICO5.0_Investors_Reputation_withOldIDs.dta'

# InvestorType values dropped during cleaning (the original note says: keep only VCs)
EXCLUDED_INVESTOR_TYPES = ['BA', 'Other']

# Read Companies data
print("Reading Companies data...")
df_companies = pd.read_stata(path_companies_file)
print("\nCompanies Data Info:")
df_companies.info()
akap_pretty_print_df(df_companies, 5)

# Read IPO M&A data
print("\nReading IPO M&A data...")
df_ipo_ma = pd.read_stata(path_ipo_m_a_file)
print("\nIPO M&A Data Info:")
df_ipo_ma.info()
akap_pretty_print_df(df_ipo_ma, 5)

# Read Investors data
print("\nReading Investors data...")
df_investors = pd.read_stata(path_investors_file)
# Cleaning: drop the excluded investor types (rows with a missing type are kept)
print("\nCleaning Investors data...")
df_investors_clean = df_investors[~df_investors['InvestorType'].isin(EXCLUDED_INVESTOR_TYPES)]
print("\nInvestors Clean Data Info:")
df_investors_clean.info()
akap_pretty_print_df(df_investors_clean, 5)

# Read Investments data
print("\nReading Investments data...")
df_investments = pd.read_stata(path_investments_file)
# Remember the original investment columns so the helper columns from the merge can be dropped again.
df_investments_columns = df_investments.columns.tolist()
# NOTE: from here on `df_investments` is the investments table left-merged with the
# investor attributes (needed to filter on InvestorType), not the raw file contents.
df_investments = pd.merge(df_investments, df_investors, on='InvestorID', how='left')
print("\nCleaning Investments data...")
# Drop excluded investor types and placeholder IDs containing 'missing'.
# na=False keeps the mask boolean even if an InvestorID is NaN.
df_investments_merge_clean = df_investments[
    ~df_investments['InvestorType'].isin(EXCLUDED_INVESTOR_TYPES)
    & ~df_investments['InvestorID'].str.contains('missing', na=False)
]
df_investments_clean = df_investments_merge_clean[df_investments_columns]
print("\nInvestments Clean Data Info:")
df_investments_clean.info()
akap_pretty_print_df(df_investments_clean, 5)

# Read VICO5 data (currently disabled)
# print("\nReading vico5_BNT data...")
# df_vico5_BNT = pd.read_stata(path_vico)
# print("\nVICO5 Data Info:")
# df_vico5_BNT.info()
# print("Number of Rows in vico5_BNT Data:", len(df_vico5_BNT))

# Interesting table builder

In [None]:
# Build the wide investments table through three successive left merges,
# then cut lean column subsets of each stage and dump their previews to text files.

# MERGING
print('\n\n\n----------MERGING----------')

# Stage 1: cleaned investments + cleaned investor attributes, joined on InvestorID
df_merge_investments_investors = df_investments_clean.merge(
    df_investors_clean,
    on='InvestorID',
    how='left',
)
print( '\n<o>\n info df_merge_investments_investors.info() \n' )
df_merge_investments_investors.info()
akap_pretty_print_df(df_merge_investments_investors, 10)

# Stage 2: + company attributes, joined on CompanyID
df_merge_investments_investors_companies = df_merge_investments_investors.merge(
    df_companies,
    on='CompanyID',
    how='left',
)
print( '\n<o>\n info df_merge_investments_investors_companies.info() \n' )
df_merge_investments_investors_companies.info()
akap_pretty_print_df(df_merge_investments_investors_companies, 10)

# Stage 3: + IPO / M&A outcomes, joined on CompanyID
df_merge_investments_investors_companies_ipoma = df_merge_investments_investors_companies.merge(
    df_ipo_ma,
    on='CompanyID',
    how='left',
)
print( '\n<o>\n info df_merge_investments_investors_companies_ipoma.info() \n' )
df_merge_investments_investors_companies_ipoma.info()
akap_pretty_print_df(df_merge_investments_investors_companies_ipoma, 10)



# LEAN df // STANDARDIZE var NAME
print('\n\n\n----------LEAN df // STANDARIZE var NAME----------')

# View 1: investor-level columns of the stage-1 merge
df_interesante_1 = df_merge_investments_investors[[
    'CompanyID',
    'InvestorID',
    'InvestorName',
    'InvestorNation',
    'InvestorType',
    'TotalEquityInvested_round_thEUR',
]]
print( '\n<o>\n info df_interesante_1.info() \n' )
df_interesante_1.info()
akap_txt_file_writer(df_interesante_1, 'df_merge_investments_investors', 10)

# View 2: investment, company and investor columns of the stage-2 merge
df_interesante_2 = df_merge_investments_investors_companies[[
    'InvestmentDate',
    'CompanyID',
    'CompanyNUTS3Name',
    'NACERev2corcodesdes',
    'FirstInvestmentYear',
    'CompanyNation',
    'TotalEquityInvested_round_thEUR',
    'InvestorID',
    'InvestorName',
    'InvestorType',
    'InvestorNation',
]]
print( '\n<o>\n info df_interesante_2.info() \n' )
df_interesante_2.info()
akap_txt_file_writer(df_interesante_2, 'df_merge_investments_investors_companies', 10)

# View 3: the same columns as view 2 (same order) followed by the IPO / acquisition columns
df_interesante_3 = df_merge_investments_investors_companies_ipoma[[
    *df_interesante_2.columns,
    'IPO_dummy',
    'IPODate',
    'IPODealvaluethEUR',
    'Acquisition_dummy',
    'AcqDate',
    'AcqDealvaluethEUR',
]]
print( '\n<o>\n info df_interesante_3.info() \n' )
df_interesante_3.info()
akap_txt_file_writer(df_interesante_3, 'df_merge_investments_investors_companies_ipoma', 300)

# --- Disabled steps below (kept for reference) ---

# Re-pointing of the working frame
# df_interesante = df_interesante_2

# GROUPING (appearances per investor)
# print('\n\n\n----------AGRUPAMIENTOS----------')
# df_interesante_grouped = df_interesante.groupby(['InvestorID', 'InvestorName', 'InvestorNation', 'InvestorType']) \
#                                          .size() \
#                                          .reset_index(name='InvestorCount') \
#                                          .sort_values(by='InvestorCount', ascending=False) \
#                                          .reset_index(drop=True)
# print( '\n<o>\n info df_interesante_1_grouped.info() \n' )
# df_interesante_grouped.info()
# akap_pretty_print_df(df_interesante_grouped, 10)


# CUT TO THE TOP 25%
# print('\n\n\n----------MOCHADA TOP 25----------')
# print("\n<o>\n info df_merge_investments_investors_grouped_top25 conteo de apariciones TOP 25% \n")
# df_interesante_grouped_top25 = df_interesante_grouped.head(round(len(df_interesante_grouped)*0.25))
# akap_pretty_print_df(df_interesante_grouped_top25, 10)

In [None]:
# `df_interesante_1_grouped` is not assigned by any earlier cell in this notebook (the
# grouping step in the table-builder cell is commented out), so on a fresh kernel this
# cell failed with NameError. Build it here with the same chain as that disabled step:
# one row per investor, counting its appearances in df_interesante_1, most frequent first.
# NOTE(review): the source frame is inferred from the variable name - confirm that
# df_interesante_1 (rather than df_interesante_2) is the intended input.
df_interesante_1_grouped = (
    df_interesante_1
    .groupby(['InvestorID', 'InvestorName', 'InvestorNation', 'InvestorType'])
    .size()
    .reset_index(name='InvestorCount')
    .sort_values(by='InvestorCount', ascending=False)
    .reset_index(drop=True)
)
df_interesante_1_grouped

In [None]:
# Horizontal bar charts: number of rows of the grouped investor table per nation,
# then per investor type. Each figure is followed by a '.' separator line.
for group_col in ('InvestorNation', 'InvestorType'):
    ax = (
        df_interesante_1_grouped
        .groupby(group_col)
        .size()
        .plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
    )
    # Hide the top and right frame lines of the axes just drawn
    ax.spines[['top', 'right']].set_visible(False)
    plt.show()
    print('.')

# Line chart of InvestorCount in the table's row order
ax = df_interesante_1_grouped['InvestorCount'].plot(
    kind='line',
    figsize=(8, 4),
    title='InvestorCount',
)
ax.spines[['top', 'right']].set_visible(False)
plt.show()
print('.')

# Model 4: Hochberg, Ljungqvist, and Lu (2007)

In [None]:
# Column layout of df_interesante_3, pasted from an earlier .info() run:
#
#  #   Column                           Non-Null Count  Dtype
# ---  ------                           --------------  -----
#  0   InvestmentDate                   71077 non-null  datetime64[ns]
#  1   CompanyID                        71077 non-null  object
#  2   CompanyNUTS3Name                 71077 non-null  object
#  3   NACERev2corcodesdes              71077 non-null  object
#  4   FirstInvestmentYear              71077 non-null  float32
#  5   CompanyNation                    71077 non-null  object
#  6   TotalEquityInvested_round_thEUR  53320 non-null  float64
#  7   InvestorID                       71077 non-null  object
#  8   InvestorName                     71077 non-null  object
#  9   InvestorType                     71077 non-null  object
#  10  InvestorNation                   71077 non-null  object
#  11  IPO_dummy                        10064 non-null  float32
#  12  IPODate                          2344 non-null   datetime64[ns]
#  13  IPODealvaluethEUR                10064 non-null  object
#  14  Acquisition_dummy                10064 non-null  float32
#  15  AcqDate                          8278 non-null   datetime64[ns]
#  16  AcqDealvaluethEUR                10064 non-null  object


def _count_rows_by(frame, keys):
    """Count the rows of `frame` per combination of `keys`.

    Returns a frame with the key columns plus an 'InvestorCount' column,
    sorted by that count in descending order and re-indexed from 0.
    """
    return (
        frame.groupby(keys)
        .size()
        .reset_index(name='InvestorCount')
        .sort_values(by='InvestorCount', ascending=False)
        .reset_index(drop=True)
    )


df_modelo_4 = df_interesante_3

# Keep only the rows flagged with IPO_dummy == 1
investments_end_in_ipo = df_modelo_4.loc[df_modelo_4['IPO_dummy'].eq(1)]
print( '\n<o>\n info investments_end_in_ipo.info() \n' )
investments_end_in_ipo.info()
akap_pretty_print_df(investments_end_in_ipo, 20)

# TODO: keep only IPOs from the last 5 years (not implemented yet)

print('\n\n\n----------AGRUPAMIENTOS----------')
# Appearances per investor among the IPO-flagged rows
investments_end_in_ipo_grouped_by_investors = _count_rows_by(
    investments_end_in_ipo,
    ['InvestorID', 'InvestorName', 'InvestorNation', 'InvestorType'],
)
print( '\n<o>\n info investments_end_in_ipo_grouped_by_investors.info() \n' )
investments_end_in_ipo_grouped_by_investors.info()
# Negative row count: the helper prints only the first 50 rows
akap_pretty_print_df(investments_end_in_ipo_grouped_by_investors, -5)

# Appearances per (investor name, company NUTS3 region) pair.
# Other candidate grouping keys:
# ['InvestorName', 'InvestorNation', 'InvestorType', 'CompanyNUTS3Name', 'CompanyNation', 'NACERev2corcodesdes']
investments_end_in_ipo_grouped_by_investors_and_company = _count_rows_by(
    investments_end_in_ipo,
    ['InvestorName', 'CompanyNUTS3Name'],
)
print( '\n<o>\n info investments_end_in_ipo_grouped_by_investors_and_company.info() \n' )
investments_end_in_ipo_grouped_by_investors_and_company.info()
akap_pretty_print_df(investments_end_in_ipo_grouped_by_investors_and_company, 10)

# Other

In [None]:
# Inspect the runtime: dir() returns the names currently defined in the session.
# Names that start with an underscore are left out of the listing.

[name for name in dir() if not name.startswith('_')]

In [None]:
# Number of rows in df_investments carrying one specific InvestorID (0 if the ID never appears).
# Note: by this point df_investments has been reassigned to the frame left-merged with
# df_investors in the dataset-loading cell.
investor_count = df_investments['InvestorID'].value_counts().get('VCInvestor002335', 0)
investor_count