In [214]:
import pandas as pd
import numpy as np
import os

from datetime import datetime
import time
import re


In [215]:
# Path to the qualitative fund workbook, relative to the notebook's working directory
file_path= '../Data/Quali_1.xlsx'
# Load the workbook using pandas' default read_excel settings
df_quali = pd.read_excel(file_path)

In [216]:
# Store file Last Mod Date and Time
# os.path.getmtime returns the file's modification time as a timestamp in seconds
modx = os.path.getmtime(file_path)
# Naive local-time datetime for that timestamp; the notebook later uses it as the
# reference date when computing fund age in months.
# NOTE(review): mtime changes whenever the file is copied or re-saved — confirm this
# is the intended "as of" date for the data.
mod_file_date = datetime.fromtimestamp(modx)

### Filters
1. Only consider funds that are currently operating
2. Considering Funds that are open for new investments
3. Considering Funds that any person can invest in, i.e. that are not exclusive to a single party
4. Considering Funds with "Minimum Investment" <> "Not Informed"
5. Removing null benchmark rows
6. Removing rows where 'Leveraged Anbima' equals 'Non-classified'
7. Removing rows where 'Private Credit Anbima' equals 'Non-classified'

In [217]:
# Applying filters: one boolean term per rule, combined into a single row mask.
# Every term is evaluated row by row, so AND-ing them keeps exactly the rows that
# would survive applying the filters one after another.
rows_to_keep = (
    # 1) fund is operating normally and has no closing date
    (df_quali['Current Situation'] == "In normal operation")
    & df_quali['Closing Date'].isnull()
    # 2) fund accepts new investments
    & (df_quali['Open to Investments'] == "Yes")
    # 3) fund is neither restricted to qualified investors nor exclusive
    & (df_quali['Qualified Investor'] == "No")
    & (df_quali['Exclusive Fund'] == "No")
    # 4) minimum investment is informed
    & (df_quali['Minimum Investment'] != "Not informed")
    # 5) benchmark is present
    & df_quali['Benchmark'].notna()
    # 6) leverage classification is known
    & (df_quali['Leveraged Anbima'] != "Non-classified")
    # 7) private-credit classification is known
    & (df_quali['Private Credit Anbima'] != "Non-classified")
)
df_quali = df_quali[rows_to_keep]


### Column Transformation

#### Handling Management Fee (Information resides in 2 columns)
 - Fund Management fee may oscillate, so I will consider the maximum fee.
 - When Management fee (Maximum) is not informed I'll use the regular Management fee.

In [218]:
# Row-wise logic function, meant to be passed to DataFrame.apply(axis=1)
def transform_fee(row):
    """Choose the management fee to use for a single fund row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must expose 'Management Fee (Maximum)'; 'Management Fee' is read only
        when the maximum holds the "There is not" placeholder.

    Returns
    -------
    The maximum management fee, or the regular management fee when the
    maximum is the "There is not" placeholder.
    """
    max_fee = row['Management Fee (Maximum)']
    if max_fee == "There is not":
        # No maximum was informed: fall back to the regular fee
        return row['Management Fee']
    return max_fee

# Saving results to the data frame: 'Management Fee' is overwritten with the value
# chosen by transform_fee for each row (axis=1 passes one row at a time)
df_quali['Management Fee'] = df_quali.apply(transform_fee, axis=1)


#### Performance Fee Index
- Grouping performance fee value if normalized count < 1 %

In [219]:
# Peek at the eight most frequent reference indexes before grouping the rare ones
df_quali['Performance Fee Reference Index'].value_counts().head(8)

There is not              1097
100% do CDI                420
100% do Ibovespa           152
100% do IBX                 26
100% do IMA-B 5             10
100% do IPCA + 6% a.a.       9
100% do IMA-B                8
100% do SMLL                 8
Name: Performance Fee Reference Index, dtype: int64

In [220]:
def group_categories(df,column_name,row_replace_Value = "Other",limit = 0.01):
    """Collapse rare categories of one column into a single replacement label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place and also returned.
    column_name : str
        Name of the categorical column to regroup.
    row_replace_Value : str, default "Other"
        Label written over every value that is not kept.
    limit : float, default 0.01
        Minimum share of the non-null rows (inclusive) a category needs in
        order to be kept as is.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``column_name`` regrouped.

    Notes
    -----
    Missing values are not counted by ``value_counts``, so they never reach
    the threshold and are replaced by ``row_replace_Value`` as well.
    """
    shares = df[column_name].value_counts(normalize=True)
    # Boolean selection instead of Series.iteritems(), which was removed in
    # pandas 2.0; a set gives constant-time membership checks in the map below.
    kept_values = set(shares[shares >= limit].index)
    df[column_name] = df[column_name].map(
        lambda value: value if value in kept_values else row_replace_Value
    )
    return df

In [221]:
# Grouping: reference indexes with a share below 1% become "Other_performance_fee"
df_quali = group_categories(df_quali,'Performance Fee Reference Index',"Other_performance_fee",0.01)

# The "There is not" placeholder in 'Performance Fee' is replaced with 0;
# every other value is kept unchanged
df_quali['Performance Fee'] = df_quali['Performance Fee'].map(lambda x: 0 if x == "There is not" else x)

#### Benchmark Type
- Grouping Benchmark type if normalized count is < 1%
- Renaming some benchmark typos

In [222]:
# Grouping: benchmarks with a share below 1% become "Other_Benchmark"
df_quali = group_categories(df_quali, 'Benchmark', "Other_Benchmark", 0.01)

# Typos fix: fold the "Not informed" / "Not defined" spellings into "Not Informed".
# NOTE(review): this runs after the grouping above, so each spelling is counted
# separately against the 1% threshold — confirm that is intended.
df_quali['Benchmark'] = (
    df_quali['Benchmark']
    .str.replace("Not informed", "Not Informed")
    .str.replace("Not defined", "Not Informed")
)

### Creating column with Fund age in months

In [223]:
def diff_month(d1, d2):
    """Return the number of calendar months from ``d2`` to ``d1``.

    Only the ``year`` and ``month`` attributes are used; day-of-month and
    time are ignored. The result is negative when ``d1`` is earlier than ``d2``.
    """
    years_apart = d1.year - d2.year
    months_apart = d1.month - d2.month
    return 12 * years_apart + months_apart

In [224]:
# Fund age in calendar months, measured from each fund's start date up to mod_file_date
df_quali['age_months'] = df_quali['Start Date of the Fund'].map(lambda x: diff_month(mod_file_date,x))

### Selecting only funds with age >= 24 months

In [225]:
# Keep funds that are at least 24 months old (the bound is inclusive)
df_quali = df_quali[df_quali['age_months']>=24]

### Dropping Columns

In [226]:
# Columns removed from the qualitative frame before modelling; this includes the
# columns already consumed by the filters and transformations above.
drop_columns = [
    'Custodian',
    'Administrator',
    'Director',
    'Headquarter (State)',
    'Headquarters (City)',
    'Headquarters (Neighborhood)',
    'Fee Notes',
    'Management Company',
    'Management Company Name',
    'Management Fee Charging Frequency',
    'Management Fee Provision',
    'Maximum Balance',
    'Minimum Balance',
    'Minimum Movement',
    'Minimum Redemption',
    'Performance Fee Charging Frequency',
    'Performance Fee Criterion',
    'Performance Fee Provision',
    'Performance Fee Water Mark',
    'Purpose of the fund',
    'Quantum Category',
    'Redemption Fee',
    'Redemption Fee Exemption',
    'Redemption NAV',
    'Transactions Notes',
    'Type of Disclosure',
    "Fund's Length",
    'Closing Date',
    'Target Investor',
    'Master',
    'Feeder',
    'Current Situation',
    'Management Fee Type',
    'Open to Investments',
    'Foreign Investment Anbima',
    'Financial Management Fee',
    'Exclusive Fund',
    'Management Fee (Maximum)',
    'Availability of Recovered Resources',
    'Lockup Period',
    'Investment NAV',
    'Join Date',
    'Start Date of the Fund',
    'Qualified Investor',
    'Entidade de Previdência 3922_US',
    'Entidade de Previdência 4444_US',
    'Entidade de Previdência 4661_US',
    'Legal Classification',
]
# Raises KeyError if any listed column is missing from the frame
df_quali.drop(columns=drop_columns, inplace=True)

In [227]:
df_quali.iloc[9] # Checking single row (iloc is positional: the 10th remaining row)

Name                               ADAM MACRO 1 FIC MULTIMERCADO
Benchmark                                   CDI (Interbank Rate)
CNPJ                                          24.119.419/0001-25
CVM Category                                         Multimarket
Leveraged Anbima                                             Yes
Management Fee                                             0.025
Minimum Investment                                         10000
Performance Fee                                              0.2
Performance Fee Reference Index                      100% do CDI
Portfolio Manager                                   Márcio Appel
Private Credit Anbima                                         No
Tax Classification                                     Long Term
Liquidity Ratios                                              22
age_months                                                    33
Name: 50, dtype: object

### Engineering Dummies with categorical features

In [229]:
# Show the level counts of every categorical column that will be dummy-encoded
print("PRINTING VALUE COUNTS FOR CATEGORICAL COLUMNS:\n")
for _categorical_col in (
    'Tax Classification',
    'Private Credit Anbima',
    'Leveraged Anbima',
    'CVM Category',
    'Performance Fee Reference Index',
    'Benchmark',
):
    print(_categorical_col + ':\n', df_quali[_categorical_col].value_counts(), '\n')



PRINTING VALUE COUNTS FOR CATEGORICAL COLUMNS:

Tax Classification:
 Long Term     958
Equities      370
Short Term     52
Exempt          9
Name: Tax Classification, dtype: int64 

Private Credit Anbima:
 No                913
Not applicable    317
Yes               159
Name: Private Credit Anbima, dtype: int64 

Leveraged Anbima:
 No     987
Yes    402
Name: Leveraged Anbima, dtype: int64 

CVM Category:
 Fixed Income    568
Multimarket     435
Equities        350
FX               34
Index Market      2
Name: CVM Category, dtype: int64 

Performance Fee Reference Index:
 There is not             921
100% do CDI              284
100% do Ibovespa         114
Other_performance_fee     48
100% do IBX               22
Name: Performance Fee Reference Index, dtype: int64 

Benchmark:
 CDI (Interbank Rate)    720
Not Informed            339
Ibovespa                181
Other_Benchmark          69
IBX                      37
Dollar                   28
IMA-B                    15
Name: Benchma

In [230]:
## Manually converting column CVM Category to dummies
# One 0/1 indicator per listed category; a row whose category is not listed gets
# 0 in all four indicators.
for _cvm_category in ('Fixed Income', 'Multimarket', 'Equities', 'FX'):
    df_quali['CVM Category_' + _cvm_category] = np.where(
        df_quali['CVM Category'] == _cvm_category, 1, 0
    )
# The source column is no longer needed once the indicators exist
df_quali.drop(columns='CVM Category', inplace=True)

In [231]:
# Remaining categorical columns, dummy-encoded by pandas
columns_to_dummy = ['Tax Classification','Private Credit Anbima','Leveraged Anbima',
                   'Performance Fee Reference Index','Benchmark']
# drop_first=True omits the first level of each column, which becomes the implicit baseline
df_quali = pd.get_dummies(df_quali, columns=columns_to_dummy,drop_first=True)

# EDA Quantitative Features

In [232]:
# Path to the quantitative fund workbook.
# NOTE: this rebinds file_path, which earlier pointed at the qualitative workbook;
# re-running the modification-date cell after this one would read this file's mtime.
file_path= '../Data/Quant_1.xlsx'
df_quant = pd.read_excel(file_path)

### Renaming Columns

In [233]:
# Function to correct col names
def fix_col_names(col_str):
    """Append a time-window suffix to a quantitative column name.

    Repeated column headers arrive with a trailing ".1" / ".2" marker. The
    unmarked name is tagged as the 6-month window, ".1" as 12 months and ".2"
    as 24 months.

    Parameters
    ----------
    col_str : str
        Original column name.

    Returns
    -------
    str
        * "CNPJ" is returned unchanged (it is the join key).
        * "<name>.1" becomes "<name>_12m" and "<name>.2" becomes "<name>_24m".
        * A name ending in any other ".<char>" marker is returned unchanged.
        * Every other name gets "_6m" appended.
    """
    if col_str == "CNPJ":
        return col_str

    window_by_marker = {'.1': '_12m', '.2': '_24m'}

    # Only a trailing ".<char>" counts as a duplicate marker. The length guard
    # keeps one-character names from raising IndexError.
    if len(col_str) >= 2 and col_str[-2] == ".":
        window = window_by_marker.get(col_str[-2:])
        if window is None:
            # Unknown marker: leave the name untouched
            return col_str
        # Swap the trailing marker only; a ".1"/".2" elsewhere in the name
        # is part of the name and must not be rewritten.
        return col_str[:-2] + window

    return col_str + '_6m'

In [234]:
# Renaming the two last-assets columns (value and its date)
df_quant.rename(
    columns={
        'Patrimônio Líquido final da série': 'Last_Assets',
        'Patrimônio Líquido final da série.1': 'Last_Assets_date',
    },
    inplace=True,
)
# Adding time period to column names: the first three columns keep their name,
# every later one goes through fix_col_names; spaces become underscores everywhere.
cols = df_quant.columns
df_quant.columns = [
    (fix_col_names(col_name) if position > 2 else col_name).replace(" ", "_")
    for position, col_name in enumerate(cols)
]


In [235]:
df_quant.columns # Checking column names after the rename

Index(['Name', 'Last_Assets', 'Last_Assets_date', 'Assets_Flow_6m',
       'Assets_Flow_12m', 'Assets_Flow_24m', 'Average_Assets_6m',
       'Average_Assets_12m', 'Average_Assets_24m', 'Return_6m', 'Return_12m',
       'Return_24m', 'Volatility_6m', 'Volatility_12m', 'Volatility_24m',
       'Excess_Return_-_CDI_Opening_6m', 'Excess_Return_-_CDI_Opening_12m',
       'Excess_Return_-_CDI_Opening_24m', 'Excess_Return_-_Ibovespa_6m',
       'Excess_Return_-_Ibovespa_12m', 'Excess_Return_-_Ibovespa_24m',
       'Excess_Return_-_Dollar_6m', 'Excess_Return_-_Dollar_12m',
       'Excess_Return_-_Dollar_24m', 'CNPJ'],
      dtype='object')

### Filtering & Merging & Fixing Data Types
1. Only funds selected in the Qualitative Data Frame
2. Rows with no null value.
3. Creating Column with Manager Name
4. Converting to numeric

In [236]:
#1) keep only the funds still present in the qualitative frame
df_quant = df_quant[df_quant['CNPJ'].isin(df_quali['CNPJ'])]

#2) keep only rows that have a 24-month excess return over CDI
#   (df_quant.isnull().sum().sum() can be used to check for remaining nulls)
df_quant = df_quant[df_quant['Excess_Return_-_CDI_Opening_24m'].notnull()]

# 3) bring in the manager name from the qualitative frame (default inner join on CNPJ)
df_quant = df_quant.merge(df_quali[['CNPJ','Portfolio Manager']], on='CNPJ')

# 4) every column that is not an identifier, text or date column is converted to numeric
#    (df_quant.dtypes can be used to check the result)
string_columns = ["Last_Assets_date","Name","CNPJ","Portfolio Manager"]
numeric_cols = df_quant.columns[~df_quant.columns.isin(string_columns)]
df_quant[numeric_cols] = df_quant[numeric_cols].apply(pd.to_numeric)

### Feature Engineering columns with Manager Information

In [237]:
# One row per portfolio manager (the index), holding the number of funds in
# df_quant that list that manager
managers = df_quant['Portfolio Manager'].value_counts().to_frame(name="#_Funds_managed")

In [238]:
# Spot check: Last_Assets values of the funds run by a single manager.
# NOTE(review): `x` is not referenced by any later cell shown here — candidate for removal.
x = df_quant[df_quant['Portfolio Manager']=='Eduardo Alves de Castro']['Last_Assets'].values

In [239]:
# Average Last_Assets over the funds of each manager, computed in the order of
# managers.index so the list lines up with the frame's rows
_manager_of_fund = df_quant['Portfolio Manager']
managers['Manager_avg_Assets'] = [
    np.average(df_quant.loc[_manager_of_fund == _manager_name, 'Last_Assets'].values)
    for _manager_name in managers.index.values
]

In [240]:
# Attach to every fund the number of funds its manager runs.
# Mapping through the count column keeps the count's integer dtype. Looking it up
# with managers.loc[manager][...] goes through a whole row of `managers`, and that
# row is upcast to float as soon as the frame also holds a float column, which
# silently turns the counts into floats. The map is also a single vectorised lookup
# instead of one .loc call per fund.
df_quant["#_Funds_managed"] = df_quant['Portfolio Manager'].map(managers['#_Funds_managed'])

In [241]:
# Attach to every fund the average Last_Assets of its manager's funds.
# A single Series.map lookup replaces one managers.loc[...] row lookup per fund;
# the mapped values are identical for every manager present in `managers`.
df_quant["Manager_avg_Assets"] = df_quant['Portfolio Manager'].map(managers['Manager_avg_Assets'])

# Joining Qualitative and Quantitative Data Frames

In [242]:
df_quant.shape # Dimension check

(1376, 28)

In [243]:
df_quali.shape # Dimension check

(1389, 28)

In [244]:
# Drop the manager and fund name from the quantitative side before joining, so
# the merge does not produce suffixed duplicates of columns df_quali also carries.
# NOTE: the drop is in place, so re-running this cell raises KeyError.
df_quant.drop(columns=['Portfolio Manager', 'Name'], inplace=True)
# Default inner join: only funds present in both frames end up in final_df
final_df = df_quali.merge(df_quant, on='CNPJ')

In [245]:
final_df.shape # Dimension check

(1376, 53)

## Saving final csv

In [246]:
# Write the merged frame next to the notebook; index=False leaves the row index out of the file
final_df.to_csv('./clean_df.csv',index=False)