# Financial Analytics - TP Final
## Integrantes: 
### - *Franco Ferrari*
### - *Aldo Escobar*
### - *Damian Izanotegui*
### - *Nahuel Sanchez*

## Análisis Exploratorio

Cargamos los datasets, computamos las medias móviles, y ploteamos la apariencia inicial del dataset

In [None]:
# Imports
import pandas as pd
import yfinance as yf
import numpy as np
from pandas_datareader import data as pdr
import datetime
import matplotlib.pyplot as plt
import numba as nb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, classification_report, roc_auc_score

import os
import sys
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)
from notebooks.mlfinlab.corefns.core_functions import CoreFunctions
from notebooks.mlfinlab.corefns.financial_functions import FinancialFunctions

# Functions

#Cargamos ambos files y los unimos, usando como indices en ambos las fechas.
def load_and_join():
    """Download the MTUM ETF prices and the AQR factor premia and join them by date.

    The ETF prices are fetched through ``pandas_datareader`` (patched by
    ``yf.pdr_override()``) for 2000-11-30 .. 2019-11-30. The factor data is
    read from the AQR "Century of Factor Premia" monthly spreadsheet and
    reduced to the four equity-index factor columns.

    Returns:
        DataFrame indexed like the ETF price frame, with the four factor
        columns attached by a left join on the index. Because the join is a
        left join, factor rows whose date is not in the ETF index are dropped
        and ETF rows without a matching factor date hold NaN in those columns.
    """
    stocks = ["MTUM"]
    start = datetime.datetime(2000,11,30)
    end = datetime.datetime(2019,11,30)

    # Route pandas_datareader's Yahoo reader through yfinance.
    yf.pdr_override()

    df_etf = pdr.get_data_yahoo(stocks, start=start, end=end)

    df = pd.read_excel('https://images.aqr.com/-/media/AQR/Documents/Insights/Data-Sets/Century-of-Factor-Premia-Monthly.xlsx',
                      header =18, nrows = 1220)

    df['Date'] =  pd.to_datetime(df['Date'])

    df = df.set_index('Date')
    df = df[['Equity indices Value','Equity indices Momentum','Equity indices Carry','Equity indices Defensive']]

    # Keep every ETF row; factor values are attached only where the dates match exactly.
    df_final = df_etf.merge(df, how='left',left_index=True,right_index=True)

    return df_final

#Extendemos los datos mensuales a los registros diarios. (Ejemplo: todos los datos de enero, tomaran el valor monthly del 31/01)
def fill_joined_missing_fields(df):
    """Back-fill the joined columns (positions 6..9) of ``df`` in place.

    Rows are visited from the second-to-last one up to the first. Whenever the
    value at column position 9 is missing, columns 6, 7, 8 and 9 of that row
    are overwritten with the values of the row right below it, so a value
    propagates upward through a run of missing rows.

    The last row is never modified because it has no following row to copy
    from. (The previous implementation used ``-i+1`` as the source index,
    which is ``0`` for the last row, so it copied from the FIRST row.)

    Args:
        df: DataFrame with at least 10 columns; modified in place.

    Returns:
        None.
    """
    first_col, flag_col = 6, 9
    for pos in range(len(df) - 2, -1, -1):
        if pd.isna(df.iloc[pos, flag_col]):
            for col in range(first_col, flag_col + 1):
                df.iloc[pos, col] = df.iloc[pos + 1, col]
         

#Introducimos labels al dataset, calculando dos medias móviles (las columnas se llaman 50 y 200 días, pero las ventanas usadas son de 10 y 30 observaciones).
def labeling_df(df, short_window=10, long_window=30):
    """Add two rolling means of the column at position 3 and a Buy/Sell label.

    NOTE: the output columns are called ``50_days_average`` and
    ``200_days_average`` for historical reasons, but the default windows are
    10 and 30 rows. Pass ``short_window=50, long_window=200`` to get the
    averages the column names describe.

    Args:
        df: DataFrame whose column at position 3 holds the price to average
            (presumably 'Close' — confirm against the loader). Modified in place.
        short_window: Row count of the short rolling mean (default 10).
        long_window: Row count of the long rolling mean (default 30).

    Returns:
        The same ``df`` with three extra columns: ``50_days_average``,
        ``200_days_average`` and ``Buy/Sell``. ``Buy/Sell`` is -1 where the
        short average is >= the long one, 1 where it is below, and NaN where
        either average is not yet available.
    """
    price = df.iloc[:, 3]
    df['50_days_average'] = price.rolling(window=short_window).mean()
    df['200_days_average'] = price.rolling(window=long_window).mean()

    short_avg = df['50_days_average']
    long_avg = df['200_days_average']
    # Short average at or above the long one is labelled as a sell position...
    df.loc[short_avg >= long_avg, 'Buy/Sell'] = -1
    # ...and short average below the long one as a buy position.
    df.loc[short_avg < long_avg, 'Buy/Sell'] = 1
    return df
 
#Visualizacion de la evolucion de precios y las medias.
def visualize_close_50_200(df):
    """Plot the 'Close' series together with both moving-average columns.

    Args:
        df: DataFrame containing the columns 'Close', '50_days_average' and
            '200_days_average'.
    """
    # Draw the three series in the same order the legend labels are listed.
    for column in ('Close', '50_days_average', '200_days_average'):
        plt.plot(df[column])
    plt.legend(['Close','50_days_avg','200_days_avg'])
    plt.title('Evolution of MTUM ETF over time')
    plt.show()
    
# Build the joined dataset once; fill_joined_missing_fields returns nothing and
# fills the joined columns of `dataset` in place.
dataset = load_and_join()
fill_joined_missing_fields(dataset)

In [None]:
# Label a copy so that `dataset` itself keeps its original columns, then plot
# the close price against both moving averages.
df = labeling_df(dataset.copy())
visualize_close_50_200(df)

## Feature Engineering

< Introducir Notas >

In [None]:
## Codigo

## Meta-Labeling

< Introducir Notas >

In [None]:
## Codigo

In [None]:
def plotROC(rf):
    """Print a classification report, plot the ROC curve and print the AUC.

    Reads the module-level ``X_test`` and ``y_test``.

    Args:
        rf: Fitted classifier exposing ``predict`` and ``predict_proba``.
    """
    # Score of the class in column 1 of predict_proba; used for both the curve and the AUC.
    y_pred_rf = rf.predict_proba(X_test)[:, 1]
    y_pred = rf.predict(X_test)
    fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
    print(classification_report(y_test, y_pred))
    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr_rf, tpr_rf, label='RF')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()
    # The AUC must be computed from the same scores as the plotted curve.
    # Feeding the hard 0/1 predictions collapses the curve to a single
    # threshold and reports a different (usually lower) number.
    auc = roc_auc_score(y_true = y_test, y_score = y_pred_rf)
    print(f"auc: {auc}")
    
# NOTE(review): `rf`, `X_test` and `y_test` are not defined in the cells shown
# above — presumably they come from the still-empty modelling cells; confirm.
plotROC(rf)

## Diferenciación fraccionaria

Encontramos que el mejor orden de diferenciación es 0.05

In [None]:
#Consigue los weights para la diferenciacion!
def getWeights_FFD(d,size):
    """Return ``size`` fractional-differentiation weights of order ``d``.

    The weights follow the recursion ``w_0 = 1`` and
    ``w_k = -w_{k-1} / k * (d - k + 1)``.

    Args:
        d: Differentiation order.
        size: Number of weights to generate (at least one weight is always
            returned).

    Returns:
        Column vector of shape ``(n, 1)`` ordered from the highest lag down
        to lag 0, so the last entry is always 1.
    """
    weights = [1.]
    k = 1
    while k < size:
        weights.append(-weights[-1] / k * (d - k + 1))
        k += 1
    # Reverse so that the lag-0 weight ends up in the last row.
    return np.array(weights[::-1]).reshape(-1, 1)

#Funcion aux para pesos de FFD
def plotWeights(dRange,nPlots,size):
    """Plot the fractional-differentiation weights for several orders ``d``.

    Args:
        dRange: Pair ``(low, high)`` delimiting the orders to plot.
        nPlots: How many evenly spaced orders to take inside ``dRange``.
        size: Number of weights computed for each order.
    """
    curves = pd.DataFrame()
    for d in np.linspace(dRange[0], dRange[1], nPlots):
        column = getWeights_FFD(d, size=size)
        # Index each weight by its lag: the first row is the highest lag.
        lags = range(column.shape[0])[::-1]
        curves = curves.join(pd.DataFrame(column, index=lags, columns=[d]), how='outer')
    axes = curves.plot()
    axes.legend(loc='upper right')
    plt.show()

#Diferenciamos la serie! d es el orden de diferenciación. "thres" (threshold) controla cuántas observaciones iniciales se excluyen. No modificar.
#to do
def fracDiff(series,d,thres=0.01):
    """Fractionally differentiate every column of ``series`` with an expanding window.

    Args:
        series: DataFrame; each column is forward-filled and stripped of its
            remaining leading NaNs before being differentiated.
        d: Differentiation order passed to ``getWeights_FFD``.
        thres: Threshold on the normalised cumulative absolute weights; the
            number of entries above it is the number of initial rows skipped.

    Returns:
        DataFrame with one column per input column, indexed by the rows that
        were not skipped.
    """
    # 1) Weights for the longest window that can occur.
    w=getWeights_FFD(d,series.shape[0])

    # 2) Number of initial rows to skip, from the normalised cumulative |w|.
    w_=np.cumsum(abs(w))
    w_/=w_[-1]
    skip=w_[w_>thres].shape[0]

    # 3) Dot product of the weights with each expanding window.
    df={}
    for name in series.columns:
        # .ffill() replaces the deprecated fillna(method='ffill').
        seriesF=series[[name]].ffill().dropna()
        values=[]
        for iloc in range(skip,seriesF.shape[0]):
            loc=seriesF.index[iloc]
            # The last iloc+1 weights line up with the rows from the start through `loc`.
            values.append(np.dot(w[-(iloc+1):,:].T,seriesF.loc[:loc])[0,0])
        # Build the column in one go instead of enlarging a Series per row.
        df[name]=pd.Series(values,index=seriesF.index[skip:],dtype=float)
    df=pd.concat(df,axis=1)
    return df

#Funcion para buscar el mejor d
def plotMinFFD(df):
    """Scan d in [0, 1] and report the ADF statistic of each fractionally differentiated series.

    For each of 21 evenly spaced orders ``d`` the daily log 'Close' series is
    differentiated with ``fracDiff`` and tested with ``adfuller``. The ADF
    statistic and the correlation with the undifferentiated log series are
    plotted, with a dotted line at the mean 5% critical value.

    Args:
        df: DataFrame with a 'Close' column (it is resampled with '1D', so a
            datetime-like index is presumably required — confirm).

    Returns:
        DataFrame indexed by ``d`` with columns 'adfStat', 'pVal', 'lags',
        'nObs', '95% conf' and 'corr'.
    """
    from statsmodels.tsa.stattools import adfuller
    import numpy.ma as ma
    out=pd.DataFrame(columns=['adfStat','pVal','lags','nObs','95% conf','corr'])
    # Daily log prices do not depend on d, so compute them once outside the loop.
    log_close=np.log(df[['Close']]).resample('1D').last()
    for d in np.linspace(0,1,21):
        diffed=fracDiff(log_close,d,thres=.01)
        corr = ma.corrcoef(ma.masked_invalid(log_close.loc[diffed.index,'Close']), ma.masked_invalid(diffed['Close']))[0,1]
        adf=adfuller(diffed['Close'],maxlag=1,regression='c',autolag=None)
        # First four ADF outputs, then the 5% critical value, then the correlation.
        out.loc[d]=list(adf[:4])+[adf[4]['5%']]+[corr]
    out[['adfStat','corr']].plot(secondary_y='adfStat')
    plt.axhline(out['95% conf'].mean(),linewidth=1,color='r',linestyle='dotted')
    plt.show()
    return out

# Fractional differentiation: search for the best order d.
# NOTE(review): this comment originally stated d* = 0.1, while the notebook
# text and the call below use 0.05 — confirm which value is intended.
plt.figure(1)
out = plotMinFFD(dataset)

# Differentiate every column of the dataset with d = 0.05.
df_ffd = fracDiff(dataset,0.05)  

## Sample Weights

Extraemos pesos por retorno y por tiempo.

In [None]:
#Obtiene los factores por retorno para restar importancia a las observaciones.
def return_weight(df,price_column):
    """Weight each observation by the size of the following price move.

    Args:
        df: DataFrame holding the price series.
        price_column: Name of the price column.

    Returns:
        List with one entry fewer than the price series: the absolute
        difference between consecutive prices divided by the largest such
        difference.
    """
    prices = list(df[price_column])
    # Absolute move from each price to the one that follows it.
    moves = [abs(following - current) for current, following in zip(prices, prices[1:])]
    largest = max(moves)
    return [move / largest for move in moves]

#Obtiene los factores de tiempo para restar importancia a las observaciones.
def getTimeDecay(tW,clfLastW=1.):
    """Apply a piecewise-linear time decay to the weights in ``tW``.

    The decay is linear in the cumulative sum of ``tW`` (taken in index
    order) and is scaled so that the last observation gets weight 1.

    Args:
        tW: Series of per-observation weights.
        clfLastW: Decay control. With a value >= 0 the slope is
            ``(1 - clfLastW) / total``; with a negative value the slope is
            ``1 / ((clfLastW + 1) * total)`` and the weights that fall below
            zero are set to 0.

    Returns:
        Series of decayed weights indexed like the sorted input.
    """
    cumulative = tW.sort_index().cumsum()
    total = cumulative.iloc[-1]
    if clfLastW >= 0:
        slope = (1. - clfLastW) / total
    else:
        slope = 1. / ((clfLastW + 1) * total)
    # Intercept chosen so that the line reaches 1 at the last observation.
    intercept = 1. - slope * total
    decayed = intercept + slope * cumulative
    return decayed.mask(decayed < 0, 0)

#Combina multiplicativamente ambos pesos para llegar a los ponderadores finales.
def final_weight(df,price_column,factor):
    """Combine the return-based and time-decay weights into one normalised list.

    Args:
        df: DataFrame holding the price series.
        price_column: Name of the price column.
        factor: Passed to ``getTimeDecay`` as ``clfLastW``.

    Returns:
        List of the element-wise products of both weights, divided by the
        largest product.
    """
    by_return = return_weight(df, price_column)
    # Drop the last time weight so both lists have the same length.
    by_time = list(getTimeDecay(df[price_column], clfLastW=factor))[:-1]
    combined = [r_weight * t_weight for r_weight, t_weight in zip(by_return, by_time)]
    top = max(combined)
    return [value / top for value in combined]

# The price column is named 'Close' everywhere else in this notebook; the
# lower-case 'close' raised a KeyError.
model_weights = final_weight(dataset,'Close',0)

## Cross Validation

< Introducir Notas >

In [None]:
## Codigo

## Feature Importance

< Introducir Notas >

In [None]:
#TODO: mejorar gráficas, matplotlib sos muy feo
def plotImportance(rf):
    """Draw a horizontal bar chart of the model's feature importances.

    Reads the module-level ``X`` to obtain the feature names.

    Args:
        rf: Fitted model exposing ``feature_importances_``.
    """
    title = 'Feature Importance:'
    figsize = (15, 5)

    feat_imp = pd.DataFrame({'Importance':rf.feature_importances_})
    feat_imp['feature'] = X.columns
    # Only the ascending order matters for the plot; the earlier descending
    # sort was immediately overridden and has been removed.
    feat_imp = feat_imp.sort_values(by='Importance').set_index('feature', drop=True)

    feat_imp.plot.barh(title=title, figsize=figsize)
    plt.xlabel('Feature Importance Score')
    plt.show()
    
# NOTE(review): `rf` and `X` are not defined in the cells shown above —
# presumably they come from the still-empty modelling cells; confirm.
plotImportance(rf)

## Backtesting

Calculamos:
1. Retorno del Portfolio
2. Trades Realizados
3. Ratio of Longs
4. Sharpe Ratio

In [None]:
def _simulate_trades(df1, use_bet_size):
    """Walk the signal column row by row and return the closed trades as dicts.

    Column positions are fixed: 3 is the price, 12 the signal (1 buy, -1 sell)
    and 13 the bet size (only read when ``use_bet_size`` is true).
    """
    close_pos, signal_pos, bet_pos = 3, 12, 13
    closes = df1.iloc[:, close_pos].to_numpy()
    signals = df1.iloc[:, signal_pos].to_numpy()
    bets = df1.iloc[:, bet_pos].to_numpy() if use_bet_size else None
    last = len(df1) - 1

    trades = []
    position = 0
    open_price = None
    bet_size = 1.0
    for i, (price, signal) in enumerate(zip(closes, signals)):
        closed = None
        if position == 0:
            # Flat: the next position opens at the current row's price
            # (the old code always read row 0 here, whatever the row).
            open_price = price
        elif position == 1 and (signal == -1 or i == last):
            # Long closed by a sell signal or by the end of the window.
            closed = ('Long', (price - open_price) / open_price)
        elif position == -1 and (signal == 1 or i == last):
            # Short closed by a buy signal or by the end of the window.
            closed = ('Short', (open_price - price) / open_price)

        if closed is not None:
            kind, gross = closed
            trade = {'Opening_Price': open_price, 'Closing_Price': price,
                     'Return': gross * bet_size if use_bet_size else gross,
                     'Type': kind}
            if use_bet_size:
                trade['Bet Size'] = bet_size
            trades.append(trade)
            # The opposite position opens at the price the previous one closed at.
            open_price = price

        position = signal
        if use_bet_size:
            # The size applied to a trade is the one seen on the row before its close.
            bet_size = bets[i]
    return trades


def back_test(df,mode,start_date,end_date, risk_free = 0.0):
    """Print the performance of a strategy over ``start_date``..``end_date``.

    Available modes:
        'Simple': buy and hold; buys on the first row and sells on the last.
        'Signal': trades according to the signal column, always with size 1.
        'BetS':   trades according to the signal column and scales every
                  return by the bet-size column.

    The signal is read from column position 12, the bet size from position 13
    and the trade price from position 3; 'Simple' reads the 'Close' column.

    Args:
        df: DataFrame sliced with ``df.loc[start_date:end_date]``.
        mode: One of 'Simple', 'Signal', 'BetS'; anything else prints an
            error message.
        start_date: First label of the window.
        end_date: Last label of the window.
        risk_free: Value subtracted from the summed returns in the Sharpe
            ratio printed by 'BetS'.

    Returns:
        None; all results are printed.
    """
    df1 = df.loc[start_date:end_date]

    if mode == 'Simple':
        print('Modo Simple!')
        # .iloc replaces the deprecated Series.first('D')[0] / last('D')[0].
        opening_price = df1['Close'].iloc[0]
        closing_price = df1['Close'].iloc[-1]
        result = (closing_price - opening_price) / opening_price
        print('Rate of Return: {:.2%}'.format(result))

    elif mode == 'Signal':
        print('Modo via señales!')
        trades = _simulate_trades(df1, use_bet_size=False)
        returns = pd.Series([trade['Return'] for trade in trades], dtype=float)

        print('Resultado del Portfolio: {:.2%}'.format(returns.sum()))
        # Report the real number of trades (this used to print a constant 1).
        print('Cantidad de trades: {}'.format(len(trades)))

    elif mode == 'BetS':
        print('Modo via señales y bet sizing!')
        trades = _simulate_trades(df1, use_bet_size=True)
        returns = pd.Series([trade['Return'] for trade in trades], dtype=float)
        trade_count = len(trades)
        long_count = sum(1 for trade in trades if trade['Type'] == 'Long')

        print('*********** Trades ejecutados: **************')
        print('*********************************************')
        for i, trade in enumerate(trades):
            print('Trade {}, tipo: {}'.format(i,trade['Type']))
            print('Abrio al precio de $ {:.2f} y cerro en $ {:.2f}, con un size de {:.2f}'.format(trade['Opening_Price'],trade['Closing_Price'],trade['Bet Size']))
            print('Retorno: {:.2%}'.format(trade['Return']))
            print('*********************************************')

        # With no trades the ratio is undefined; print nan instead of raising.
        long_ratio = long_count/trade_count if trade_count else float('nan')
        print('Resultado del Portfolio: {:.2%}'.format(returns.sum()))
        print('Cantidad de trades: {}'.format(trade_count))
        print('Ratio of Longs: {:.2%}'.format(long_ratio))
        print('Sharpe Ratio: {:.2%}'.format((returns.sum()-risk_free)/returns.std()))

    else:
        print('Modo incorrecto!')
            
# Backtest window; back_test slices df with .loc[start_date:end_date].
# (The closing quote of start_date was missing, which made the cell a SyntaxError.)
start_date = '2018-01-02'
end_date = '2020-01-02'
back_test(df,'Simple',start_date,end_date)
print('---------------------------------------')
# back_test(df,'Signal','2018-01-02','2019-01-02')
print('---------------------------------------')
back_test(df,'BetS',start_date,end_date)

## Conclusiones

< Introducir Notas >

In [None]:
## Codigo