# Checkpoint 1
---
**Chloé Blanchard**

### Initial Setup
---

`Code Source: Professor Ritter, Columbia University`

In [None]:
import os
import bz2

import patsy
import pickle

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import scipy
import scipy.sparse

from scipy.stats import gaussian_kde

from statistics import median
from statsmodels.formula.api import ols

In [None]:
# Colab-only: mount Google Drive at /content/drive (forcing a remount) so the
# pickles under model_dir can be read.
from google.colab import drive  
drive.mount('/content/drive', force_remount= True)

In [None]:
# Drive folder that holds the per-year "pandas-frames.*" and "covariance.*" pickles.
model_dir = '/content/drive/MyDrive/APT-Portfolio/'

def sort_cols(test):
    """Return *test* re-indexed so that its columns appear in sorted order."""
    ordered_columns = sorted(test.columns)
    return test.reindex(columns=ordered_columns)

# Daily data frames keyed by 'YYYYMMDD' date string, merged from one pickle per year.
frames = {}

# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
for year in range(2003, 2011):
    fil = model_dir + "pandas-frames." + str(year) + ".pickle.bz2"
    # The context manager closes the bz2 handle even if unpickling fails.
    with bz2.open(fil, "rb") as handle:
        frames.update(pickle.load(handle))

### Problem 1
---

In [None]:
# Put every daily frame's columns into sorted order.
for x in frames:
    frames[x] = sort_cols(frames[x])

# Factor covariance data, merged from one pickle per year.
covariance = {}

# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
for year in range(2003, 2011):
    fil = model_dir + "covariance." + str(year) + ".pickle.bz2"
    # The context manager closes the bz2 handle even if unpickling fails.
    with bz2.open(fil, "rb") as handle:
        covariance.update(pickle.load(handle))

`Code Source: Professor Ritter, Columbia University`

### Problem 2
---

In [None]:
def wins(x, a, b):
    """Winsorize *x*: values at or below *a* become *a*, values at or above *b* become *b*."""
    capped_above = np.where(x >= b, b, x)
    return np.where(x <= a, a, capped_above)

`Code Source: Professor Ritter, Columbia University`

### Problem 3
---

In [None]:
def clean_nas(df):
    """Run ``np.nan_to_num`` over every numeric column of *df* in place and return *df*."""
    numeric_frame = df.select_dtypes(include=[np.number])
    for column_name in list(numeric_frame.columns):
        df[column_name] = np.nan_to_num(df[column_name])
    return df

`Code Source: Professor Ritter, Columbia University`

### Problem 4
---

In [None]:
def density_plot(data, title):
    """Plot a Gaussian kernel density estimate of *data* and show the figure.

    Args:
        data: One-dimensional numeric sample to estimate the density of.
        title: Title placed on the plot.
    """
    # A scalar bw_method fixes the KDE covariance factor at 0.25 through the
    # public API, instead of patching covariance_factor and calling the
    # private _compute_covariance() afterwards.
    density = gaussian_kde(data, bw_method=0.25)
    # Evaluate the density on 200 evenly spaced points spanning the sample range.
    xs = np.linspace(np.min(data), np.max(data), 200)
    plt.plot(xs, density(xs))
    plt.title(title)
    plt.show()

# Compare the return distribution for one day before and after winsorizing at +/-0.2.
test = frames['20040102']
raw_returns = test['Ret']
density_plot(raw_returns, 'Daily return pre-winsorization')
winsorized_returns = wins(raw_returns, -0.2, 0.2)
density_plot(winsorized_returns, 'Daily return winsorized')

`Code Source: Professor Ritter, Columbia University`

*Q: Why might it be important to re-run this after winsorizing the returns at the bounds a = -0.2 and b = 0.2 when running a regression with "Ret" as the "Y variable"?*

**A: Winsorized estimators are usually more robust to outliers than their standard counterparts. Before any winsorization, the returns can be observed to have a very heavy right tail. We therefore clip the returns to the bounded interval [-0.2, 0.2] so that a handful of extreme observations cannot dominate the regression fit, which gives results that are robust to the skew toward the right-hand side.**

# Checkpoint 2
---
**Chloé Blanchard**

### Problem 1
---

`Code Source: Professor Ritter, Columbia University`

In [None]:
def get_formula(factors, Y):
    """Build the formula string ``"<Y> ~ 0 + f1 + f2 + ..."`` (the leading 0 term comes first)."""
    terms = ["0", *factors]
    right_hand_side = " + ".join(terms)
    return Y + " ~ " + right_hand_side

In [None]:
def estimate_factor_returns(df):
    """Fit a no-intercept OLS of winsorized returns on the four alpha factors.

    Only rows with ``IssuerMarketCap`` above 1e9 enter the fit, and their
    ``Ret`` values are winsorized to [-0.25, 0.25] first.

    Returns:
        The fitted statsmodels results object.
    """
    # Rename the raw columns to the names used in the regression formula.
    column_aliases = {
        '1DREVRSL': 'ONEDREVERSAL',
        'EARNYILD': 'EARNYIELD',
        'SENTMT': 'SENTIMENT',
    }
    renamed = df.rename(columns=column_aliases)

    # Estimation universe: large caps only, copied so the input frame is untouched.
    is_large_cap = renamed["IssuerMarketCap"] > 1e9
    estu = renamed.loc[is_large_cap].copy(deep=True)

    # Winsorize returns before fitting.
    estu['Ret'] = wins(estu['Ret'], -0.25, 0.25)

    formula = get_formula(['ONEDREVERSAL', 'EARNYIELD', 'VALUE', 'SENTIMENT'], "Ret")
    return ols(formula, data=estu).fit()

### Problem 2
---

In [None]:
# Estimated factor returns (regression coefficients) for each date key in frames.
facret = {}

for date, daily_frame in frames.items():
    fitted = estimate_factor_returns(daily_frame)
    facret[date] = fitted.params

In [None]:
alpha_factors = ['ONEDREVERSAL', 'EARNYIELD', 'VALUE', 'SENTIMENT']

# Parse the 'YYYYMMDD' keys into timestamps and put them in chronological order.
my_dates = sorted(pd.to_datetime(key, format='%Y%m%d') for key in frames)

# One row per date, one column per alpha factor, filled cell by cell.
facret_df = pd.DataFrame(index=my_dates)

for dt in my_dates:
    day_params = facret[dt.strftime('%Y%m%d')]
    for alp in alpha_factors:
        facret_df.at[dt, alp] = day_params[alp]

# Plot the cumulative factor returns over time.
facret_df.cumsum().plot()

`Code Source: Professor Ritter, Columbia University`

# Checkpoint 3
---
**Chloé Blanchard**

### Problem 1
---
* Write code that adds a column tracking prev. day holdings:
  
  * Initialized to zero weights for all assets when backtest first starts
  * Prev. day portfolio weights are needed to estimate trade size *(and therefore transaction costs)*

`Code Source: Professor Ritter, Columbia University`

In [None]:
# frames is keyed by 'YYYYMMDD' date strings, so frames['Ret'] was a KeyError.
# At the start of the backtest nothing is held, so every asset present on the
# earliest day gets a previous-day holding of zero.
# NOTE(review): assumes each daily frame has an 'ID' column, which is what the
# merge inside form_optimal_portfolio keys on -- confirm.
first_day = frames[min(frames)]
prev_day = pd.DataFrame({"ID": first_day["ID"], "h.opt.previous": 0.0})
print(prev_day)

def add_prev_holdings(df, prev_holdings=None):
    """Return a copy of *df* with an ``h.opt.previous`` column of prior holdings.

    The original cell was a nameless ``def`` (a SyntaxError) that repeated the
    factor regression and had unreachable code after its ``return``. This
    implements what the problem statement asks for instead.

    Args:
        df: Daily frame with an ``ID`` column.
        prev_holdings: Optional frame with ``ID`` and ``h.opt.previous``
            columns. ``None`` means the backtest is just starting, so every
            asset gets a holding of zero.

    Returns:
        A new DataFrame; *df* itself is not modified.
    """
    # Drop any stale column so the merge below cannot produce _x/_y suffixes.
    out = df.drop(columns=["h.opt.previous"], errors="ignore")
    if prev_holdings is None:
        out["h.opt.previous"] = 0.0
        return out

    # A left merge keeps today's rows; assets not held yesterday get zero.
    out = out.merge(prev_holdings[["ID", "h.opt.previous"]], how="left", on="ID")
    out["h.opt.previous"] = out["h.opt.previous"].fillna(0.0)
    return out

In [None]:
def h_star_calc(risk_aver, Q, Q_T, specVar, alpha_vec, h_0, Lambda):
    """Minimize the portfolio objective with L-BFGS-B and return the optimal holdings.

    The objective is
    ``0.5*k*|Q h|^2 + 0.5*k*h^2.specVar - h.alpha_vec + (h - h_0)^2.Lambda``
    with ``k = risk_aver``. The factor-risk term is evaluated as ``|Q h|^2``,
    so no n x n matrix is formed.

    Args:
        risk_aver: Risk-aversion coefficient. Python does not allow a
            defaulted parameter before required ones, so the former
            ``= 1.0e-6`` default must now be passed explicitly.
        Q: Matrix applied to the holdings in the factor-risk term.
        Q_T: Transpose of *Q*, passed in so it is computed once.
        specVar: Vector of specific variances (diagonal of the specific-risk matrix).
        alpha_vec: Vector of expected returns.
        h_0: Vector of starting holdings; also the optimizer's initial point.
        Lambda: Vector of transaction-cost coefficients (diagonal of Lambda).

    Returns:
        1-D array with the minimizing holdings.
    """
    # Imported here because the notebook's top-level imports do not load scipy.optimize.
    from scipy.optimize import fmin_l_bfgs_b

    Q = np.asarray(Q)
    Q_T = np.asarray(Q_T)
    # Plain float arrays avoid pandas index alignment inside the optimizer loop.
    specVar = np.asarray(specVar, dtype=float)
    alpha_vec = np.asarray(alpha_vec, dtype=float)
    h_0 = np.asarray(h_0, dtype=float)
    Lambda = np.asarray(Lambda, dtype=float)

    def calc_star(h):
        factor_risk = 0.5 * risk_aver * np.sum(np.matmul(Q, h) ** 2)
        # Specific variance is diagonal, so a dot product replaces a matmul.
        specific_risk = 0.5 * risk_aver * np.dot(h ** 2, specVar)
        expected_return = np.dot(h, alpha_vec)
        trading_cost = np.dot((h - h_0) ** 2, Lambda)
        return factor_risk + specific_risk - expected_return + trading_cost

    def grad_star(h):
        risk_grad = risk_aver * (h * specVar + np.matmul(Q_T, np.matmul(Q, h)))
        return np.asarray(risk_grad - alpha_vec + 2 * Lambda * (h - h_0))

    optimizer_result = fmin_l_bfgs_b(calc_star, h_0, fprime=grad_star)

    # fmin_l_bfgs_b returns (x, f, info); only the minimizer is needed.
    return optimizer_result[0]

### Problem 2
---
* Build universe based on filters: select stock universe based on market cap, keeping companies w/ $1B+ valuation:

  * Include companies in prev day's holdings, even if on current day, the company no longer meets $1B valuation
  * Write code which performs indicated calculations

In [None]:
def get_universe(df):
    """Return the rows of *df* that make up the tradable universe.

    A row is kept when ``IssuerMarketCap`` exceeds 1e9, or when the asset was
    held yesterday (non-zero ``h.opt.previous``) even if it no longer meets
    the size cut, so existing positions can still be traded.

    The previous version repeated the factor regression and returned the
    fitted OLS results, but form_optimal_portfolio indexes the return value
    like a DataFrame (``universe['SpecRisk']``), so the filtered frame is
    returned instead. Returns are not winsorized here.

    Args:
        df: Daily frame with ``IssuerMarketCap`` and, optionally,
            ``h.opt.previous``.

    Returns:
        A deep copy of the selected rows.
    """
    in_universe = df["IssuerMarketCap"] > 1e9
    if "h.opt.previous" in df.columns:
        # NaN holdings compare False, so only genuine positions are carried over.
        in_universe = in_universe | (df["h.opt.previous"].abs() > 0)
    return df.loc[in_universe].copy(deep=True)

In [None]:
# Build the boolean mask from the column; the old `'pd_weights' >= 1000000000`
# compared a str with an int and raised TypeError.
# NOTE(review): pf_df is not defined in the visible notebook -- confirm where it comes from.
universe = pf_df[pf_df['pd_weights'] >= 1000000000]

### Problem 3
---
* Write function setdiff to compute set-wise difference of All Factor Set - Alpha Factor Set = Non-Alpha Factor Set: **"Risk factor"** Set

In [None]:
def set_diff(all_factors, alpha_set=None):
    """Return the members of *all_factors* that are not alpha factors, in order.

    Args:
        all_factors: Iterable of factor names.
        alpha_set: Iterable of names to remove. Defaults to the notebook-level
            ``alpha_factors`` list.

    Returns:
        List of the remaining ("risk factor") names, in their original order.
    """
    if alpha_set is None:
        alpha_set = alpha_factors
    # A set gives O(1) membership tests; the list comprehension preserves order.
    excluded = set(alpha_set)
    return [factor for factor in all_factors if factor not in excluded]


# The rest of the notebook calls this helper as `setdiff`.
setdiff = set_diff

# The function defined in this cell is `set_diff`; `setdiff` was an undefined name here.
set_diff(universe)

### Problem 4
---
* Write a direct analogue of the model.matrix function in R: model_matrix

In [None]:
# NOTE: everything in this cell is R, not Python. It is kept only as reference
# for what model.matrix() does and will raise a SyntaxError if run in a Python kernel.
# from rdrr.io - R documentation for stats::model.matrix()
# Note: need to verify source documentation as this might be incorrect

# Build a model frame from a formula, then its design matrix.
ff <- log(Volume) ~ log(Height) + log(Girth)
utils::str(m <- model.frame(ff, trees))
mat <- model.matrix(ff, m)

# Design matrices for two factors under different contrast codings.
dd <- data.frame(a = gl(3,4), b = gl(4,1,12)) # balanced 2-way
options("contrasts") # typically 'treatment' (for unordered factors)
model.matrix(~ a + b, dd)
model.matrix(~ a + b, dd, contrasts = list(a = "contr.sum"))
model.matrix(~ a + b, dd, contrasts = list(a = "contr.sum", b = contr.poly))
m.orth <- model.matrix(~a+b, dd, contrasts = list(a = "contr.helmert"))
crossprod(m.orth) # m.orth is  ALMOST  orthogonal
# invalid contrasts.. ignored with a warning:
stopifnot(identical(
   model.matrix(~ a + b, dd),
   model.matrix(~ a + b, dd, contrasts.arg = "contr.FOO")))

In [None]:
def model_matrix(function, df=None, data=None):
    """Return the predictor design matrix for a patsy formula (analogue of R's model.matrix).

    Args:
        function: Formula string, either two-sided (``"y ~ a + b"``) or
            one-sided (``"a + b"``).
        df: Data frame the formula is evaluated against.
        data: Alias for *df*, accepted because a later cell calls
            ``model_matrix(..., data=universe)``.

    Returns:
        The patsy design matrix of the right-hand side.

    Raises:
        TypeError: If neither *df* nor *data* is given.
    """
    frame = df if df is not None else data
    if frame is None:
        raise TypeError("model_matrix() requires a data frame (df or data)")

    # dmatrix rejects formulas with an outcome, so a two-sided formula goes
    # through dmatrices and only the predictor matrix is kept.
    left_side, tilde, _ = function.partition("~")
    if tilde and left_side.strip():
        _, predictors = patsy.dmatrices(function, frame)
        return predictors
    return patsy.dmatrix(function, frame)

# Column names used as industry terms in the risk-model formula.
# NOTE(review): 'INDMOM', 'LITREVRSL' and 'MGMTQLTY' look like style factors rather
# than industries, and 'LITREVRSL' may be a misspelling -- confirm against frames' columns.
industries = ['AERODEF', 'AIRLINES', 'ALUMSTEL', 'APPAREL', 'AUTO', 'BANKS',
              'BEVTOB', 'BIOLIFE', 'BLDGPROD', 'CHEM', 'CNSTENG', 'CNSTMACH',
              'CNSTMATL', 'COMMEQP', 'COMPELEC', 'COMSVCS', 'CONGLOM',
              'CONTAINR', 'ELECEQP', 'ELECUTIL', 'FOODPROD', 'FOODRET',
              'GASUTIL', 'HLTHEQP', 'HLTHSVCS', 'HOMEBLDG', 'HOUSEDUR',
              'INDMACH', 'INDMOM', 'INSURNCE', 'INTERNET', 'LEISPROD',
              'LEISSVCS', 'LIFEINS', 'LITREVRSL', 'MEDIA', 'MGDHLTH',
              'MGMTQLTY', 'OILGSCON', 'OILGSDRL', 'OILGSEQP', 'OILGSEXP',
              'PAPER', 'PHARMA', 'PRECMTL', 'PSLPROD', 'REALEST', 'RESTAUR',
              'ROADRAIL', 'SEMICOND', 'SEMIEQP', 'SOFTWARE', 'SPLTYRET',
              'SPTYCHEM', 'SPTYSTOR', 'TELECOM', 'TRADECO']

# Column names used as the remaining factor terms. 'MOMENTUM' was listed twice;
# the duplicate is removed so each factor appears exactly once.
# NOTE(review): confirm 'LIQUIDITY' matches the actual column spelling in frames.
factors = ['BETA', 'MOMENTUM', 'VALUE', 'DISTRIB', 'DIVFIN', 'DIVYILD',
           'DWNRISK', 'EARNQLTY', 'EARNYILD', 'GROWTH', 'LEVERAGE',
           'LIQUIDITY', 'MULTUTIL', 'PROFIT', 'PROSPECT',
           'RESVOL', 'SEASON', 'SENTMT', 'SIZE', 'STREVRSL']

date = '20080808'
# Join all terms with a single separator. The old code concatenated two
# separate joins with no "+" between them, fusing the last industry and the
# first factor into one bogus term ('TRADECOBETA').
function = "Ret ~ 0 + " + " + ".join(industries + factors)

X = model_matrix(function, frames[date])

### Problem 5
---
* Transform risk (volatility) data to decimal form then square to compute vector containing specific var for each stock in universe:
```
specVar = (0.01 * universe['SpecRisk']) ** 2
```



In [None]:
def risk_vect(universe):
    """Return the specific variance of every stock in *universe*.

    ``SpecRisk`` is converted to decimal form (x 0.01) and squared.

    The previous version looped over the global ``frames`` and appended the
    same vector once per date, returning a list of identical copies. The
    calculation depends only on *universe*, so the vector is returned directly.

    Args:
        universe: DataFrame with a ``SpecRisk`` column.

    Returns:
        Series of specific variances with the same index as *universe*.
    """
    return (0.01 * universe['SpecRisk']) ** 2

### Problem 6
---
* Create factor covariance matrix: data provides precomputed factor covs
```
covariance = {'key = date' : 'value = df of factor covariance data'}
```
* Write function: input is covariance, computes diagonalized matrix
  * Rescale fcov from %$^2$ to decimal$^2$ $(fcov \times 0.01^2)$
  * Prevents optimizer from designing strategies that trade based on corr(factors) such as non-stationary correlations

In [None]:
def fcov_matrix(covariance):
    """Return the diagonalized factor covariance matrix in decimal-squared units.

    Off-diagonal covariances are set to zero and the variances are rescaled
    from percent-squared to decimal-squared (x 0.01**2). The previous
    ``((cov ** 0.5) * 0.01) ** 2`` kept the off-diagonal entries and produced
    NaN for negative covariances.

    Args:
        covariance: Square factor covariance matrix (array-like or DataFrame).
            NOTE(review): assumes a square numeric matrix for one date, not the
            date-keyed ``covariance`` dict -- confirm against callers.

    Returns:
        A diagonal matrix; a DataFrame with the input's labels when
        *covariance* is a DataFrame, otherwise an ndarray.

    Raises:
        ValueError: If *covariance* is not a square 2-D matrix.
    """
    values = np.asarray(covariance, dtype=float)
    if values.ndim != 2 or values.shape[0] != values.shape[1]:
        raise ValueError("covariance must be a square 2-D matrix")

    # The inner np.diag extracts the variances; the outer one rebuilds a diagonal matrix.
    diag_matrix = np.diag(np.diag(values)) * 0.01 ** 2

    if isinstance(covariance, pd.DataFrame):
        return pd.DataFrame(diag_matrix, index=covariance.index,
                            columns=covariance.columns)
    return diag_matrix

### Problem 7
---
* A large order, as it is executed in the market, usually exerts pressure on the price of the security, in the same direction as the trading 
* To measure stock order size: compare to avg daily volume (ADV) in stock
  * Don’t calc ADV, proxy by assuming each stock trades at ~$.01$ $\times$ total issuer-level market capitalization in a typical day
  * IssuerMarketCap is in pandas frames
* Large orders in this metric predicted to cause greater price impact
* Simplest predicted impact model assumes linear relationship btw ADV fraction and price impact
* Example:
  * If each $1\%$ of ADV traded in the $i^{th}$ security causes a move of $0.1\%$ in price $P_i$:
  * $\Delta P_i = \lambda_i(h_{i}^{*} - h_{i}^{0})$, where $\lambda_i = \frac{0.1}{ADV_i}$
  * $h^{*}$: optimal portfolio we seek
  * $h^{0}$: portfolio we came into day with, i.e. **inventory positions**
* Write function to compute total transaction costs using this model
* Express the costs as a quadratic function with a diagonal matrix $\Lambda$ (Lambda)

In [None]:
def total_transcosts(df, h_star=None, h_0=None):
    """Return total transaction cost ``(h* - h0)^T Lambda (h* - h0)``.

    Lambda is diagonal with ``lambda_i = 0.1 / ADV_i``, and ADV is proxied by
    ``0.01 * IssuerMarketCap``. Only the diagonal is stored, so the quadratic
    form is a dot product.

    The previous version did not parse (unclosed parenthesis), indexed the
    string keys of ``frames`` as if they were frames, and used the column name
    ``'ISSUERMARKETCAP'`` instead of ``'IssuerMarketCap'``.

    Args:
        df: Frame with an ``IssuerMarketCap`` column, one row per asset.
        h_star: Target holdings aligned with *df*'s rows. Defaults to
            ``df['h.opt']``.
        h_0: Starting holdings aligned with *df*'s rows. Defaults to
            ``df['h.opt.previous']``.

    Returns:
        Total cost as a float. A zero market cap gives a zero ADV and
        therefore a non-finite result.
    """
    adv = 0.01 * np.asarray(df["IssuerMarketCap"], dtype=float)
    lam = 0.1 / adv

    target = df["h.opt"] if h_star is None else h_star
    start = df["h.opt.previous"] if h_0 is None else h_0
    trade = np.asarray(target, dtype=float) - np.asarray(start, dtype=float)

    return float(np.dot(trade ** 2, lam))

### Problem 8
---
* Calculating portfolio risk attributable to risk factors $h^T X F X^T h$ can become computationally infeasible/slow
* Using matrix factorization:
  * Choose order of matrix multiplications to avoid creating an $n \times n$ matrix
  * Analogous to technique that makes L-BFGS optimizer so efficient
  * Simple matrix factorization: $Q := F^{1/2}X^T$
is defined s.t. $Q^T Q = X F X^T$
* Write code to compute Q and the factorization

In [None]:
def calc_Q(F, X, h):
    """Compute ``Q = F^(1/2) X^T`` and the factor risk ``h^T Q^T Q h``.

    The risk is evaluated as ``|Q h|^2``, so the n x n matrix ``Q^T Q`` is
    never formed. The previous version used elementwise ``**`` and ``*``,
    which is neither a matrix square root nor a matrix product.

    Args:
        F: k x k factor covariance matrix (positive semi-definite).
        X: n x k matrix of factor exposures.
        h: Length-n vector of holdings.

    Returns:
        Tuple ``(Q, pf)``: the k x n matrix Q and the scalar factor risk.

    Raises:
        ValueError: If *F* has no real matrix square root.
    """
    # Imported here because the notebook's top-level imports do not load scipy.linalg.
    from scipy.linalg import sqrtm

    # sqrtm may return a complex dtype with zero imaginary part; drop it when negligible.
    F_root = np.real_if_close(sqrtm(np.asarray(F, dtype=float)))
    if np.iscomplexobj(F_root):
        raise ValueError("F must be positive semi-definite to have a real square root")

    Q = np.matmul(F_root, np.asarray(X, dtype=float).T)

    # Multiply by h first: Q h has only k entries.
    Qh = np.matmul(Q, np.asarray(h, dtype=float))
    pf = float(np.dot(Qh, Qh))
    return Q, pf

# NOTE(review): F1, X1 and h1 are not defined anywhere in the visible notebook;
# they must be assigned before this cell can run.
calc_Q(F1, X1, h1)

### Problem 9
---
* Objective function: factor risk + idiosyncratic risk - expected portfolio return + transaction costs:
 * $f(h) = \frac{1}{2} k h^T Q^T Q h + \frac{1}{2} k h^T D h - \alpha^T h + (h - h_0)^T \Lambda (h - h_0)$

In [None]:
def _quad_form(v, M):
    """Return ``v^T M v``, where *M* is a full matrix or the 1-D diagonal of one."""
    M = np.asarray(M, dtype=float)
    if M.ndim == 1:
        # Diagonal stored as a vector: no matrix product needed.
        return np.dot(v ** 2, M)
    return np.dot(v, np.matmul(M, v))


def obj_function(h, k, Q, alpha, D, h_0=None, Lambda=None):
    """Evaluate factor risk + idiosyncratic risk - expected return + transaction costs.

    ``f(h) = 0.5*k*h^T Q^T Q h + 0.5*k*h^T D h - alpha^T h
    + (h - h_0)^T Lambda (h - h_0)``

    The previous version used elementwise ``*`` for every product, dropped
    Lambda from the cost term and read ``h_0`` from an undefined global.

    Args:
        h: Holdings vector.
        k: Risk-aversion coefficient.
        Q: Matrix with ``Q^T Q`` equal to the factor covariance of the assets.
        alpha: Expected-return vector.
        D: Specific variance, as a 1-D diagonal or a full matrix.
        h_0: Starting holdings. The transaction-cost term is included only
            when both *h_0* and *Lambda* are given.
        Lambda: Transaction-cost coefficients, as a 1-D diagonal or a full matrix.

    Returns:
        The objective value as a float.
    """
    h = np.asarray(h, dtype=float)

    # |Q h|^2 equals h^T Q^T Q h without forming the n x n product Q^T Q.
    Qh = np.matmul(np.asarray(Q, dtype=float), h)
    factor_risk = 0.5 * k * np.dot(Qh, Qh)
    specific_risk = 0.5 * k * _quad_form(h, D)
    expected_return = np.dot(np.asarray(alpha, dtype=float), h)

    obj_f = factor_risk + specific_risk - expected_return
    if h_0 is not None and Lambda is not None:
        trade = h - np.asarray(h_0, dtype=float)
        obj_f += _quad_form(trade, Lambda)
    return float(obj_f)

### Problem 10
---
* Compile steps into a unified form_optimal_portfolio function 

In [None]:
# Build the boolean mask from the column; the old `'pd_weights' >= 1000000000`
# compared a str with an int and raised TypeError.
# NOTE(review): pf_df is not defined in the visible notebook -- confirm where it comes from.
universe = pf_df[pf_df['pd_weights'] >= 1000000000]

def form_optimal_portfolio(df, prev_day, risk_aver = 1.0e-6):
    """Form the optimal portfolio for one day and report its exposures and cost.

    Steps: merge yesterday's holdings into *df*, clean NaNs, select the
    universe, build the risk and alpha design matrices, optimize, then
    summarize the result.

    Args:
        df: Daily data frame with at least ``ID``, ``SpecRisk``, ``DataDate``
            and ``Ret`` columns.
        prev_day: Frame of yesterday's holdings, merged on ``ID``.
            NOTE(review): presumably carries an ``h.opt.previous`` column,
            since that column is read from the universe below -- confirm.
        risk_aver: Risk-aversion coefficient passed to the optimizer.

    Returns:
        Dict with keys ``"opt.portfolio"`` (frame of ``ID`` and ``h.opt``),
        ``"risk.exposures"``, ``"alpha.exposures"`` and ``"total.cost"``.

    NOTE(review): factors_from_names, setdiff, diagonal_factor_cov, get_lambda,
    rowSums, get_risk_exposures, get_portfolio_alpha_exposure and
    get_total_transaction_costs are not defined in the visible notebook and
    must be supplied before this runs.
    """
    # Imported here because the notebook's top-level imports do not load scipy.linalg.
    import scipy.linalg

    # Merge yesterday's holdings (inventory positions) onto today's frame.
    df = df.merge(prev_day, how='left', on='ID')

    # Remove NaN values.
    df = clean_nas(df)

    # Replace zero specific risk with the median risk. A single
    # .loc[rows, col] call writes into df; the former chained
    # df.loc[...]['SpecRisk'] = ... assigned to a temporary copy and did nothing.
    zero_risk = df['SpecRisk'] == 0
    df.loc[zero_risk, 'SpecRisk'] = median(df['SpecRisk'])

    universe = get_universe(df)

    # Take the first row by position; label 1 need not survive the universe filter.
    date = str(int(universe['DataDate'].iloc[0]))
    print(date, end = " ")

    # List of factors present in the universe.
    all_factors = factors_from_names(list(universe))

    # all factors - alpha factors = risk factors
    risk_factors = setdiff(all_factors, alpha_factors)

    h_0 = universe['h.opt.previous']

    X = model_matrix(get_formula(risk_factors, "SpecRisk"), universe)
    XT = X.transpose()

    specVar = (0.01 * universe['SpecRisk']) ** 2
    Fvar = diagonal_factor_cov(date, X)

    # The frame is passed positionally: model_matrix's second parameter is not named `data`.
    Lambda = get_lambda(universe)
    X_alpha = model_matrix(get_formula(alpha_factors, "Ret"), universe)

    # Collapse the alphas into one vector with the most trivial combination
    # (a scaled row sum); in practice this would be a more sophisticated model.
    alpha_vec = 1e-4 * rowSums(X_alpha)

    # Precompute Q and its transpose once for the optimizer.
    Q = np.matmul(scipy.linalg.sqrtm(Fvar), XT)
    Q_T = Q.transpose()

    # The parameter is `risk_aver` (`risk_aversion` was undefined), and the
    # optimizer defined in this notebook is h_star_calc (`h_starred` is not
    # defined in view).
    h_star = h_star_calc(risk_aver, Q, Q_T, specVar, alpha_vec, h_0, Lambda)
    opt_portfolio = pd.DataFrame(data = {"ID" : universe['ID'], "h.opt" : h_star})

    # The transpose computed above is named XT (`X_T` was undefined).
    risk_exposures = get_risk_exposures(X, XT, h_star)

    portfolio_alpha_exposure = get_portfolio_alpha_exposure(X_alpha, h_star)
    total_transaction_costs = get_total_transaction_costs(h_0, h_star, Lambda)

    return {
        "opt.portfolio" : opt_portfolio,
        "risk.exposures" : risk_exposures,
        "alpha.exposures" : portfolio_alpha_exposure,
        "total.cost" : total_transaction_costs
    }

`Code Source: Professor Ritter, Columbia University`

# Checkpoint 4
---
**Chloé Blanchard**

### Problem 1
---
*Trade List: the vector of trades which, when added to the previous holdings, gives the new target portfolio.*

* Define a function which takes the previous holdings, and the result of our optimization function, and outputs a trade list
```
build_tradelist(prev_holdings, opt_result)
```

In [None]:
def build_tradelist(prev_holdings, opt_result, frames=None):
    """Build the trade list that moves the previous holdings to the new target.

    The trade for each asset is ``h.opt - h.opt.previous``, so previous
    holdings plus trades equals the target portfolio. Assets that appear on
    only one side are treated as having a zero position on the other.

    The previous body was a random-portfolio Sharpe-ratio simulation that
    ignored *prev_holdings*, referenced the undefined name ``stocks`` and
    rebound its result lists to arrays inside the loop (so the second
    ``append`` failed). It never produced a trade list.

    Args:
        prev_holdings: Frame with ``ID`` and ``h.opt.previous`` columns.
        opt_result: Either the dict returned by form_optimal_portfolio (its
            ``"opt.portfolio"`` entry is used) or a frame with ``ID`` and
            ``h.opt`` columns.
        frames: Unused; kept so existing three-argument calls still work.

    Returns:
        DataFrame with columns ``ID``, ``h.opt.previous``, ``h.opt`` and
        ``h.trade``.
    """
    target = opt_result["opt.portfolio"] if isinstance(opt_result, dict) else opt_result

    # An outer merge keeps positions being closed out as well as new ones.
    tradelist = prev_holdings[["ID", "h.opt.previous"]].merge(
        target[["ID", "h.opt"]], how="outer", on="ID"
    )
    tradelist["h.opt.previous"] = tradelist["h.opt.previous"].fillna(0.0)
    tradelist["h.opt"] = tradelist["h.opt"].fillna(0.0)
    tradelist["h.trade"] = tradelist["h.opt"] - tradelist["h.opt.previous"]
    return tradelist

`Code Source: https://codingandfun.com/portfolio-optimization-with-python/`

### Problems 2 & 3
---
*Ensure that your code is structured in such a way that, since n is large, we never form an n × n matrix either in the calculation of the objective function, the calculation of the gradient, or in the Hessian used by the L-BFGS descent method.*

* Write a python function which will walk through each day, calculating the optimal portfolio weights and trade list

*APT model entails associated reductions of the first and second moments of the asset returns. Using these reductions, the APT model allows us to attribute not only risk (variance), but also return or P&L. The portfolio’s one-period P&L is given by idiosyncratic plus factor contributions.*

* Write a python function which plots the cumulative sum of the idiosyncratic contribution and risk factor contributions over time

In [None]:
# time series plot of idiosyncratic risk (variance/stdev) + alpha factors + beta factors
# aggregate of long, short, net positions + gross market value + trade volume (USD)

# Risk/return scatter of the simulated portfolios, colored by return-to-risk ratio.
plt.figure(figsize=(10, 5))
sharpe_colors = portfolio_returns / portfolio_risk
plt.scatter(portfolio_risk, portfolio_returns, c=sharpe_colors)
plt.xlabel('volatility')
plt.ylabel('returns')
plt.colorbar(label='Sharpe ratio')

`Code Source: https://codingandfun.com/portfolio-optimization-with-python/`

### Problem 4
---
* Calculate the sum of long positions, short positions, net positions, gross market value, and amount of dollars traded

* Plot them all together as time series on the same graph

In [None]:
# plot of time series graph
# aggregate of long, short, net positions + gross market value + trade volume (USD)

# Collect the simulation outputs as rows, then transpose so each portfolio is a row.
porfolio_metrics = [portfolio_returns, portfolio_risk, sharpe_ratio_port, portfolio_weights]

portfolio_dfs = pd.DataFrame(porfolio_metrics).T
portfolio_dfs.columns = ['Port Returns', 'Port Risk', 'Sharpe Ratio', 'Portfolio Weights']

# Convert the first three columns from object to float.
for col in ['Port Returns', 'Port Risk', 'Sharpe Ratio']:
    portfolio_dfs[col] = portfolio_dfs[col].astype(float)

# Select and print once, after every metric column is numeric. These lines
# used to sit inside the loop above, so they ran three times and the first
# pass looked up 'Sharpe Ratio' before it had been converted.
# idxmax/idxmin return index labels, so .loc is the matching accessor.
Highest_sharpe_port = portfolio_dfs.loc[portfolio_dfs['Sharpe Ratio'].idxmax()]
min_risk = portfolio_dfs.loc[portfolio_dfs['Port Risk'].idxmin()]

print(Highest_sharpe_port)
print(min_risk)

`Code Source: https://codingandfun.com/portfolio-optimization-with-python/`