In [1]:
#pip install regressors

In [2]:
#pip install pandas_datareader

In [3]:
import pandas_datareader as wb
from pandas_datareader import data
import yfinance as yf
import requests 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from lxml import html
import datetime
from datetime import datetime
import statsmodels.api as sm
from scipy.stats import norm
from tqdm.notebook import tqdm
import os
from sklearn import linear_model

In [4]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from scipy import stats
from regressors import stats as st

In [5]:
def get_treasury_rate():
    # web-scrape the data: Daily Treasury Yield Curve Rates
    url = 'https://www.treasury.gov/resource-center/data-chart-center/interest-rates/Pages/TextView.aspx?data=yield'
    r = requests.get(url)
    html = r.text

    soup = BeautifulSoup(html)
    table = soup.find('table', {"class": "t-chart"})
    rows = table.find_all('tr')
    data = []
    for row in rows[1:]:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])

        result = pd.DataFrame(data, columns=['Date', '1 Mo', '2 Mo', '3 Mo', '6 Mo', '1 Yr', '2 Yr', '3 Yr', '5 Yr', '7 Yr', '10 Yr', '20 Yr', '30 Yr'])

    return(result)



def return_df(close_price_df):
        
    return_df = close_price_df.pct_change().apply(lambda x: np.log(1+x))
    
    return return_df

def dummies_beater(all_returns,Rf, option = "return"):
    
    all_returns = all_returns.dropna(how="all")
    Rm = all_returns["SPY"]
    Rm = Rm.fillna(0)
    comps_R = all_returns.drop(columns = ["SPY"])
    comps_R = comps_R.fillna(0)
    
    if option == "return":
        comps_R_excess = comps_R.subtract(Rm.values, axis=0)
        dummies_Return_beaters = comps_R_excess.copy()
        dummies_Return_beaters[dummies_Return_beaters >= 0] =1
        dummies_Return_beaters[dummies_Return_beaters < 0] =0
        
    elif option == "sharpe":
        market_volatility = Rm.std()
        comps_volatility = comps_R.std()
        comps_sharpe = (comps_R - Rf)/comps_volatility
        market_sharpe = (Rm - Rf)/market_volatility
        sharpe_excess = comps_sharpe.subtract(market_sharpe, axis=0)
        dummies_Return_beaters = sharpe_excess.copy()
        dummies_Return_beaters[dummies_Return_beaters >= 0] =1
        dummies_Return_beaters[dummies_Return_beaters < 0] =0
        
    return dummies_Return_beaters

def port_simulation(stock_closed_df):
    
    pct_change_df = stock_closed_df.pct_change()
    ind_er = pct_change_df.mean()
    return_df = pct_change_df.apply(lambda x: np.log(1+x))
    cov_matrix = return_df.cov()
    
    p_ret = [] 
    p_vol = [] 
    p_weights = [] 
    
    
    num_assets = len(stock_closed_df.columns)
    num_portfolios = 10000

    
    for portfolio in range(num_portfolios):
        weights = np.random.random(num_assets)
        weights = weights/np.sum(weights)
        p_weights.append(weights)
        returns = np.dot(weights, ind_er)  
        p_ret.append(returns)
        var = cov_matrix.mul(weights, axis=0).mul(weights, axis=1).sum().sum()
        sd = np.sqrt(var)  
        p_vol.append(sd)
    
    data = {'Returns':p_ret, 'Volatility':p_vol}
 
    for counter, symbol in enumerate(stock_closed_df.columns.tolist()):
        data[symbol] = [w[counter] for w in p_weights]
        portfolios  = pd.DataFrame(data)
        
    return portfolios

def portfolios_return_df(equity_selection, all_return):
    
    simulation = port_simulation(equity_selection)
    weight_possibility = simulation.drop(columns=["Returns", "Volatility"])
    portfolio_return_df = pd.DataFrame(columns = weight_possibility.index)
    port_comps_return = all_return[weight_possibility.columns]
    
    for portfolio in tqdm(weight_possibility.index):
        portfolio_return = weight_possibility.iloc[portfolio].mul(port_comps_return).T.sum()
        portfolio_return_df[portfolio] = portfolio_return
    
    return portfolio_return_df


In [6]:
def FF_assign_label(index_comp_info):
        
        
    index_comp_info["bookToMarket"] = 1/index_comp_info["PB_ratio"]
    index_comp_info["Small_Big_Cap"] = index_comp_info["mkt_cap"].map(lambda x: "B" if x >= index_comp_info["mkt_cap"].median() else "S")
    
    lower, upper = index_comp_info["bookToMarket"].quantile([0.3, 0.7])
    index_comp_info["HML_BP"] = index_comp_info["bookToMarket"].map(lambda x: "H" if x >= upper else "M")
    index_comp_info["HML_BP"] = index_comp_info.apply(lambda row: "L" if row["bookToMarket"] <= lower else row["HML_BP"], axis = 1)
    
    lower_roe, upper_roe = index_comp_info["ROE"].quantile([0.3, 0.7])
    index_comp_info["RNW_ROE"] = index_comp_info["ROE"].map(lambda x: "R" if x >= upper_roe else "N")
    index_comp_info["RNW_ROE"] = index_comp_info.apply(lambda row: "W" if row["ROE"] <= lower_roe else row["RNW_ROE"], axis = 1)
    
    lower_invest, upper_invest = index_comp_info["Asset_growth"].quantile([0.3, 0.7])
    index_comp_info["ANC_investment"] = index_comp_info["Asset_growth"].map(lambda x: "A" if x >= upper_invest else "N")
    index_comp_info["ANC_investment"] = index_comp_info.apply(lambda row: "C" if row["Asset_growth"] <= lower_invest else row["ANC_investment"], axis = 1)
            
    return index_comp_info
        
def FF_factor_classifier(index_comp_info_with_label):
        
    data = index_comp_info_with_label
    Small_Low = data.query('(Small_Big_Cap=="S") & (HML_BP=="L")')
    Small_Mid = data.query('(Small_Big_Cap=="S") & (HML_BP=="M")')
    Small_High = data.query('(Small_Big_Cap=="S") & (HML_BP=="H")')
    
    Small_Weak = data.query('(Small_Big_Cap=="S") & (RNW_ROE=="W")')
    Small_Neutral_Profit = data.query('(Small_Big_Cap=="S") & (RNW_ROE=="N")')
    Small_Robust = data.query('(Small_Big_Cap=="S") & (RNW_ROE=="R")')
    
    Small_Conservative =  data.query('(Small_Big_Cap=="S") & (ANC_investment=="C")')
    Small_Neutral_Invest =  data.query('(Small_Big_Cap=="S") & (ANC_investment=="N")')
    Small_Aggresive =  data.query('(Small_Big_Cap=="S") & (ANC_investment=="A")')
    
    Big_Low = data.query('(Small_Big_Cap=="B") & (HML_BP=="L")')
    Big_Mid = data.query('(Small_Big_Cap=="B") & (HML_BP=="M")')
    Big_High = data.query('(Small_Big_Cap=="B") & (HML_BP=="H")')
    
    Big_Weak = data.query('(Small_Big_Cap=="B") & (RNW_ROE=="W")')
    Big_Neutral_Profit = data.query('(Small_Big_Cap=="B") & (RNW_ROE=="N")')
    Big_Robust = data.query('(Small_Big_Cap=="B") & (RNW_ROE=="R")')

    Big_Conservative =  data.query('(Small_Big_Cap=="B") & (ANC_investment=="C")')
    Big_Neutral_Invest = data.query('(Small_Big_Cap=="B") & (ANC_investment=="N")')
    Big_Aggresive =  data.query('(Small_Big_Cap=="B") & (ANC_investment=="A")')
    
    each_groups_list = [Small_Low, Small_Mid, Small_High, 
                            Small_Weak, Small_Neutral_Profit, Small_Robust,
                            Small_Conservative, Small_Neutral_Invest, Small_Aggresive,
                            Big_Low, Big_Mid,Big_High,
                            Big_Weak, Big_Neutral_Profit, Big_Robust,
                            Big_Conservative, Big_Neutral_Invest, Big_Aggresive]
        
    return each_groups_list
    
def FF_classes_return(market_components_return, list_of_group_info, axis=True):
        
    groups_names = ["Small_Low", "Small_Mid", "Small_High",
                        "Small_Weak", "Small_Neutral_Profit", "Small_Robust",
                        "Small_Cons", "Small_Neutral_Invest", "Small_Aggr",
                        "Big_Low", "Big_Mid","Big_High",
                        "Big_Weak", "Big_Neutral_Profit", "Big_Robust",
                        "Big_Cons", "Big_Neutral_Invest", "Big_Aggr"]
    
    df_groups = pd.DataFrame(columns = groups_names)
    
    counter = 0
    
    for group in list_of_group_info:
    
        group_cap = group["mkt_cap"].T
        group_total_cap = group["mkt_cap"].sum()
        group_cap_multi_return = group_cap*market_components_return[list(group.index)]
        
        if axis == True:
            df_groups[groups_names[counter]] = group_cap_multi_return.apply(lambda row: row.sum()/group_total_cap, axis=1)
        
        else:
            groups_index_return = group_cap_multi_return.sum()/group_total_cap
            df_groups[groups_names[counter]] = [groups_index_return]
    
        counter += 1
                
    return df_groups
    
def FF_calc_factors(classes_return_df, df = True):
    
    factor_name = ["SMB", "HML", "RMW", "CMA"]
    
    SMB_BP = (classes_return_df["Small_Low"] + classes_return_df["Small_Mid"] 
                      + classes_return_df["Small_High"]) - (classes_return_df["Big_Low"]
                      + classes_return_df["Big_Mid"] + classes_return_df["Big_High"])/3
    
    SMB_PFT = (classes_return_df["Small_Weak"] + classes_return_df["Small_Neutral_Profit"] 
                      + classes_return_df["Small_Robust"]) - (classes_return_df["Big_Weak"]
                      + classes_return_df["Big_Neutral_Profit"] + classes_return_df["Big_Robust"])/3
    
    SMB_INV = (classes_return_df["Small_Cons"] + classes_return_df["Small_Neutral_Invest"] 
                      + classes_return_df["Small_Aggr"]) - (classes_return_df["Big_Cons"]
                      + classes_return_df["Big_Neutral_Invest"] + classes_return_df["Big_Aggr"])/3
    
    if df == True:
        
        FF_factors_data = pd.DataFrame(columns = factor_name)
        
    
    FF_factors_data["SMB"] = (SMB_BP + SMB_PFT + SMB_INV)/3
    
    FF_factors_data["HML"] = (classes_return_df["Small_High"] + classes_return_df["Big_High"]
                      - (classes_return_df["Small_Low"] + classes_return_df["Big_Low"])) / 2
    
    FF_factors_data["RMW"] = (classes_return_df["Small_Robust"] + classes_return_df["Big_Robust"]
                      - (classes_return_df["Small_Weak"] + classes_return_df["Big_Weak"])) / 2
    
    FF_factors_data["CMA"] = (classes_return_df["Small_Cons"] + classes_return_df["Big_Cons"]
                      - (classes_return_df["Small_Aggr"] + classes_return_df["Big_Aggr"])) / 2
        
    return FF_factors_data

    
def FF_regress(FF_factors_df, target_comp_risk_premium):
            
    y = target_comp_risk_premium
    X = FF_factors_df
    model = linear_model.LinearRegression()
    model.fit(X.astype(float), y.astype(float))
    params = np.append(model.intercept_,model.coef_)
    predictions = model.predict(X)

    newX = pd.DataFrame({"Constant":np.ones(len(X))}).join(pd.DataFrame(X.reset_index(drop=True)))
    MSE = (sum((y-predictions)**2))/(len(newX)-len(newX.columns))

    # Note if you don't want to use a DataFrame replace the two lines above with
    # newX = np.append(np.ones((len(X),1)), X, axis=1)
    # MSE = (sum((y-predictions)**2))/(len(newX)-len(newX[0]))

    var_b = MSE*(np.linalg.inv(np.dot(newX.T,newX)).diagonal())
    sd_b = np.sqrt(var_b)
    ts_b = params/ sd_b
    #print(newX)
    p_values =[2*(1-stats.t.cdf(np.abs(i),(len(newX)-len(X.columns)))) for i in ts_b]

    sd_b = np.round(sd_b,3)
    ts_b = np.round(ts_b,3)
    p_values = np.round(p_values,3)
    params = np.round(params,4)

    myDF3 = pd.DataFrame()
    myDF3["Coefficients"],myDF3["Standard_Errors"],myDF3["t_stat"],myDF3["P_value"] = [params,sd_b,ts_b,p_values]
    
    return myDF3
    

def regressAll(FF_factors, all_returns, days, train_base=2000, Rf=0.002):
    
    
    dummies_Return_beaters = dummies_beater(all_returns, Rf)
    
    prob_10obs = dummies_Return_beaters.iloc[days - 10 : days].mean().T
    prob_20obs = dummies_Return_beaters.iloc[days - 20 : days].mean().T
    prob_30obs = dummies_Return_beaters.iloc[days - 30 : days].mean().T

    target = dummies_Return_beaters.iloc[days+1]
    
    features_df = pd.DataFrame(columns = ["const",'B1', 'B2', 'B3', 'B4', 'B5',"Pval_C", "Pval1", "Pval2", "Pval3", "Pval4", "Pval5"],
                               index = all_returns.columns)
    
    comps_R = all_returns.drop(columns = ["SPY"]).fillna(0)
    FF_factors = FF_factors.fillna(0)
    for i in comps_R:
        
        regression_stat = FF_regress(FF_factors.iloc[days-train_base:days], comps_R[i].iloc[days-train_base:days])
        betas = list(regression_stat["Coefficients"])
        Pvals = list(regression_stat["P_value"])
        features = betas + Pvals
        features_df.loc[i] = features
    
    features_df["prob_10obs"], features_df["prob_20obs"], features_df["prob_30obs"] = [prob_10obs, prob_20obs, prob_30obs]
    features_df["true_target"] = target
    
    
    return features_df


    

Frequency: 30 min and 2 min

Time series Beta estimation: Measure 5 betas for each 30 min, using 2 min frequency. (Need to know how betas change every 30 min)

Plan B:
1) Use historical return data (2m or higher frequency) of 505 companies in SP500 index to run FF for each company.

2) Calculate probability of beating market in certain period (if APPL 2m Sharpe beated the market 5 times in last 30 min, prob of beating is 5/15 = 0.333).

3) put 1) and 2) together to get a all companies' prob of winning in given period and their Betas.

4) Model probability from 3)

5) Caculate Betas for all 10,000 simulated potfolio

6) get the probability of winning

How to make it predictive?

Model Construction: This may not work

1) model the probability of 30 min winning with 5 betas(5 features) we eatimate every 30 min. (can use SVM, decision tree, regression...)

2) measure betas of winner-loser portfolios spread for same time length for our portolios

3) plug betas of portfolios to the model in 1)

4) model true prob of winning of portfolios on fitted value from 3)

5) make analyical model to preidictive model

The reason we estimate the probability of picking a winner portfolio with given beta is because as we move from Time(T) T1 to T2, the stock price changes, so our weight also changes. We are essensially move from one simulated portfolio to another simulated portfolio as the time goes.


What ML can apply here?

What would the result be if use unsupervised grouping to caculate factors? (Maybe useful when FF doesn't explain well in term of R^2)

Any potential non-linear relationship?

How to optimize time scale to ensure the highest predictive ability? or would models' ability consistant accross time series mapping?

How to test predictivity?

Any trade off between high winning prob and portfolio sharpe?

# Data Collection

In [7]:
print(os.getcwd())
print(os.listdir(os.getcwd()))

/home/bedu/CSS100_Project
['CSS100.ipynb', '.ipynb_checkpoints', 'SPYn500_30m_close.csv', 'Fama_French_info.csv', 'winner_loser_spread_data.csv', '.git']


In [8]:
#os.chdir("/Users/liuhengjia/Desktop")

In [10]:
FF_info = pd.read_csv("Fama_French_info.csv", index_col = 0)

In [11]:
str1 = ' '
index_list = FF_info.index.tolist()
index_list.append("SPY")
#index_list.reverse()
total_string = str1.join(index_list)

In [12]:
#SPYn500_30m_df = yf.download(total_string, start = '2021-10-11', end = "2021-11-18", interval = '30m')
#SPYn500_2m_df = yf.download(total_string, start = '2021-10-11', end = "2021-11-18", interval = '2m')

In [13]:
#SPYn500_30m_df["Adj Close"].to_csv("SPYn500_30m_close.csv")
#SPYn500_2m_df["Adj Close"].to_csv("SPYn500_2m_close.csv")

In [16]:
sp_2m_close = pd.read_csv("SPYn500_30m_close.csv", index_col = 0)
#sp_30m_close = pd.read_csv("data/SPYn500_30m_close.csv", index_col = 0)

In [17]:
sp_2m_close.iloc[:3]

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,XEL,XLNX,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-10-11 09:30:00-04:00,151.880005,20.120001,214.960007,143.929993,111.239998,122.559998,335.290009,118.489998,327.940002,578.719971,...,62.715,157.759995,62.610001,58.150002,120.910004,123.790001,146.869995,499.0,64.120003,198.039993
2021-10-11 10:00:00-04:00,151.809998,20.155001,215.559998,144.701096,111.160004,123.07,337.554993,118.800003,328.140015,582.02002,...,62.93,158.820007,62.669998,57.970001,120.809998,123.660004,147.75,500.059998,64.269997,198.820007
2021-10-11 10:30:00-04:00,152.039993,20.245001,214.880005,144.445007,111.135002,123.080002,337.950012,118.666397,328.119995,580.059998,...,62.790001,158.580002,62.595001,58.029999,120.485001,123.519997,146.75,500.789612,63.959999,198.339996


# Calculate log-return (assuming return is log-normal) for both time interval

In [18]:
total_2m_return = return_df(sp_2m_close)
#total_30m_return = return_df(sp_30m_close)

In [19]:
total_2m_return = total_2m_return.dropna(how = "all")

In [20]:
total_2m_return.iloc[:3]

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,XEL,XLNX,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-10-11 10:00:00-04:00,-0.000461,0.001738,0.002787,0.005343,-0.000719,0.004153,0.006733,0.002613,0.00061,0.005686,...,0.003422,0.006697,0.000958,-0.0031,-0.000827,-0.001051,0.005974,0.002122,0.002337,0.003931
2021-10-11 10:30:00-04:00,0.001514,0.004455,-0.00316,-0.001771,-0.000225,8.1e-05,0.00117,-0.001125,-6.1e-05,-0.003373,...,-0.002227,-0.001512,-0.001197,0.001034,-0.002694,-0.001133,-0.006791,0.001458,-0.004835,-0.002417
2021-10-11 11:00:00-04:00,-0.001712,0.001589,-0.005647,3.5e-05,-0.000495,0.001908,-0.002637,0.000115,-0.000762,-1.7e-05,...,-0.004469,-0.000568,0.002473,-0.003452,-0.000706,-0.001539,-0.005329,0.000899,0.00203,-0.001312


# Defining components return, market return, dummies beater

In [21]:
sp500_2m_return = total_2m_return.drop(columns=["SPY"])

In [22]:
#Rm_30m = total_30m_return[['SPY']]
Rm_2m = total_2m_return[['SPY']]
Rm_2m = Rm_2m.dropna(how="all")
#We need to discount Monthly Rf to 2m and 30m Rf using yield curve(continuously compounded)
Rf = 0.000001

In [23]:
dummies_2m_beater = dummies_beater(total_2m_return, Rf, option="sharpe")

## Feature Engineering

### Calculate FF factors

In [24]:
FF_info_labeled = FF_assign_label(FF_info)
FF_classified_list = FF_factor_classifier(FF_info_labeled)
FF_2m_classes_return = FF_classes_return(sp500_2m_return,FF_classified_list)
FF_2m_factor = FF_calc_factors(FF_2m_classes_return)
FF_2m_factor["Rm-Rf"] = Rm_2m - Rf

In [25]:
#equity_selection = sp_2m_close.drop(columns=["SPY"]).sample(n=10, axis="columns")
#portfolio_2m_return = portfolios_return_df(equity_selection, total_2m_return)
#portfolio_2m_return.to_csv("portfolio_2m_return.cvs")

In [26]:
portfolio_2m_return = pd.read_csv("data/portfolio_2m_return.csv", index_col = 0)

FileNotFoundError: [Errno 2] No such file or directory: 'data/portfolio_2m_return.csv'

In [None]:
portfolio_2m_return

In [28]:
portfolio_2m_return_Rm = portfolio_2m_return.copy()
portfolio_2m_return_Rm["SPY"] = total_2m_return["SPY"]

NameError: name 'portfolio_2m_return' is not defined

In [None]:
def performence_df(FF_2m_factor,total_2m_return, portfolio_2m_return_Rm):

    performence = pd.DataFrame(columns=["precision", "recall", "f1"])

    days = 2000

    while days < len(portfolio_2m_return_Rm)-1:
    
        portfolio_feature = regressAll(FF_2m_factor,portfolio_2m_return_Rm, days ,2000)
        
        portfolio_feature = portfolio_feature.drop(index=["SPY"]).fillna(0)
        
        XYtrain = regressAll(FF_2m_factor,total_2m_return,days,2000)
        XYtrain = XYtrain.fillna(0)
        
        
        sgd_clf = SGDClassifier(random_state = 42)
        model = sgd_clf.fit(XYtrain.drop(columns=["true_target"]),XYtrain["true_target"])
    
        prediction = model.predict(portfolio_feature.drop(columns=["true_target"]))
        portfolio_feature["prediction"] = prediction
    
        precision = precision_score(portfolio_feature["true_target"], portfolio_feature["prediction"])
        recall = recall_score(portfolio_feature["true_target"], portfolio_feature["prediction"])
        f1 = f1_score(portfolio_feature["true_target"], portfolio_feature["prediction"])
    
        score_series = pd.Series([precision, recall, f1], index = performence.columns)
        performence = performence.append(score_series, ignore_index=True)
    
        days += 1
    
    return performence

In [27]:
p1 = performence_df(FF_2m_factor,total_2m_return, portfolio_2m_return_Rm)

NameError: name 'performence_df' is not defined

In [None]:
sgd_clf = SGDClassifier(random_state = 42)
model = sgd_clf.fit(XYtrain.drop(columns=["true_target"]),XYtrain["true_target"])

## Feature Engineering

In [None]:
sp500_2m_return = total_2m_return.dropna(how = "all")

In [None]:
#combine betas measured above with 15 min prob of beating

In [None]:
#Regress prob of beating on observed betas

In [None]:
time_length = range(len(dummies_2m_beaters.index[:1000]))
winner_rate_list = []
for length in tqdm(time_length):
    winner_rate = dummies_2m_beaters["GE"].iloc[:length].mean()
    winner_rate_list.append(winner_rate)
    
plt.scatter(time_length, winner_rate_list, s=3)

In [None]:
FF_regress(FF_2m_factor.fillna(0),sp500_2m_return["AAPL"].fillna(0)-Rf )

In [None]:
regressAll(FF_2m_factor.fillna(0), total_2m_return.fillna(0)-Rf, days=2000)

## Binary Classification and Ensemble Learning

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [30]:
def binary_classification(features, target_variables):
    # ensemble methods: all binary classification
    #ensemble modeling
    X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

        # 3 binary classifications
    log_clf = LogisticRegression(solver="lbfgs", random_state=42)
    rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
    svm_clf = SVC(gamma="scale", random_state=42)

    voting_clf = VotingClassifier(
        estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
        voting='soft')
        # model averaging
        # soft voting
    return (log_clf + rnd_clf + svm_clf)/3
        # if 1 is more, choose one;


In [32]:
binary_classification(total_2m_return,None)

TypeError: unsupported operand type(s) for +: 'LogisticRegression' and 'RandomForestClassifier'