# Crypto predictions with AutoGluon
Predict if next days Close-value will go up or down and with what probability

In [1]:
# import
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import yfinance as yf
from datetime import datetime as dt
from pandas.tseries.offsets import DateOffset
from IPython.display import display
import sys
sys.path.append('../the_new_Krypto')
import preprocess as pp # preprocess.py

# %conda activate gluon
# %conda install -c ranaroussi yfinance
# %conda install -c conda-forge matplotlib
# %pip install ipython

In [2]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [7]:
print('Versioner av softvara som används i denna notebook:')
# print numpy version
print(f'numpy=={np.__version__}')
# print pandas version
print(f'pandas=={pd.__version__}')
# print matplotlib version
print('matplotlib=={}'.format(plt.matplotlib.__version__))  # type: ignore
# print yfinance version
print('yfinance=={}'.format(yf.__version__))
# print pytreands version
print('pytrends==4.8.0  by hand')
# print python-dateutil version
print('python-dateutil==2.8.2  by hand')
# print ta version
print('ta==0.10.2  by hand')
# print streamlit version
# print('streamlit=={}'.format(st.__version__))
# print('xgboost=={}'.format(xgb.__version__))
# print('catboost=={}'.format(catboost.__version__))
# print('sklearn=={}'.format(sklearn.__version__))
# print pandas-ta version
print('pandas-ta==0.3.14b0 ??  by hand')
# print plotly version
import plotly
print(f"plotly=={plotly.__version__}")
# print pickle version
print('pickle-versionen är samma som python-versionen')
# print python versionen
print('python=={}'.format(sys.version))


Versioner av softvara som används i denna notebook:
numpy==1.24.3
pandas==2.0.3
matplotlib==3.7.2
yfinance==0.2.30
pytrends==4.8.0  by hand
python-dateutil==2.8.2  by hand
ta==0.10.2  by hand
pandas-ta==0.3.14b0 ??  by hand
plotly==5.17.0
pickle-versionen är samma som python-versionen
python==3.10.13 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:24:38) [MSC v.1916 64 bit (AMD64)]


Gold and inflation

In [8]:
### Gold #####
def get_gold_data():
    df_dates = pd.DataFrame(pd.date_range(
        '1988-12-01', pd.to_datetime('today').date()), columns=['Date'])
    df_dates.set_index('Date', inplace=True)
    # Hämta historiska guldprisdata (GLD är ticker-symbolen för SPDR Gold Shares ETF)
    gld_data = yf.download('GLD', end=dt.today().date(), progress=False)
    # gld_data.set_index('Date', inplace=True)

    # Behåll endast 'Close' priser och döp om kolumnen till 'GLDUSDT'
    gld_data = gld_data[['Close']].rename(columns={'Close': 'GLD-USD'})

    df_dates = pd.DataFrame(pd.date_range(start=gld_data.index[0], end=pd.to_datetime(  # type: ignore
        'today').date(), freq='D'), columns=['Date'])  # type: ignore

    df_dates.set_index('Date', inplace=True)
    gld_data = df_dates.merge(gld_data, how='left',
                              left_on='Date', right_index=True)
    # interpolating missing values
    gld_data.interpolate(method='linear', inplace=True)
    return gld_data


### Inflation ###

def add_horizon_columns(inflation, horizons):
    for horizon in horizons:
        inflation['US_inflation_' +
                  str(horizon)] = inflation['US_inflation'].rolling(horizon, 1).mean()
        inflation['SE_inflation_' +
                  str(horizon)] = inflation['SE_inflation'].rolling(horizon, 1).mean()
    return inflation


def initiate_data(inflation, df_dates, lang_dict, value_name):
    inflation = inflation.melt(
        id_vars=['Year'], var_name='month', value_name=value_name)
    inflation['month'] = inflation['month'].map(lang_dict)
    inflation['date'] = pd.to_datetime(inflation['Year'].astype(
        str) + '-' + inflation['month'].astype(str))
    inflation.set_index('date', inplace=True)
    inflation.drop(['Year', 'month'], axis=1, inplace=True)
    inflation = df_dates.merge(
        inflation, how='left', left_on='date', right_index=True)
    inflation.set_index('date', inplace=True)
    inflation[value_name] = inflation[value_name].astype(str)
    inflation[value_name] = inflation[value_name].str.replace(',', '.')
    inflation[value_name] = inflation[value_name].str.replace(
        chr(8209), chr(45))
    inflation[value_name] = inflation[value_name].astype(float)
    inflation[value_name].interpolate(method='linear', inplace=True)
    return inflation

def get_inflation_data():
    df_dates = pd.DataFrame(pd.date_range(
        '1988-12-01', pd.to_datetime('today').date()), columns=['date'])

    US_inflation = pd.read_html(
        'https://www.usinflationcalculator.com/inflation/current-inflation-rates/')
    US_inflation = US_inflation[0]
    US_inflation.replace(to_replace=r'^Avail.*$',
                         value=np.nan, regex=True, inplace=True)
    US_inflation.columns = US_inflation.iloc[0]
    US_inflation.drop(US_inflation.index[0], inplace=True)
    US_inflation.drop('Ave', axis=1, inplace=True)

    SE_inflation = pd.read_html(
        'https://www.scb.se/hitta-statistik/statistik-efter-amne/priser-och-konsumtion/konsumentprisindex/konsumentprisindex-kpi/pong/tabell-och-diagram/konsumentprisindex-med-fast-ranta-kpif-och-kpif-xe/kpif-12-manadersforandring/')
    SE_inflation = SE_inflation[0]
    SE_inflation.rename(columns={'År': 'Year'}, inplace=True)

    se_dict = dict(Jan='1', Feb='2', Mar='3', Apr='4', Maj='5', Jun='6',
                   Jul='7', Aug='8', Sep='9', Okt='10', Nov='11', Dec='12')
    us_dict = dict(Jan='1', Feb='2', Mar='3', Apr='4', May='5', Jun='6',
                   Jul='7', Aug='8', Sep='9', Oct='10', Nov='11', Dec='12')

    SE_inflation = initiate_data(
        SE_inflation, df_dates, se_dict, value_name='SE_inflation')
    SE_inflation['SE_inflation'] = SE_inflation['SE_inflation'] / 10

    US_inflation = initiate_data(
        US_inflation, df_dates, us_dict, value_name='US_inflation')

    inflations = pd.concat([US_inflation, SE_inflation], axis=1)
    inflations = inflations.dropna()
    # inflations = add_horizon_columns(inflations, [75, 90, 250])
    return inflations




Get Data from Yahoo Finance

In [9]:
def get_from_Yahoo_Finance(tickers, version=0, time_period='2y'):
    
    # placeholder = st.empty()  # Create an empty placeholder
    # placeholder.info(f'Getting data for {len(tickers)} crypto currencies - Please wait...')
    
    yf_data = yf.download(tickers, interval='1d',
                          period=time_period, group_by='ticker', auto_adjust=True, progress=True)

    df_curr = pd.DataFrame(yf_data.xs('Close', axis=1, level=1))
    df_vol = pd.DataFrame(yf_data.xs('Volume', axis=1, level=1))
    
    # placeholder.empty()
    
    return df_curr, df_vol


Get all the data

In [10]:
def read_ticker_names(filenam='../the_new_Krypto/yf_tickers.txt'):
    with open(filenam, 'r') as f:
        ticker_names = f.read().splitlines()
    print(f'{len(ticker_names)} yFinance ticker_names')
    return ticker_names

In [11]:
# tickers
tickers=read_ticker_names()
df_curr,df_vol = get_from_Yahoo_Finance(tickers,)
df_gold = get_gold_data()
df_inflation = get_inflation_data()

321 yFinance ticker_names
[*********************100%%**********************]  321 of 321 completed


### Preprocess data

In [12]:
print('Kolla om detta ger bättre resultat')
if True:
# interpolate missing values
    df_curr.interpolate(method='linear', inplace=True)
    df_vol.interpolate(method='linear', inplace=True)
    print('med yfinance')
    display(df_curr.head(3))
    display(df_vol.head(3))


Kolla om detta ger bättre resultat
med yfinance


Unnamed: 0_level_0,ROSE-USD,CAKE-USD,KAVA-USD,CRV-USD,AION-USD,REN-USD,TVK-USD,YFI-USD,MANA-USD,SNX-USD,...,THETA-USD,TKO-USD,NEAR-USD,ARDR-USD,DATA-USD,CFX-USD,VET-USD,ETC-USD,DGB-USD,TCT-USD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-09-29,0.146134,18.543509,5.085978,2.289199,0.13814,0.922588,0.17012,28602.537109,0.647256,9.297005,...,4.830629,1.652037,6.671943,0.252643,0.109946,0.292924,0.084675,45.595875,0.042812,0.023819
2021-09-30,0.155826,18.869083,5.386094,2.33526,0.140623,0.937705,0.180831,29179.199219,0.688232,9.590257,...,5.049012,1.717892,6.918915,0.281222,0.113575,0.305543,0.095127,46.976051,0.044691,0.025024
2021-10-01,0.197346,19.950525,5.834096,2.534003,0.154015,1.043975,0.209132,31646.339844,0.752952,10.476235,...,5.99921,1.913176,7.823894,0.310735,0.120752,0.325711,0.108098,51.923729,0.04903,0.027103


Unnamed: 0_level_0,ROSE-USD,CAKE-USD,KAVA-USD,CRV-USD,AION-USD,REN-USD,TVK-USD,YFI-USD,MANA-USD,SNX-USD,...,THETA-USD,TKO-USD,NEAR-USD,ARDR-USD,DATA-USD,CFX-USD,VET-USD,ETC-USD,DGB-USD,TCT-USD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-09-29,39487185,301239679,145365340,204193291,6092719,121696865,9332865,268447237,87819395,77949356,...,240286445,24440959,319100829,11115878,8191555,42007041,325932232,788760882,16556781,1294437
2021-09-30,52299426,321481654,145948367,222485084,6613431,114041059,12373595,323193883,88766690,93216958,...,212399093,32128276,273081176,56828415,10943519,37823025,497630517,709962936,20548292,1565576
2021-10-01,248090808,325359269,127369617,224607817,6834351,154784488,27701316,199443601,118813928,107575022,...,423868908,36718663,327776608,25162779,13720106,34838946,863753846,1202916057,24458663,2397771


Compute and add new columns


In [13]:
df_dict = {}
df_preprocessed = pp.preprocessing_currency(df_curr)
assert df_preprocessed is not None, 'df_preprocessed is None'
print('Antal NaN', df_preprocessed.isna().any().sum())

print()
df_dict = {}  # en dictionary med samtlig nya dataframes
for cnt, column_name in enumerate(df_preprocessed.columns):
    df_name = f'df_{column_name}'
    df_dict[df_name] = pp.preprocess(
        df_curr[[column_name]], df_vol[[column_name]], df_gold, df_inflation)
    print(cnt+1, column_name)

display(df_dict.keys())


Antal NaN 0

1 ROSE-USD
2 CAKE-USD
3 KAVA-USD
4 CRV-USD
5 AION-USD
6 REN-USD
7 TVK-USD
8 YFI-USD
9 MANA-USD
10 SNX-USD
11 MOB-USD
12 WRX-USD
13 BOND-USD
14 NMR-USD
15 FIS-USD
16 WAXP-USD
17 AUTO-USD
18 IOTX-USD
19 SFP-USD
20 BIFI-USD
21 ZEC-USD
22 BTCST-USD
23 1INCH-USD
24 SKL-USD
25 AMP-USD
26 ANKR-USD
27 BAR-USD
28 ETH-USD
29 LQTY-USD
30 AERGO-USD
31 ORN-USD
32 SC-USD
33 ACH-USD
34 CLV-USD
35 BNB-USD
36 ADAUP-USD
37 EOS-USD
38 RAY-USD
39 OCEAN-USD
40 KMD-USD
41 DIA-USD
42 FIL-USD
43 LTO-USD
44 ONT-USD
45 SUSD-USD
46 XLM-USD
47 LOOM-USD
48 AKRO-USD
49 SLP-USD
50 WOO-USD
51 OMG-USD
52 AVA-USD
53 ALGO-USD
54 NULS-USD
55 CTSI-USD
56 DENT-USD
57 FET-USD
58 USDC-USD
59 REQ-USD
60 CVX-USD
61 ERN-USD
62 LTC-USD
63 TRB-USD
64 TUSD-USD
65 SOL-USD
66 AXS-USD
67 POLY-USD
68 POLS-USD
69 TORN-USD
70 KEEP-USD
71 SHIB-USD
72 BURGER-USD
73 MLN-USD
74 MBL-USD
75 VTHO-USD
76 TRIBE-USD
77 STRAX-USD
78 ACM-USD
79 AUCTION-USD
80 LSK-USD
81 AVAX-USD
82 OG-USD
83 BTC-USD
84 FLOKI-USD
85 VGX-USD
86 DASH-USD


dict_keys(['df_ROSE-USD', 'df_CAKE-USD', 'df_KAVA-USD', 'df_CRV-USD', 'df_AION-USD', 'df_REN-USD', 'df_TVK-USD', 'df_YFI-USD', 'df_MANA-USD', 'df_SNX-USD', 'df_MOB-USD', 'df_WRX-USD', 'df_BOND-USD', 'df_NMR-USD', 'df_FIS-USD', 'df_WAXP-USD', 'df_AUTO-USD', 'df_IOTX-USD', 'df_SFP-USD', 'df_BIFI-USD', 'df_ZEC-USD', 'df_BTCST-USD', 'df_1INCH-USD', 'df_SKL-USD', 'df_AMP-USD', 'df_ANKR-USD', 'df_BAR-USD', 'df_ETH-USD', 'df_LQTY-USD', 'df_AERGO-USD', 'df_ORN-USD', 'df_SC-USD', 'df_ACH-USD', 'df_CLV-USD', 'df_BNB-USD', 'df_ADAUP-USD', 'df_EOS-USD', 'df_RAY-USD', 'df_OCEAN-USD', 'df_KMD-USD', 'df_DIA-USD', 'df_FIL-USD', 'df_LTO-USD', 'df_ONT-USD', 'df_SUSD-USD', 'df_XLM-USD', 'df_LOOM-USD', 'df_AKRO-USD', 'df_SLP-USD', 'df_WOO-USD', 'df_OMG-USD', 'df_AVA-USD', 'df_ALGO-USD', 'df_NULS-USD', 'df_CTSI-USD', 'df_DENT-USD', 'df_FET-USD', 'df_USDC-USD', 'df_REQ-USD', 'df_CVX-USD', 'df_ERN-USD', 'df_LTC-USD', 'df_TRB-USD', 'df_TUSD-USD', 'df_SOL-USD', 'df_AXS-USD', 'df_POLY-USD', 'df_POLS-USD', 'df_T

Concat all the DataFrames in dictionary df_dict

In [14]:
the_dict = df_dict.copy()
# Slå ihop alla df_dict till  df2 med gemnsamma kolumner
for df_name in the_dict:
    the_dict[df_name] = the_dict[df_name].rename(
        columns={the_dict[df_name].columns[0]: 'Close'})
    # reset index in df_dict[df_name] but keep Date column
    the_dict[df_name] = the_dict[df_name].reset_index(drop=False)
    # add Ticker column to df_dict[df_name]
    the_dict[df_name]['Ticker'] = df_name

df = pd.concat(the_dict.values(), axis=0)
df.sort_values(by=['Date'], inplace=True)
df.set_index('Date', inplace=True)


In [15]:
display(df)
display(df.columns)


Unnamed: 0_level_0,Close,Ratio_2,Trend_2,Ratio_5,Trend_5,Ratio_30,Trend_30,Ratio_60,Trend_60,Ratio_90,...,GLD_Ratio_2,GLD_Ratio_5,GLD_Ratio_30,GLD_Ratio_60,GLD_Ratio_90,GLD_Ratio_250,diff,before_kvot,before_up,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-09-30,0.155826,1.032097,0.155826,1.032097,0.155826,1.032097,0.155826,1.032097,0.155826,1.032097,...,1.008908,1.007786,0.988171,0.985623,0.980731,0.977999,0.000000,,0,df_ROSE-USD
2021-09-30,0.054869,1.012110,0.054869,1.012110,0.054869,1.012110,0.054869,1.012110,0.054869,1.012110,...,1.008908,1.007786,0.988171,0.985623,0.980731,0.977999,0.000000,,0,df_IOST-USD
2021-09-30,60.012264,1.025144,60.012264,1.025144,60.012264,1.025144,60.012264,1.025144,60.012264,1.025144,...,1.008908,1.007786,0.988171,0.985623,0.980731,0.977999,0.000000,,0,df_FIL-USD
2021-09-30,0.289681,1.045585,0.289681,1.045585,0.289681,1.045585,0.289681,1.045585,0.289681,1.045585,...,1.008908,1.007786,0.988171,0.985623,0.980731,0.977999,0.000000,,0,df_CHR-USD
2021-09-30,38.601673,1.060662,38.601673,1.060662,38.601673,1.060662,38.601673,1.060662,38.601673,1.060662,...,1.008908,1.007786,0.988171,0.985623,0.980731,0.977999,0.000000,,0,df_LUNC-USD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-29,0.170926,1.002362,0.170524,1.008691,0.169454,0.992165,0.172276,0.926883,0.184410,0.880918,...,1.000000,0.990406,0.973380,0.973641,0.968289,0.963412,0.050807,1.004734,1,df_RAY-USD
2023-09-29,0.003699,1.001563,0.003693,1.065521,0.003471,1.071859,0.003451,0.960766,0.003850,0.850653,...,1.000000,0.990406,0.973380,0.973641,0.968289,0.963412,0.001144,1.003130,1,df_XVG-USD
2023-09-29,232.405228,0.985618,235.796463,1.032928,224.996548,1.127011,206.213805,1.108486,209.660061,1.028565,...,1.000000,0.990406,0.973380,0.973641,0.968289,0.963412,71.580928,0.971644,0,df_BCH-USD
2023-09-29,0.018190,0.996129,0.018260,0.977951,0.018600,0.974358,0.018668,1.030718,0.017648,1.071042,...,1.000000,0.990406,0.973380,0.973641,0.968289,0.963412,0.004411,0.992287,0,df_BNBDOWN-USD


Index(['Close', 'Ratio_2', 'Trend_2', 'Ratio_5', 'Trend_5', 'Ratio_30',
       'Trend_30', 'Ratio_60', 'Trend_60', 'Ratio_90', 'Trend_90', 'Ratio_250',
       'Trend_250', 'Volume', 'vol_Ratio_2', 'vol_Ratio_5', 'vol_Ratio_30',
       'vol_Ratio_60', 'vol_Ratio_90', 'vol_Ratio_250', 'US_inflation',
       'SE_inflation', 'infl_Ratio_75', 'infl_Ratio_90', 'infl_Ratio_250',
       'GLD-USD', 'GLD_Ratio_2', 'GLD_Ratio_5', 'GLD_Ratio_30', 'GLD_Ratio_60',
       'GLD_Ratio_90', 'GLD_Ratio_250', 'diff', 'before_kvot', 'before_up',
       'Ticker'],
      dtype='object')

Compute the target column (y)

In [16]:
# set Tommorow's price to be Close next day
df['Tomorrow'] = df.groupby('Ticker')['Close'].shift(-1)

# compute the percentage change Kan inte användas i fit
df['Percentage Change'] = (df['Tomorrow'] - df['Close']) / df['Close'] * 100

df['y'] = (df['Tomorrow'] > df['Close']).astype(int)
df.dropna(inplace=True)

Create the train and test sets

In [17]:
# create train and test datasets from df
portion = 0.75
df_train = df.iloc[:int(df.shape[0]*portion)]
df_test = df.iloc[int(df.shape[0]*portion):]

# save as csv files
df_train.to_csv('df_train.csv')
df_test.to_csv('df_test.csv')

setup data for AutoGluon

In [18]:
df_train = TabularDataset('df_train.csv')
df_test = TabularDataset('df_test.csv')
df_train.columns

Index(['Date', 'Close', 'Ratio_2', 'Trend_2', 'Ratio_5', 'Trend_5', 'Ratio_30',
       'Trend_30', 'Ratio_60', 'Trend_60', 'Ratio_90', 'Trend_90', 'Ratio_250',
       'Trend_250', 'Volume', 'vol_Ratio_2', 'vol_Ratio_5', 'vol_Ratio_30',
       'vol_Ratio_60', 'vol_Ratio_90', 'vol_Ratio_250', 'US_inflation',
       'SE_inflation', 'infl_Ratio_75', 'infl_Ratio_90', 'infl_Ratio_250',
       'GLD-USD', 'GLD_Ratio_2', 'GLD_Ratio_5', 'GLD_Ratio_30', 'GLD_Ratio_60',
       'GLD_Ratio_90', 'GLD_Ratio_250', 'diff', 'before_kvot', 'before_up',
       'Ticker', 'Tomorrow', 'Percentage Change', 'y'],
      dtype='object')

In [19]:

df_test.sort_values(by=['Ticker','Date'])[['Date','Tomorrow','Close','before_kvot', 'before_up','Percentage Change','y','Ticker']].tail(10)

Unnamed: 0,Date,Tomorrow,Close,before_kvot,before_up,Percentage Change,y,Ticker
52162,2023-09-19,0.183642,0.182895,1.011459,1,0.408428,1,df_ZRX-USD
52258,2023-09-20,0.18374,0.183642,1.004084,1,0.053367,1,df_ZRX-USD
52713,2023-09-21,0.18685,0.18374,1.000534,1,1.692604,1,df_ZRX-USD
52759,2023-09-22,0.189535,0.18685,1.016926,1,1.436987,1,df_ZRX-USD
53166,2023-09-23,0.184208,0.189535,1.01437,1,-2.810563,0,df_ZRX-USD
53491,2023-09-24,0.182819,0.184208,0.971894,0,-0.754045,0,df_ZRX-USD
53694,2023-09-25,0.182523,0.182819,0.99246,0,-0.161907,0,df_ZRX-USD
53988,2023-09-26,0.184942,0.182523,0.998381,0,1.325318,1,df_ZRX-USD
54427,2023-09-27,0.195703,0.184942,1.013253,1,5.818577,1,df_ZRX-USD
54501,2023-09-28,0.196138,0.195703,1.058186,1,0.222296,1,df_ZRX-USD


In [20]:
pred_cols = ['Ticker','Ratio_2', 'Trend_2', 'Ratio_5', 'Trend_5', 'Ratio_30',
       'Trend_30', 'Ratio_60', 'Trend_60', 'Ratio_90', 'Trend_90', 'Ratio_250',
       'Trend_250', 'Volume', 'vol_Ratio_2', 'vol_Ratio_5', 'vol_Ratio_30',
       'vol_Ratio_60', 'vol_Ratio_90', 'vol_Ratio_250', 'US_inflation',
       'SE_inflation', 'infl_Ratio_75', 'infl_Ratio_90', 'infl_Ratio_250',
       'GLD-USD', 'GLD_Ratio_2', 'GLD_Ratio_5', 'GLD_Ratio_30', 'GLD_Ratio_60',
       'GLD_Ratio_90', 'GLD_Ratio_250', 'diff', 'before_kvot', 'before_up']

# TODO: Ta bort Close    
# TODO: prova med interpolate ovan  
# TODO: Lär upp på enbart min lista 
#TODO: Ta med ticker 

In [21]:
target='y'
X_train=df_train[pred_cols]
print(df_train.shape,X_train.shape)

y_train=df_train[target]

X_test=df_test[pred_cols]
print(df_test.shape,X_test.shape)
y_test=df_test[target] 


(164259, 40) (164259, 35)
(54753, 40) (54753, 35)


In [22]:
predictor = TabularPredictor(label=target).fit(df_train[pred_cols+[target]])  # target skall vara med i df_train , presets='best_quality'

No path specified. Models will be saved in: "AutogluonModels\ag-20230929_153847"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20230929_153847"
AutoGluon Version:  0.8.2
Python Version:     3.10.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   192.34 GB / 511.15 GB (37.6%)
Train Data Rows:    164259
Train Data Columns: 35
Label Column: y
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])


In [23]:
# predictor = TabularPredictor.load("AutogluonModels\ag-20230526_211500\")   
predictions = predictor.predict_proba(X_test)
predictions


Unnamed: 0,0,1
0,0.888701,0.111299
1,0.808824,0.191176
2,0.808681,0.191319
3,0.600413,0.399587
4,0.873127,0.126873
...,...,...
54748,0.734407,0.265593
54749,0.723589,0.276411
54750,0.747876,0.252124
54751,0.726895,0.273105


Validate

In [24]:
from sklearn.metrics import roc_auc_score

y_proba = predictor.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_proba.loc[:, 1])
print(f"ROC AUC: {roc_auc}")


ROC AUC: 0.5449247826250987


In [25]:
predictor.leaderboard(df_test, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT,0.542454,0.7668,0.359679,0.046876,7.313229,0.359679,0.046876,7.313229,1,True,3
1,CatBoost,0.541377,0.7696,0.106746,0.013191,84.171661,0.106746,0.013191,84.171661,1,True,7
2,RandomForestGini,0.534163,0.7756,0.976599,0.124298,115.062391,0.976599,0.124298,115.062391,1,True,5
3,WeightedEnsemble_L2,0.533578,0.7836,45.618335,3.199539,2233.217165,0.0,0.0,1.467513,2,True,14
4,RandomForestEntr,0.531021,0.7768,1.082957,0.153406,153.897933,1.082957,0.153406,153.897933,1,True,6
5,XGBoost,0.526875,0.7732,0.416266,0.033211,5.865764,0.416266,0.033211,5.865764,1,True,11
6,LightGBM,0.526072,0.7628,0.06251,0.015628,3.226544,0.06251,0.015628,3.226544,1,True,4
7,ExtraTreesEntr,0.524026,0.77,1.085034,0.130907,20.967728,1.085034,0.130907,20.967728,1,True,9
8,ExtraTreesGini,0.523241,0.7668,1.209167,0.133512,19.641374,1.209167,0.133512,19.641374,1,True,8
9,LightGBMLarge,0.52083,0.772,0.066838,0.0,2.716792,0.066838,0.0,2.716792,1,True,13


In [26]:
# Create a dataframe with two columns from y_proba.loc[:, 1] and y_test
df_pred = pd.DataFrame({'y_proba': y_proba.loc[:, 1], 'y_test': y_test})
# count how often the predicted value > 0.5 and the  actual value is 1
df_pred['correct_up'] = ((df_pred['y_proba'] > 0.5) & (df_pred['y_test'] == 1)).astype(int)
df_pred['correct_dn '] = ((df_pred['y_proba'] <= 0.5) & (df_pred['y_test'] == 0)).astype(int)
# What is the accuracy of the model?
    
print(df_pred['correct_up'].sum() / sum(df_pred['y_test']))
print(df_pred['correct_dn '].sum() / sum(df_pred['y_test']==0))
print((df_pred['correct_up'].sum()+df_pred['correct_dn '].sum()) / len(df_pred['y_test']))

df_pred

0.24577412116679132
0.8083032877592546
0.5335780687816193


Unnamed: 0,y_proba,y_test,correct_up,correct_dn
0,0.111299,0,0,1
1,0.191176,0,0,1
2,0.191319,0,0,1
3,0.399587,0,0,1
4,0.126873,0,0,1
...,...,...,...,...
54748,0.265593,1,0,0
54749,0.276411,1,0,0
54750,0.252124,0,0,1
54751,0.273105,0,0,1
