# Crypto predictions with AutoGluon
Predict if next days Close-value will go up or down and with what probability

In [1]:
# import
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
import yfinance as yf
from datetime import datetime as dt
from pandas.tseries.offsets import DateOffset
import sys
sys.path.append('../the_new_Krypto')
import preprocess as pp # preprocess.py

# !conda activate gluon
# conda install -c ranaroussi yfinance


In [2]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [3]:
print('Versioner av softvara som används i denna notebook:')
# print numpy version
print(f'numpy=={np.__version__}')
# print pandas version
print(f'pandas=={pd.__version__}')
# print matplotlib version
print('matplotlib=={}'.format(plt.matplotlib.__version__))  # type: ignore
# print yfinance version
print('yfinance=={}'.format(yf.__version__))
# print pytreands version
print('pytrends==4.8.0  by hand')
# print python-dateutil version
print('python-dateutil==2.8.2  by hand')
# print ta version
print('ta==0.10.2  by hand')
# print streamlit version
# print('streamlit=={}'.format(st.__version__))
# print('xgboost=={}'.format(xgb.__version__))
# print('catboost=={}'.format(catboost.__version__))
# print('sklearn=={}'.format(sklearn.__version__))
# print pandas-ta version
print('pandas-ta==0.3.14b0  by hand')
# print plotly version
print('plotly==5.3.1  by hand')
# print pickle version
print('pickle-versionen är samma som python-versionen')
# print python versionen
print('python=={}'.format(sys.version))


Versioner av softvara som används i denna notebook:
numpy==1.24.3
pandas==1.5.3
matplotlib==3.7.1
yfinance==0.2.18
pytrends==4.8.0  by hand
python-dateutil==2.8.2  by hand
ta==0.10.2  by hand
pandas-ta==0.3.14b0  by hand
plotly==5.3.1  by hand
pickle-versionen är samma som python-versionen
python==3.10.11 | packaged by conda-forge | (main, May 10 2023, 18:51:25) [MSC v.1934 64 bit (AMD64)]


Gold and inflation

In [4]:
### Gold #####
def get_gold_data():
    df_dates = pd.DataFrame(pd.date_range(
        '1988-12-01', pd.to_datetime('today').date()), columns=['Date'])
    df_dates.set_index('Date', inplace=True)
    # Hämta historiska guldprisdata (GLD är ticker-symbolen för SPDR Gold Shares ETF)
    gld_data = yf.download('GLD', end=dt.today().date(), progress=False)
    # gld_data.set_index('Date', inplace=True)

    # Behåll endast 'Close' priser och döp om kolumnen till 'GLDUSDT'
    gld_data = gld_data[['Close']].rename(columns={'Close': 'GLD-USD'})

    df_dates = pd.DataFrame(pd.date_range(start=gld_data.index[0], end=pd.to_datetime(  # type: ignore
        'today').date(), freq='D'), columns=['Date'])  # type: ignore

    df_dates.set_index('Date', inplace=True)
    gld_data = df_dates.merge(gld_data, how='left',
                              left_on='Date', right_index=True)
    # interpolating missing values
    gld_data.interpolate(method='linear', inplace=True)
    return gld_data


### Inflation ###

def add_horizon_columns(inflation, horizons):
    for horizon in horizons:
        inflation['US_inflation_' +
                  str(horizon)] = inflation['US_inflation'].rolling(horizon, 1).mean()
        inflation['SE_inflation_' +
                  str(horizon)] = inflation['SE_inflation'].rolling(horizon, 1).mean()
    return inflation


def initiate_data(inflation, df_dates, lang_dict, value_name):
    inflation = inflation.melt(
        id_vars=['Year'], var_name='month', value_name=value_name)
    inflation['month'] = inflation['month'].map(lang_dict)
    inflation['date'] = pd.to_datetime(inflation['Year'].astype(
        str) + '-' + inflation['month'].astype(str))
    inflation.set_index('date', inplace=True)
    inflation.drop(['Year', 'month'], axis=1, inplace=True)
    inflation = df_dates.merge(
        inflation, how='left', left_on='date', right_index=True)
    inflation.set_index('date', inplace=True)
    inflation[value_name] = inflation[value_name].astype(str)
    inflation[value_name] = inflation[value_name].str.replace(',', '.')
    inflation[value_name] = inflation[value_name].str.replace(
        chr(8209), chr(45))
    inflation[value_name] = inflation[value_name].astype(float)
    inflation[value_name].interpolate(method='linear', inplace=True)
    return inflation

def get_inflation_data():
    df_dates = pd.DataFrame(pd.date_range(
        '1988-12-01', pd.to_datetime('today').date()), columns=['date'])

    US_inflation = pd.read_html(
        'https://www.usinflationcalculator.com/inflation/current-inflation-rates/')
    US_inflation = US_inflation[0]
    US_inflation.replace(to_replace=r'^Avail.*$',
                         value=np.nan, regex=True, inplace=True)
    US_inflation.columns = US_inflation.iloc[0]
    US_inflation.drop(US_inflation.index[0], inplace=True)
    US_inflation.drop('Ave', axis=1, inplace=True)

    SE_inflation = pd.read_html(
        'https://www.scb.se/hitta-statistik/statistik-efter-amne/priser-och-konsumtion/konsumentprisindex/konsumentprisindex-kpi/pong/tabell-och-diagram/konsumentprisindex-med-fast-ranta-kpif-och-kpif-xe/kpif-12-manadersforandring/')
    SE_inflation = SE_inflation[0]
    SE_inflation.rename(columns={'År': 'Year'}, inplace=True)

    se_dict = dict(Jan='1', Feb='2', Mar='3', Apr='4', Maj='5', Jun='6',
                   Jul='7', Aug='8', Sep='9', Okt='10', Nov='11', Dec='12')
    us_dict = dict(Jan='1', Feb='2', Mar='3', Apr='4', May='5', Jun='6',
                   Jul='7', Aug='8', Sep='9', Oct='10', Nov='11', Dec='12')

    SE_inflation = initiate_data(
        SE_inflation, df_dates, se_dict, value_name='SE_inflation')
    SE_inflation['SE_inflation'] = SE_inflation['SE_inflation'] / 10

    US_inflation = initiate_data(
        US_inflation, df_dates, us_dict, value_name='US_inflation')

    inflations = pd.concat([US_inflation, SE_inflation], axis=1)
    inflations = inflations.dropna()
    # inflations = add_horizon_columns(inflations, [75, 90, 250])
    return inflations




Get Data from Yahoo Finance

In [5]:
def get_from_Yahoo_Finance(tickers, version=0, time_period='2y'):
    
    # placeholder = st.empty()  # Create an empty placeholder
    # placeholder.info(f'Getting data for {len(tickers)} crypto currencies - Please wait...')
    
    yf_data = yf.download(tickers, interval='1d',
                          period=time_period, group_by='ticker', auto_adjust=True, progress=True)

    df_curr = pd.DataFrame(yf_data.xs('Close', axis=1, level=1))
    df_vol = pd.DataFrame(yf_data.xs('Volume', axis=1, level=1))
    
    # placeholder.empty()
    
    return df_curr, df_vol


Get all the data

In [6]:
def read_ticker_names(filenam='../the_new_Krypto/yf_tickers.txt'):
    with open(filenam, 'r') as f:
        ticker_names = f.read().splitlines()
    print(f'{len(ticker_names)} yFinance ticker_names')
    return ticker_names

In [7]:
# tickers
tickers=read_ticker_names()
df_curr,df_vol = get_from_Yahoo_Finance(tickers,)
df_gold = get_gold_data()
df_inflation = get_inflation_data()

321 yFinance ticker_names
[*********************100%***********************]  321 of 321 completed


### Preprocess data

In [8]:
print('Kolla om detta ger bättre resultat')
if True:
# interpolate missing values
    df_curr.interpolate(method='linear', inplace=True)
    df_vol.interpolate(method='linear', inplace=True)
    print('med yfinance')
    display(df_curr.head(3))
    display(df_vol.head(3))


Kolla om detta ger bättre resultat
med yfinance


Unnamed: 0_level_0,GNO-USD,RIF-USD,ACM-USD,WAVES-USD,MOB-USD,BURGER-USD,CELO-USD,UMA-USD,EPS-USD,ALICE-USD,...,FLM-USD,ADA-USD,AMB-USD,CVX-USD,MFT-USD,NBS-USD,XVG-USD,1INCH-USD,FTT-USD,CVP-USD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-05-27,191.525055,0.209609,0.03012,16.39887,14.627278,7.410538,3.599451,14.530734,0.216379,7.753805,...,0.592306,1.64596,0.038831,7.910041,0.011006,0.014858,0.027811,3.255687,35.836426,1.465102
2021-05-28,171.599838,0.19911,0.030496,14.068902,16.628323,6.413979,3.413656,13.25295,0.434399,6.890769,...,0.502038,1.511987,0.033906,7.026287,0.009392,0.012849,0.024012,2.759031,32.254406,1.227232
2021-05-29,160.975143,0.191774,0.02828,13.12143,17.362625,6.222192,3.442431,12.181506,0.421569,5.964174,...,0.46367,1.406232,0.03219,6.195031,0.008963,0.012092,0.02279,2.569586,30.340425,1.125489


Unnamed: 0_level_0,GNO-USD,RIF-USD,ACM-USD,WAVES-USD,MOB-USD,BURGER-USD,CELO-USD,UMA-USD,EPS-USD,ALICE-USD,...,FLM-USD,ADA-USD,AMB-USD,CVX-USD,MFT-USD,NBS-USD,XVG-USD,1INCH-USD,FTT-USD,CVP-USD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-05-27,1796402,1572030,12773,151996006,3062254,40219200,21038142,22594089,25283,92816410,...,23805175,4266751905,716516,1400783,5538926,4837811,21529190,107592569,99382679,3801057
2021-05-28,2459218,2912272,15122,162062799,5396921,43985101,22065402,25619326,34073,70528790,...,22113082,5373114660,706496,1000673,5065282,3262131,24733175,124555543,148845405,3924341
2021-05-29,1879760,1432368,21995,118902148,2920052,24143428,28023612,21517637,23379,38833587,...,14435546,3902596499,486407,2160795,2968137,2648372,16391510,101283056,110270744,2817613


Compute and add new columns


In [9]:
df_dict = {}
df_preprocessed = pp.preprocessing_currency(df_curr)
assert df_preprocessed is not None, 'df_preprocessed is None'
print('Antal NaN', df_preprocessed.isna().any().sum())

print()
df_dict = {}  # en dictionary med samtlig nya dataframes
for cnt, column_name in enumerate(df_preprocessed.columns):
    df_name = f'df_{column_name}'
    df_dict[df_name] = pp.preprocess(
        df_curr[[column_name]], df_vol[[column_name]], df_gold, df_inflation)
    print(cnt+1, column_name)

display(df_dict.keys())


Antal NaN 0

1 GNO-USD
2 RIF-USD
3 ACM-USD
4 WAVES-USD
5 MOB-USD
6 BURGER-USD
7 CELO-USD
8 UMA-USD
9 EPS-USD
10 ALICE-USD
11 FIL-USD
12 ANT-USD
13 XNO-USD
14 WRX-USD
15 YFI-USD
16 NEAR-USD
17 AUDIO-USD
18 REEF-USD
19 RPL-USD
20 ARB-USD
21 BADGER-USD
22 OGN-USD
23 IRIS-USD
24 SUSHI-USD
25 ICX-USD
26 DOGE-USD
27 XLM-USD
28 BNB-USD
29 OG-USD
30 BAR-USD
31 ZEN-USD
32 ALGO-USD
33 COCOS-USD
34 TVK-USD
35 RAY-USD
36 RLC-USD
37 MBOX-USD
38 QNT-USD
39 BAKE-USD
40 CFX-USD
41 HARD-USD
42 LTC-USD
43 SXP-USD
44 OXT-USD
45 YFII-USD
46 CELR-USD
47 BAL-USD
48 FIO-USD
49 XMR-USD
50 AVAX-USD
51 SYS-USD
52 GHST-USD
53 BNX-USD
54 MITH-USD
55 LOOM-USD
56 BETH-USD
57 BTT-USD
58 MKR-USD
59 WIN-USD
60 ILV-USD
61 BLZ-USD
62 NEO-USD
63 DOCK-USD
64 SCRT-USD
65 PHB-USD
66 ARDR-USD
67 LIT-USD
68 GAS-USD
69 BCH-USD
70 MLN-USD
71 EGLD-USD
72 APE-USD
73 TLM-USD
74 BNBDOWN-USD
75 ADAUP-USD
76 BAND-USD
77 PERP-USD
78 CHZ-USD
79 DGB-USD
80 ATA-USD
81 ARPA-USD
82 AVA-USD
83 KEY-USD
84 TRU-USD
85 STMX-USD
86 LSK-USD
87 AR

dict_keys(['df_GNO-USD', 'df_RIF-USD', 'df_ACM-USD', 'df_WAVES-USD', 'df_MOB-USD', 'df_BURGER-USD', 'df_CELO-USD', 'df_UMA-USD', 'df_EPS-USD', 'df_ALICE-USD', 'df_FIL-USD', 'df_ANT-USD', 'df_XNO-USD', 'df_WRX-USD', 'df_YFI-USD', 'df_NEAR-USD', 'df_AUDIO-USD', 'df_REEF-USD', 'df_RPL-USD', 'df_ARB-USD', 'df_BADGER-USD', 'df_OGN-USD', 'df_IRIS-USD', 'df_SUSHI-USD', 'df_ICX-USD', 'df_DOGE-USD', 'df_XLM-USD', 'df_BNB-USD', 'df_OG-USD', 'df_BAR-USD', 'df_ZEN-USD', 'df_ALGO-USD', 'df_COCOS-USD', 'df_TVK-USD', 'df_RAY-USD', 'df_RLC-USD', 'df_MBOX-USD', 'df_QNT-USD', 'df_BAKE-USD', 'df_CFX-USD', 'df_HARD-USD', 'df_LTC-USD', 'df_SXP-USD', 'df_OXT-USD', 'df_YFII-USD', 'df_CELR-USD', 'df_BAL-USD', 'df_FIO-USD', 'df_XMR-USD', 'df_AVAX-USD', 'df_SYS-USD', 'df_GHST-USD', 'df_BNX-USD', 'df_MITH-USD', 'df_LOOM-USD', 'df_BETH-USD', 'df_BTT-USD', 'df_MKR-USD', 'df_WIN-USD', 'df_ILV-USD', 'df_BLZ-USD', 'df_NEO-USD', 'df_DOCK-USD', 'df_SCRT-USD', 'df_PHB-USD', 'df_ARDR-USD', 'df_LIT-USD', 'df_GAS-USD', 'df

Concat all the DataFrames in dictionary df_dict

In [10]:
the_dict = df_dict.copy()
# Slå ihop alla df_dict till  df2 med gemnsamma kolumner
for df_name in the_dict:
    the_dict[df_name] = the_dict[df_name].rename(
        columns={the_dict[df_name].columns[0]: 'Close'})
    # reset index in df_dict[df_name] but keep Date column
    the_dict[df_name] = the_dict[df_name].reset_index(drop=False)
    # add Ticker column to df_dict[df_name]
    the_dict[df_name]['Ticker'] = df_name

df = pd.concat(the_dict.values(), axis=0)
df.sort_values(by=['Date'], inplace=True)
df.set_index('Date', inplace=True)


In [11]:
display(df)
display(df.columns)


Unnamed: 0_level_0,Close,Ratio_2,Trend_2,Ratio_5,Trend_5,Ratio_30,Trend_30,Ratio_60,Trend_60,Ratio_90,...,GLD_Ratio_2,GLD_Ratio_5,GLD_Ratio_30,GLD_Ratio_60,GLD_Ratio_90,GLD_Ratio_250,diff,before_kvot,before_up,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-05-28,171.599838,0.945128,171.599838,0.945128,171.599838,0.945128,171.599838,0.945128,171.599838,0.945128,...,1.001882,1.004358,1.033973,1.059486,1.074311,1.039710,0.000000,,0,df_GNO-USD
2021-05-28,3.300058,0.903618,3.300058,0.903618,3.300058,0.903618,3.300058,0.903618,3.300058,0.903618,...,1.001882,1.004358,1.033973,1.059486,1.074311,1.039710,0.000000,,0,df_KAVA-USD
2021-05-28,0.199745,0.930932,0.199745,0.930932,0.199745,0.930932,0.199745,0.930932,0.199745,0.930932,...,1.001882,1.004358,1.033973,1.059486,1.074311,1.039710,0.000000,,0,df_BLZ-USD
2021-05-28,54.487900,0.932094,54.487900,0.932094,54.487900,0.932094,54.487900,0.932094,54.487900,0.932094,...,1.001882,1.004358,1.033973,1.059486,1.074311,1.039710,0.000000,,0,df_NEO-USD
2021-05-28,1.511987,0.957576,1.511987,0.957576,1.511987,0.957576,1.511987,0.957576,1.511987,0.957576,...,1.001882,1.004358,1.033973,1.059486,1.074311,1.039710,0.000000,,0,df_ADA-USD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-27,0.211305,1.003275,0.210615,0.997733,0.211785,0.954219,0.221442,0.861930,0.245153,0.834507,...,1.000000,0.996892,0.976203,0.975750,0.990604,1.054126,0.043568,1.006572,1,df_WAN-USD
2023-05-27,1.238432,1.004255,1.233185,0.991687,1.248813,1.249386,0.991232,1.453656,0.851943,1.706434,...,1.000000,0.996892,0.976203,0.975750,0.990604,1.054126,0.754053,1.008546,1,df_TOMO-USD
2023-05-27,0.009013,1.002290,0.008993,1.003445,0.008982,1.068430,0.008436,1.050648,0.008579,0.866319,...,1.000000,0.996892,0.976203,0.975750,0.990604,1.054126,0.016671,1.004590,1,df_BTCDOWN-USD
2023-05-27,10.517837,0.999208,10.526172,1.002015,10.496685,0.966951,10.877316,0.943273,11.150366,0.923260,...,1.000000,0.996892,0.976203,0.975750,0.990604,1.054126,1.210031,0.998417,0,df_ATOM-USD


Index(['Close', 'Ratio_2', 'Trend_2', 'Ratio_5', 'Trend_5', 'Ratio_30',
       'Trend_30', 'Ratio_60', 'Trend_60', 'Ratio_90', 'Trend_90', 'Ratio_250',
       'Trend_250', 'Volume', 'vol_Ratio_2', 'vol_Ratio_5', 'vol_Ratio_30',
       'vol_Ratio_60', 'vol_Ratio_90', 'vol_Ratio_250', 'US_inflation',
       'SE_inflation', 'infl_Ratio_75', 'infl_Ratio_90', 'infl_Ratio_250',
       'GLD-USD', 'GLD_Ratio_2', 'GLD_Ratio_5', 'GLD_Ratio_30', 'GLD_Ratio_60',
       'GLD_Ratio_90', 'GLD_Ratio_250', 'diff', 'before_kvot', 'before_up',
       'Ticker'],
      dtype='object')

Compute the target column (y)

In [12]:
# set Tommorow's price to be Close next day
df['Tomorrow'] = df.groupby('Ticker')['Close'].shift(-1)

# compute the percentage change Kan inte användas i fit
df['Percentage Change'] = (df['Tomorrow'] - df['Close']) / df['Close'] * 100

df['y'] = (df['Tomorrow'] > df['Close']).astype(int)
df.dropna(inplace=True)

Create the train and test sets

In [13]:
# create train and test datasets from df
portion = 0.75
df_train = df.iloc[:int(df.shape[0]*portion)]
df_test = df.iloc[int(df.shape[0]*portion):]

# save as csv files
df_train.to_csv('df_train.csv')
df_test.to_csv('df_test.csv')

setup data for AutoGluon

In [14]:
df_train = TabularDataset('df_train.csv')
df_test = TabularDataset('df_test.csv')
df_train.columns

Index(['Date', 'Close', 'Ratio_2', 'Trend_2', 'Ratio_5', 'Trend_5', 'Ratio_30',
       'Trend_30', 'Ratio_60', 'Trend_60', 'Ratio_90', 'Trend_90', 'Ratio_250',
       'Trend_250', 'Volume', 'vol_Ratio_2', 'vol_Ratio_5', 'vol_Ratio_30',
       'vol_Ratio_60', 'vol_Ratio_90', 'vol_Ratio_250', 'US_inflation',
       'SE_inflation', 'infl_Ratio_75', 'infl_Ratio_90', 'infl_Ratio_250',
       'GLD-USD', 'GLD_Ratio_2', 'GLD_Ratio_5', 'GLD_Ratio_30', 'GLD_Ratio_60',
       'GLD_Ratio_90', 'GLD_Ratio_250', 'diff', 'before_kvot', 'before_up',
       'Ticker', 'Tomorrow', 'Percentage Change', 'y'],
      dtype='object')

In [15]:

df_test.sort_values(by=['Ticker','Date'])[['Date','Tomorrow','Close','before_kvot', 'before_up','Percentage Change','y','Ticker']].tail(10)

Unnamed: 0,Date,Tomorrow,Close,before_kvot,before_up,Percentage Change,y,Ticker
52675,2023-05-17,0.228651,0.232318,1.014006,1,-1.578439,0,df_ZRX-USD
52862,2023-05-18,0.228564,0.228651,0.984216,0,-0.038053,0,df_ZRX-USD
53406,2023-05-19,0.227959,0.228564,0.999619,0,-0.26469,0,df_ZRX-USD
53489,2023-05-20,0.220032,0.227959,0.997353,0,-3.47738,0,df_ZRX-USD
53855,2023-05-21,0.22047,0.220032,0.965226,0,0.199057,1,df_ZRX-USD
54231,2023-05-22,0.226772,0.22047,1.001991,1,2.858438,1,df_ZRX-USD
54459,2023-05-23,0.220976,0.226772,1.028584,1,-2.555871,0,df_ZRX-USD
54659,2023-05-24,0.217456,0.220976,0.974441,0,-1.592932,0,df_ZRX-USD
54913,2023-05-25,0.218585,0.217456,0.984071,0,0.519186,1,df_ZRX-USD
55226,2023-05-26,0.21967,0.218585,1.005192,1,0.496285,1,df_ZRX-USD


In [16]:
pred_cols = ['Ticker','Ratio_2', 'Trend_2', 'Ratio_5', 'Trend_5', 'Ratio_30',
       'Trend_30', 'Ratio_60', 'Trend_60', 'Ratio_90', 'Trend_90', 'Ratio_250',
       'Trend_250', 'Volume', 'vol_Ratio_2', 'vol_Ratio_5', 'vol_Ratio_30',
       'vol_Ratio_60', 'vol_Ratio_90', 'vol_Ratio_250', 'US_inflation',
       'SE_inflation', 'infl_Ratio_75', 'infl_Ratio_90', 'infl_Ratio_250',
       'GLD-USD', 'GLD_Ratio_2', 'GLD_Ratio_5', 'GLD_Ratio_30', 'GLD_Ratio_60',
       'GLD_Ratio_90', 'GLD_Ratio_250', 'diff', 'before_kvot', 'before_up']

# TODO: Ta bort Close    
# TODO: prova med interpolate ovan  
# TODO: Lär upp på enbart min lista 
#TODO: Ta med ticker 

In [17]:
target='y'
X_train=df_train[pred_cols]
print(df_train.shape,X_train.shape)

y_train=df_train[target]

X_test=df_test[pred_cols]
print(df_test.shape,X_test.shape)
y_test=df_test[target] 


(166315, 40) (166315, 35)
(55439, 40) (55439, 35)


In [18]:
predictor = TabularPredictor(label=target).fit(df_train[pred_cols+[target]])  # target skall vara med i df_train , presets='best_quality'

No path specified. Models will be saved in: "AutogluonModels\ag-20230527_151752\"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20230527_151752\"
AutoGluon Version:  0.7.0
Python Version:     3.10.11
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Train Data Rows:    166315
Train Data Columns: 35
Label Column: y
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1,

[1000]	valid_set's binary_error: 0.2076


	0.7964	 = Validation score   (accuracy)
	13.48s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: LightGBM ...
	0.79	 = Validation score   (accuracy)
	3.07s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: RandomForestGini ...
	0.806	 = Validation score   (accuracy)
	81.52s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: RandomForestEntr ...
	0.8056	 = Validation score   (accuracy)
	108.53s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: CatBoost ...
	0.8028	 = Validation score   (accuracy)
	200.53s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesGini ...
	0.8012	 = Validation score   (accuracy)
	19.53s	 = Training   runtime
	0.14s	 = Validation runtime
Fitting model: ExtraTreesEntr ...
	0.7976	 = Validation score   (accuracy)
	19.67s	 = Training   runtime
	0.15s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
		Import fastai failed. A quick tip is to install via `pip install auto

In [19]:
# predictor = TabularPredictor.load("AutogluonModels\ag-20230526_211500\")   
predictions = predictor.predict_proba(X_test)
predictions


Unnamed: 0,0,1
0,0.733650,0.266350
1,0.807870,0.192130
2,0.747284,0.252716
3,0.773455,0.226545
4,0.740612,0.259388
...,...,...
55434,0.609810,0.390190
55435,0.617597,0.382403
55436,0.704402,0.295598
55437,0.603542,0.396458


Validate

In [20]:
from sklearn.metrics import roc_auc_score

y_proba = predictor.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_proba.loc[:, 1])
print(f"ROC AUC: {roc_auc}")


ROC AUC: 0.5232884022071771


In [21]:
predictor.leaderboard(df_test, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,XGBoost,0.523909,0.8072,0.6582,0.042512,12.574973,0.6582,0.042512,12.574973,1,True,10
1,WeightedEnsemble_L2,0.519147,0.8092,14.733382,1.221176,467.794792,0.010999,0.005976,1.190693,2,True,12
2,RandomForestEntr,0.509623,0.8056,1.161555,0.128963,108.531076,1.161555,0.128963,108.531076,1,True,6
3,RandomForestGini,0.509136,0.806,1.11659,0.133522,81.524433,1.11659,0.133522,81.524433,1,True,5
4,LightGBM,0.506899,0.79,0.092519,0.017999,3.066472,0.092519,0.017999,3.066472,1,True,4
5,LightGBMLarge,0.505745,0.7956,0.355074,0.027005,7.380625,0.355074,0.027005,7.380625,1,True,11
6,ExtraTreesEntr,0.50221,0.7976,1.132465,0.154015,19.668132,1.132465,0.154015,19.668132,1,True,9
7,KNeighborsDist,0.499811,0.5008,8.322719,0.490109,0.317388,8.322719,0.490109,0.317388,1,True,2
8,KNeighborsUnif,0.499324,0.504,8.338981,0.657837,0.314569,8.338981,0.657837,0.314569,1,True,1
9,ExtraTreesGini,0.493714,0.8012,1.076513,0.138533,19.532881,1.076513,0.138533,19.532881,1,True,8


In [22]:
# Create a dataframe with two columns from y_proba.loc[:, 1] and y_test
df_pred = pd.DataFrame({'y_proba': y_proba.loc[:, 1], 'y_test': y_test})
# count how often the predicted value > 0.5 and the  actual value is 1
df_pred['correct_up'] = ((df_pred['y_proba'] > 0.5) & (df_pred['y_test'] == 1)).astype(int)
df_pred['correct_dn '] = ((df_pred['y_proba'] <= 0.5) & (df_pred['y_test'] == 0)).astype(int)
# What is the accuracy of the model?
    
print(df_pred['correct_up'].sum() / sum(df_pred['y_test']))
print(df_pred['correct_dn '].sum() / sum(df_pred['y_test']==0))
print((df_pred['correct_up'].sum()+df_pred['correct_dn '].sum()) / len(df_pred['y_test']))

df_pred

0.3069740921563736
0.7366133391774418
0.5191471707642634


Unnamed: 0,y_proba,y_test,correct_up,correct_dn
0,0.266350,1,0,0
1,0.192130,1,0,0
2,0.252716,1,0,0
3,0.226545,1,0,0
4,0.259388,0,0,1
...,...,...,...,...
55434,0.390190,1,0,0
55435,0.382403,1,0,0
55436,0.295598,0,0,1
55437,0.396458,0,0,1
