I keep coming back to a thought experiment: suppose we have the best machine learning model in the world, but we hand it an impossible mission--predicting the result of the next toss of a fair coin using ONLY the series of past results (i.e., a sequence of heads/tails). What will such machine learning models give us? Will they clearly tell us that no attempt can do better than a 50:50 guess? Or will they give us a false sense of confidence that they can magically beat a random guess?

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [2]:
# Seed NumPy's global RNG so a fresh "Restart Kernel -> Run All" reproduces the
# same coin tosses. scikit-learn also draws from this global state whenever
# random_state is left as None, so the later splits and model fits benefit too.
SEED = 42
np.random.seed(SEED)

# Simulated fair coin: 0 = tails, 1 = heads (high=2 is exclusive).
# Keep the series from getting too large--to mimic the limited history
# available in financial market data.
arr = np.random.randint(low=0, high=2, size=100_000)
print(arr)
print(len(arr))

[0 1 0 ... 1 1 0]
100000


In [3]:
def _build_windows(arr, col_count):
    """Turn a 1-D series into lag features and a boolean next-value target.

    Row ``i`` of the returned frame holds ``arr[i : i + col_count]`` in columns
    ``x0 .. x{col_count-1}``; the matching target is ``arr[i + col_count] > 0``.

    Raises:
        ValueError: if ``arr`` is not 1-D, ``col_count`` is < 1, or ``arr`` has
            too few elements to form a single window.
    """
    arr = np.asarray(arr)
    if arr.ndim != 1:
        raise ValueError(f"arr must be 1-D, got shape {arr.shape}")
    if col_count < 1:
        raise ValueError(f"col_count must be >= 1, got {col_count}")
    if arr.shape[0] <= col_count:
        raise ValueError(
            f"arr needs more than {col_count} elements to build one sample, "
            f"got {arr.shape[0]}"
        )

    # One vectorised pass instead of appending ~len(arr) rows one by one
    # (DataFrame.append was quadratic and no longer exists in pandas >= 2.0).
    # Each window is col_count features followed by the value to predict.
    # sliding_window_view requires numpy >= 1.20.
    windows = np.lib.stride_tricks.sliding_window_view(arr, col_count + 1)

    # Copy the feature slice: the window view is read-only and overlapping.
    X = pd.DataFrame(
        windows[:, :col_count].copy(),
        columns=[f'x{j}' for j in range(col_count)],
    )
    y = pd.Series(windows[:, col_count] > 0, name='y')
    return X, y


def _metrics_report(label, clf, X, y):
    """Format tpr/fpr/tnr/accuracy/balanced accuracy of ``clf`` on ``(X, y)``."""
    y_pred = clf.predict(X)
    # Pin the label order so the matrix is always 2x2, even if one class is
    # missing from this split; otherwise the 4-way unpack below would fail.
    tn, fp, fn, tp = confusion_matrix(
        y_true=y, y_pred=y_pred, labels=[False, True]
    ).ravel()
    accuracy = (tn + tp) / (tn + fp + fn + tp)
    tpr = tp / (tp + fn)
    fpr = fp / (fp + tn)
    tnr = tn / (tn + fp)
    balanced_accuracy = (tpr + tnr) / 2
    return (
        f"{label}(tpr={tpr*100:2.1f}%, fpr={fpr*100:2.1f}%, tnr={tnr*100:2.1f}%, "
        f"accuracy={accuracy*100:.1f}%, balanced_accuracy={balanced_accuracy*100:.1f}%)"
    )


def task(model, arr, col_count=128, random_state=None):
    """Fit ``model`` to predict the next value of ``arr`` from the previous ones.

    The series is cut into overlapping windows of ``col_count`` past values
    (features) plus the value that follows (target, as a boolean ``> 0``).
    The samples are split with ``train_test_split``, the model is fitted on the
    training part, and one line is printed with the class counts followed by
    the metrics on the training set and on the test set.

    Args:
        model: estimator exposing ``fit(X, y)`` (returning the fitted
            estimator) and ``predict(X)``.
        arr: 1-D array-like series of outcomes.
        col_count: number of past values used as features (default 128).
        random_state: forwarded to ``train_test_split``; ``None`` keeps the
            split unseeded, as before.

    Returns:
        None. The summary is written to stdout.

    Raises:
        ValueError: if ``arr`` is not 1-D or has ``col_count`` or fewer elements.
    """
    X, y = _build_windows(arr, col_count)
    pos = int(y.sum())
    neg = y.shape[0] - pos

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state
    )
    clf = model.fit(X_train, y_train)

    print(f"{pos=:,}, {neg=:,}", end="; ")
    print(_metrics_report("training set", clf, X_train, y_train), end="; ")
    print(_metrics_report("test set", clf, X_test, y_test))

In [4]:
# Random forest with default hyperparameters. A copy of the coin-toss series is
# passed so the shared `arr` cannot be modified by the call.
from sklearn.ensemble import RandomForestClassifier as rfc
task(rfc(), arr.copy())

pos=49,970, neg=49,902; training set(tpr=100.0%, fpr=0.0%, tnr=100.0%, accuracy=100.0%, balanced_accuracy=100.0%); test set(tpr=46.5%, fpr=47.1%, tnr=52.9%, accuracy=49.7%, balanced_accuracy=49.7%)


In [5]:
# Logistic regression with default hyperparameters, run on a copy of the same
# coin-toss series.
from sklearn.linear_model import LogisticRegression as lr
task(lr(), arr.copy())

pos=49,970, neg=49,902; training set(tpr=52.9%, fpr=49.5%, tnr=50.5%, accuracy=51.7%, balanced_accuracy=51.7%); test set(tpr=50.1%, fpr=50.8%, tnr=49.2%, accuracy=49.7%, balanced_accuracy=49.7%)


In [6]:
# Support vector classifier with default hyperparameters, run on a copy of the
# same coin-toss series.
from sklearn.svm import SVC
task(SVC(), arr.copy())

pos=49,970, neg=49,902; training set(tpr=83.9%, fpr=17.4%, tnr=82.6%, accuracy=83.3%, balanced_accuracy=83.2%); test set(tpr=51.0%, fpr=51.4%, tnr=48.6%, accuracy=49.8%, balanced_accuracy=49.8%)


In [7]:
# Multi-layer perceptron with five hidden layers of 400 units each (all other
# hyperparameters left at their defaults), run on a copy of the same series.
from sklearn.neural_network import MLPClassifier as mlpc
task(mlpc(hidden_layer_sizes=(400, 400, 400, 400, 400)), arr.copy())

pos=49,970, neg=49,902; training set(tpr=99.6%, fpr=0.3%, tnr=99.7%, accuracy=99.6%, balanced_accuracy=99.6%); test set(tpr=48.9%, fpr=48.7%, tnr=51.3%, accuracy=50.1%, balanced_accuracy=50.1%)
