In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import statsmodels.api as sm
from matplotlib.pyplot import subplots

from ISLP import confusion_table, load_data
from ISLP.models import (
    ModelSpec as MS,
    contrast,
    summarize,
)

from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis as LDA,
    QuadraticDiscriminantAnalysis as QDA,
)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler


In [None]:
# Load the ISLP 'Weekly' dataset into `df`. The bare `df` on the last line
# makes the notebook render the frame as this cell's output.
df = load_data('Weekly')
df

In [None]:
# Pairwise correlations between the remaining columns of `df`
# ("Year" and "Direction" are left out).
weekly_corr = df.drop(columns=["Year", "Direction"]).corr()

# Blank the self-correlations by position rather than by value: matching on
# `== 1` misses a diagonal entry that comes out as 0.9999999999999999 and
# would also hide a genuine off-diagonal correlation of exactly 1.
on_diagonal = np.eye(len(weekly_corr), dtype=bool)

# The diagonal becomes NaN -- presumably so the trivial 1.0 entries do not
# dominate the heatmap's colour scale.
px.imshow(
    weekly_corr.mask(on_diagonal),
    title="Weekly Correlation",
    height=600,
    text_auto=".2f",
)


In [None]:
# Line plot of the Lag1..Lag5 columns (a list of columns passed as `y`).
px.line(
    data_frame=df,
    y=[f"Lag{k}" for k in range(1, 6)],
)

In [None]:
# Histogram of the Lag1..Lag5 columns drawn on shared axes.
px.histogram(
    data_frame=df,
    x=[f"Lag{k}" for k in range(1, 6)],
)

In [None]:
# Same Lag1..Lag5 histogram, split into one facet column per value of
# "Direction".
px.histogram(
    data_frame=df,
    x=[f"Lag{k}" for k in range(1, 6)],
    facet_col="Direction",
)

In [None]:
# Line plot of the "Volume" column against the frame's index.
px.line(data_frame=df, y="Volume")

In [None]:
# Histogram of the "Volume" column.
px.histogram(data_frame=df, x="Volume")

In [None]:
# Logistic regression of Direction on all five lags plus Volume, fitted on
# every row of `df`.
predictors = ["Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume"]
response = "Direction"

# ModelSpec is given the *terms* (column names) to build; the data itself is
# supplied to fit_transform. Passing a DataFrame slice here relied on
# ModelSpec iterating it as column labels rather than on its documented input.
design = MS(predictors)
X = design.fit_transform(df)

# Boolean response: True where Direction is 'Up', False otherwise.
y = df[response] == 'Up'

model = sm.Logit(endog=y, exog=X)
results = model.fit()

# Last expression of the cell, so the coefficient table is rendered.
summarize(results)

In [None]:
# Fitted probabilities from the full-data logistic model; predict() with no
# argument scores the same design matrix the model was fitted on.
probabilities = results.predict()
print(f"{probabilities[:10]=}")

# Label an observation "Up" when its fitted probability exceeds one half.
above_half = probabilities > 0.5
predictions = np.where(above_half, "Up", "Down")
print(f"{predictions[:10]=}")

# Cross-tabulate the labels against the observed Direction and draw the
# table as an annotated heatmap.
full_data_confusion = confusion_table(predictions, df[response])
px.imshow(img=full_data_confusion, text_auto=True)


In [None]:
# Split by time: rows whose Year is before 2009 are used for training and
# every other row is held out for testing.
train_mask = df["Year"].lt(2009)
test_mask = ~train_mask

print(f"{train_mask.sum()=}, {test_mask.sum()=}")

# Apply the same boolean masks to the design matrix and to the response.
X_train, X_test = X.loc[train_mask], X.loc[test_mask]
y_train, y_test = y.loc[train_mask], y.loc[test_mask]

# Single predictor used by the models fitted in the following cells.
lag_2_predictor = "Lag2"


In [None]:
# Logistic regression on the training rows using only the intercept and Lag2,
# evaluated on the held-out rows.
lag_2_columns = ["intercept", lag_2_predictor]

lag_2_model = sm.Logit(endog=y_train, exog=X_train[lag_2_columns])
lag_2_results = lag_2_model.fit()

# A bare expression is only rendered when it is the last statement of a cell,
# so the coefficient table has to be displayed explicitly here. `display` is
# provided by the IPython/Jupyter kernel without an import.
display(summarize(lag_2_results))

lag_2_probabilities = lag_2_results.predict(X_test[lag_2_columns])
print(f"{lag_2_probabilities[:10]=}")

# Label a held-out row "Up" when its predicted probability exceeds one half.
lag_2_predictions = np.where(lag_2_probabilities > 0.5, "Up", "Down")
print(f"{lag_2_predictions[:10]=}")

px.imshow(confusion_table(lag_2_predictions, df.loc[test_mask, response]), text_auto=True)


In [None]:
# Linear discriminant analysis on Lag2 alone, fitted on the training rows and
# evaluated on the held-out rows.
lda = LDA(store_covariance=True)

lda.fit(y=y_train, X=X_train[[lag_2_predictor]])
print(f"""
{lda.means_=
}

{lda.classes_=
}

{lda.priors_=
}

{lda.scalings_=
}
""")

# `predict` returns hard class labels, not probabilities. Take the posterior
# of the positive class instead: `y_train` is boolean, so `classes_` is
# [False, True] and column 1 is P(Direction == 'Up').
lda_probabilities = lda.predict_proba(X_test[[lag_2_predictor]])[:, 1]
print(f"{lda_probabilities[:10]=}")

# Label a held-out row "Up" when its posterior probability exceeds one half.
lda_predictions = np.where(lda_probabilities > 0.5, "Up", "Down")
print(f"{lda_predictions[:10]=}")

px.imshow(confusion_table(lda_predictions, df.loc[test_mask, response]), text_auto=True)


In [None]:
# Quadratic discriminant analysis on Lag2 alone, fitted on the training rows
# and evaluated on the held-out rows.
qda = QDA(store_covariance=True)

qda.fit(y=y_train, X=X_train[[lag_2_predictor]])
print(f"""
{qda.means_=
}

{qda.classes_=
}

{qda.priors_=
}

{qda.scalings_=
}
""")

# `predict` returns hard class labels, not probabilities. Take the posterior
# of the positive class instead: `y_train` is boolean, so `classes_` is
# [False, True] and column 1 is P(Direction == 'Up').
qda_probabilities = qda.predict_proba(X_test[[lag_2_predictor]])[:, 1]
print(f"{qda_probabilities[:10]=}")

# Label a held-out row "Up" when its posterior probability exceeds one half.
qda_predictions = np.where(qda_probabilities > 0.5, "Up", "Down")
print(f"{qda_predictions[:10]=}")

px.imshow(confusion_table(qda_predictions, df.loc[test_mask, response]), text_auto=True)


In [None]:
# 1-nearest-neighbour classifier on Lag2 alone, fitted on the training rows
# and evaluated on the held-out rows.
knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(y=y_train, X=X_train[[lag_2_predictor]])

# `predict` returns hard class labels, not probabilities. Take the share of
# neighbours voting for the positive class instead: `y_train` is boolean, so
# `classes_` is [False, True] and column 1 is P(Direction == 'Up'). With a
# single neighbour this is always 0.0 or 1.0.
knn_probabilities = knn.predict_proba(X_test[[lag_2_predictor]])[:, 1]
print(f"{knn_probabilities[:10]=}")

# Label a held-out row "Up" when its estimated probability exceeds one half.
knn_predictions = np.where(knn_probabilities > 0.5, "Up", "Down")
print(f"{knn_predictions[:10]=}")

px.imshow(confusion_table(knn_predictions, df.loc[test_mask, response]), text_auto=True)


In [None]:
# Gaussian naive Bayes on Lag2 alone, fitted on the training rows and
# evaluated on the held-out rows.
nb = GaussianNB()

nb.fit(y=y_train, X=X_train[[lag_2_predictor]])
print(f"""
{nb.class_prior_=
}

{nb.classes_=
}

{nb.theta_=
}
""")

# `predict` returns hard class labels, not probabilities. Take the posterior
# of the positive class instead: `y_train` is boolean, so `classes_` is
# [False, True] and column 1 is P(Direction == 'Up').
nb_probabilities = nb.predict_proba(X_test[[lag_2_predictor]])[:, 1]
print(f"{nb_probabilities[:10]=}")

# Label a held-out row "Up" when its posterior probability exceeds one half.
nb_predictions = np.where(nb_probabilities > 0.5, "Up", "Down")
print(f"{nb_predictions[:10]=}")

px.imshow(confusion_table(nb_predictions, df.loc[test_mask, response]), text_auto=True)
