# Trading Strategies powered by Machine Learning - Classification

## Logistic Regression with scikit-learn - a simple Introduction (Part 1)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8" and later
# releases dropped the bare "seaborn" alias, so prefer the new name and fall
# back to the old one only on installations that predate the rename.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

In [None]:
# Study hours for the 20 observations, in ascending order (1.75 occurs twice).
hours = np.array(
    [
        0.50, 0.75, 1.00, 1.25, 1.50,
        1.75, 1.75, 2.00, 2.25, 2.50,
        2.75, 3.00, 3.25, 3.50, 4.00,
        4.25, 4.50, 4.75, 5.00, 5.50,
    ]
)

In [None]:
# Binary outcome per observation, aligned with `hours` by position: 1 = pass, 0 = fail.
success = np.array(
    [
        0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
    ]
)

In [None]:
# Raw data: the binary outcome plotted against the hours studied.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(hours, success)
ax.set_xlabel("Study Hours", fontsize=15)
ax.set_ylabel("Pass/Fail", fontsize=15)
ax.set_ylim(-0.2, 1.2)  # leave some room around the 0/1 levels
plt.show()

In [None]:
# Combine both arrays into one table, one row per observation.
data = pd.DataFrame(
    {
        "hours": hours,
        "success": success,
    }
)

In [None]:
data

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression model that also fits an intercept term.
lm = LinearRegression(fit_intercept= True)

In [None]:
# Fit on a one-column feature frame ("hours") against the 0/1 outcome.
lm.fit(data[["hours"]], data["success"])

In [None]:
lm.coef_

In [None]:
lm.intercept_

In [None]:
# Store the fitted values of the linear model next to the inputs and show the table.
data["pred"] = lm.predict(data[["hours"]])
data

In [None]:
# Data, the fitted regression line, and the 0/1 rule obtained by cutting the
# fitted value at 0.5.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(data["hours"], data["success"], label="Data")
ax.plot(data["hours"], data["pred"], color="red", label="Linear Regression")
ax.plot(
    data["hours"],
    np.where(data["pred"] > 0.5, 1, 0),
    linestyle="--",
    label="Classification Rule",
)
ax.legend(fontsize=13)
ax.set_yticks(np.arange(-0.2, 1.3, 0.1))
ax.set_ylim(-0.2, 1.2)
ax.set_xlabel("Study Hours", fontsize=15)
ax.set_ylabel("Pass/Fail", fontsize=15)
plt.show()

## Logistic Regression with scikit-learn - a simple Introduction (Part 2)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression classifier with the library's default settings;
# this rebinds `lm`, replacing the linear model used above.
lm = LogisticRegression()

In [None]:
data

In [None]:
# Fit the classifier on the single "hours" feature against the 0/1 outcome.
lm.fit(data[["hours"]], data["success"])

In [None]:
# Overwrite "pred" with the predicted class label per row and show the table.
data["pred"] = lm.predict(data[["hours"]])
data

In [None]:
# Data together with the predicted class per observation.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(data["hours"], data["success"], label="Data")
ax.plot(data["hours"], data["pred"], color="red", label="Classification")
ax.legend(fontsize=13)
ax.set_yticks(np.arange(-0.2, 1.3, 0.1))
ax.set_ylim(-0.2, 1.2)
ax.set_xlabel("Study Hours", fontsize=15)
ax.set_ylabel("Pass/Fail", fontsize=15)
plt.show()

In [None]:
# Class probabilities per row: column 0 belongs to class 0 (fail),
# column 1 to class 1 (pass).
proba = lm.predict_proba(data[["hours"]])
proba

In [None]:
# Predicted class plus the two class-probability curves it is derived from.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(data["hours"], data["success"], label="Data")
ax.plot(data["hours"], data["pred"], color="red", label="Classification")
ax.plot(data["hours"], proba[:, 0], "m--", label="Probability Fail")
ax.plot(data["hours"], proba[:, 1], "g--", label="Probability Pass")
ax.legend(fontsize=13)
ax.set_yticks(np.arange(-0.2, 1.3, 0.1))
ax.set_ylim(-0.2, 1.2)
ax.set_xlabel("Study Hours", fontsize=15)
ax.set_ylabel("Pass/Fail", fontsize=15)
plt.show()

## Getting and Preparing the Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8" and later
# releases dropped the bare "seaborn" alias, so prefer the new name and fall
# back to the old one only on installations that predate the rename.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

In [None]:
# Load five_minute.csv; the "time" column is parsed as datetimes and used as the index.
data = pd.read_csv(
    "five_minute.csv",
    parse_dates=["time"],
    index_col="time",
)

In [None]:
# Log return of each row relative to the previous row.
# The price series is selected explicitly: assigning the result of a
# whole-frame division to a single column only works while the frame has
# exactly one column, so the whole-frame form raises when this cell is re-run
# (after "returns" has been added) or when the file has extra columns.
# NOTE(review): assumes the first non-index column of the CSV holds the price.
data["returns"] = np.log(data.iloc[:, 0] / data.iloc[:, 0].shift(1))

In [None]:
# Remove rows containing NaN in place - at least the first row, whose return
# is undefined because it has no predecessor.
data.dropna(inplace = True)

In [None]:
# Classification target: np.sign maps each return to -1.0, 0.0 or +1.0, so
# rows with an exactly zero return form a third class of their own.
data["direction"] = np.sign(data.returns)

In [None]:
data

In [None]:
data.direction.value_counts()

In [None]:
# Number of lagged-return columns to build as model features.
lags = 5

In [None]:
# One feature column per lag: "lagK" holds the return observed K rows earlier.
cols = [f"lag{lag}" for lag in range(1, lags + 1)]
for lag, col in enumerate(cols, start=1):
    data[col] = data.returns.shift(lag)
# Rows at the start lack a full lag history (NaN features) and are removed.
data.dropna(inplace=True)

In [None]:
data

## Predicting Market Direction with Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# The direction target can take up to three values (-1, 0, +1) and is handled
# one-vs-rest. LogisticRegression's `multi_class="ovr"` argument is deprecated
# since scikit-learn 1.5, so the same scheme is expressed with the dedicated
# OneVsRestClassifier wrapper, which fits one binary LogisticRegression per
# class (available afterwards as `lm.estimators_`).
# C is the inverse regularisation strength, so C = 1e6 makes the penalty
# negligible; max_iter is presumably raised so the solver can still converge.
from sklearn.multiclass import OneVsRestClassifier

lm = OneVsRestClassifier(LogisticRegression(C = 1e6, max_iter = 100000))

In [None]:
# Fit on the lagged returns (features) against the sign of the current return (target).
lm.fit(data[cols], data["direction"])

In [None]:
# In-sample prediction: the model scores the same rows it was fitted on.
data["pred"] = lm.predict(data[cols])

In [None]:
data

In [None]:
data.pred.value_counts()

In [None]:
# Sign of direction * pred per row, then counted: +1 when both have the same
# non-zero sign (hit), -1 when the signs are opposite (miss), 0 when either
# value is 0.
hits = np.sign(data.direction * data.pred).value_counts()

In [None]:
hits

In [None]:
# Share of rows counted as hits (label 1.0) among all counted rows.
hit_ratio = hits.loc[1.0] / hits.sum()
hit_ratio

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Cross-check with scikit-learn: fraction of rows where pred equals direction.
# NOTE(review): this can differ slightly from hit_ratio, because a row where
# both values are 0 counts as correct here but not as a hit there.
accuracy_score(y_true = data.direction, y_pred = data.pred)

## In-Sample Backtesting and the Look-Ahead Bias

In [None]:
data

In [None]:
# Per-row strategy log return: the predicted direction acts as the position
# (+1, -1 or 0) and is multiplied by that row's log return.
data["strategy"] = data.pred * data.returns

In [None]:
# Log returns are additive, so exponentiating the running sum gives the
# cumulative growth factor of the instrument and of the strategy.
data["creturns"] = np.exp(data["returns"].cumsum())
data["cstrategy"] = np.exp(data["strategy"].cumsum())

In [None]:
# Compare the instrument's cumulative performance (creturns) with the
# strategy's (cstrategy).
data[["creturns", "cstrategy"]].plot(figsize = (12 , 8))
plt.show()

In [None]:
# Absolute change of the predicted position between consecutive rows:
# 0 = unchanged, 2 = flip between +1 and -1 (1 when a 0 prediction is involved).
# The first row has no predecessor, so its NaN difference is set to 0.
data["trades"] = data.pred.diff().fillna(0).abs()

In [None]:
data.trades.value_counts()

In [None]:
data

## Out-of-Sample Forward Testing

In [None]:
# Load test_set.csv; the "time" column is parsed as datetimes and used as the index.
data = pd.read_csv(
    "test_set.csv",
    parse_dates=["time"],
    index_col="time",
)

In [None]:
data

In [None]:
# Log return of each row relative to the previous row.
# The price series is selected explicitly: assigning the result of a
# whole-frame division to a single column only works while the frame has
# exactly one column, so the whole-frame form raises when this cell is re-run
# (after "returns" has been added) or when the file has extra columns.
# NOTE(review): assumes the first non-index column of the CSV holds the price.
data["returns"] = np.log(data.iloc[:, 0] / data.iloc[:, 0].shift(1))

In [None]:
# Sign of each return (-1.0, 0.0 or +1.0). The first row's return is NaN at
# this point, so its direction is NaN as well; that row is removed by the
# dropna that follows the lag-column construction.
data["direction"] = np.sign(data.returns)

In [None]:
data

In [None]:
# Number of lagged-return columns to build. `lm` is reused for prediction on
# this data, so this should equal the number of lag features it was fitted with.
lags = 5

In [None]:
# One feature column per lag: "lagK" holds the return observed K rows earlier.
cols = [f"lag{lag}" for lag in range(1, lags + 1)]
for lag, col in enumerate(cols, start=1):
    data[col] = data.returns.shift(lag)
# Rows at the start lack a full return/lag history (NaN values) and are removed.
data.dropna(inplace=True)

In [None]:
data

In [None]:
# Out-of-sample prediction: `lm` is the already-fitted model; it is not
# refitted on this data set.
data["pred"] = lm.predict(data[cols])

In [None]:
data

In [None]:
data.pred.value_counts()

In [None]:
# Sign of direction * pred per row, then counted: +1 when both have the same
# non-zero sign (hit), -1 when the signs are opposite (miss), 0 when either
# value is 0.
hits = np.sign(data.direction * data.pred).value_counts()

In [None]:
hits

In [None]:
# Share of rows counted as hits (label 1.0) among all counted rows.
hit_ratio = hits.loc[1.0] / hits.sum()
hit_ratio

In [None]:
# Per-row strategy log return: the predicted direction acts as the position
# (+1, -1 or 0) and is multiplied by that row's log return.
data["strategy"] = data.pred * data.returns

In [None]:
# Log returns are additive, so exponentiating the running sum gives the
# cumulative growth factor of the instrument and of the strategy.
data["creturns"] = np.exp(data["returns"].cumsum())
data["cstrategy"] = np.exp(data["strategy"].cumsum())

In [None]:
# Compare the instrument's cumulative performance (creturns) with the
# strategy's (cstrategy) on the test data.
data[["creturns", "cstrategy"]].plot(figsize = (12 , 8))
plt.show()

In [None]:
# Absolute change of the predicted position between consecutive rows:
# 0 = unchanged, 2 = flip between +1 and -1 (1 when a 0 prediction is involved).
# The first row has no predecessor, so its NaN difference is set to 0.
data["trades"] = data.pred.diff().fillna(0).abs()

In [None]:
data.trades.value_counts()

In [None]:
data