In [430]:
import pandas as pd
import numpy as np


In [431]:
amazon_price = pd.read_csv("AMZN.csv")
amazon_eps = pd.read_csv("earnings_latest.csv").groupby("symbol").get_group("AMZN")
amazon_eps["date"] = pd.to_datetime(amazon_eps["date"])
amazon_price["Date"] = pd.to_datetime(amazon_price["Date"])
start_date = "2009-07-23"
end_date = "2021-04-29"
amazon_price = amazon_price[(amazon_price["Date"] >= start_date) & (amazon_price["Date"] <= end_date)]
amazon_price["election_season"] = (
    ((amazon_price["Date"].dt.year % 4) == 0) & 
    (amazon_price["Date"].dt.month > 7) & 
    (amazon_price["Date"].dt.month < 12)
).astype(int)

In [432]:
eps_values = []
eps_est_values = []

for date in amazon_price["Date"]:
    relevant_eps = amazon_eps[amazon_eps["date"] <= date]
    relevant_eps_est = amazon_eps[amazon_eps["date"] <= date]
    if not relevant_eps.empty:
        eps_values.append(relevant_eps.iloc[-1]["eps"])
        eps_est_values.append(relevant_eps_est.iloc[-1]["eps_est"])
    else:
        eps_values.append(None)
        eps_est_values.append(None)

amazon_price["eps"] = eps_values
amazon_price["eps_est"] = eps_est_values

In [433]:
amazon_price["earnings_beat"] = amazon_price["eps"] > amazon_price["eps_est"]
amazon_price["earnings_beat"] = amazon_price["earnings_beat"].astype(int)

In [434]:
release_times = []
for date in amazon_price["Date"]:
    relevant_eps = amazon_eps[amazon_eps["date"] <= date]
    if not relevant_eps.empty:
        latest_release = relevant_eps.iloc[-1]
        if latest_release["release_time"] == 0: 
            release_times.append((latest_release["date"] - pd.Timedelta(days=1)).date())
        else:  
            release_times.append(latest_release["date"].date())
    else:
        release_times.append(None)
release_times_df = pd.DataFrame({"Date": amazon_price["Date"], "release_time_date": release_times})
amazon_price = amazon_price.merge(release_times_df, on="Date", how="left")

amazon_price["release_time"] = (amazon_price["Date"].dt.date == amazon_price["release_time_date"]).astype(int)

amazon_price.drop(columns=["release_time_date"], inplace=True)


In [435]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score


In [436]:
amazon_price["Tomorrow"] = amazon_price["Close"].shift(-1)
amazon_price["Target"] = (amazon_price["Tomorrow"] > amazon_price["Close"]).astype(int)

In [437]:
model = RandomForestClassifier(n_estimators=100, min_samples_split=100, random_state=1)
train = amazon_price.iloc[:-100]
test = amazon_price.iloc[-100:]
predictors = ["Open", "High", "Low", "Close",  "Adj Close", "Volume", "election_season", "eps", "eps_est","earnings_beat", "release_time"]

In [438]:
model.fit(train[predictors], train["Target"])

In [439]:
preds = model.predict(test[predictors])
preds = pd.Series(preds, index=test.index)

In [440]:
precision=precision_score(test["Target"],preds)
precision

np.float64(0.5909090909090909)

In [441]:
count_of_ones = amazon_price[amazon_price['Target'] == 1].shape[0]
ratio_of_ones = count_of_ones / amazon_price.shape[0]
print(f"If you just bought everyday, the positive percentage would be: {ratio_of_ones}")
print(f"My algo is {round((precision - ratio_of_ones)*100,2)}% better")

If you just bought everyday, the positive percentage would be: 0.5318933513331083
My algo is 5.9% better
