In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import os
import glob

# Location and extension of the per-ticker, per-year price files.
folder_path = './db/all/'
file_extension = '.csv'

# Collect every matching file, ordered by path in descending order.
search_pattern = os.path.join(folder_path, '*' + file_extension)
file_paths = sorted(glob.glob(search_pattern), reverse=True)
print(file_paths)

In [None]:
# Build one summary row per price file: annualised mean daily return ("profit")
# and annualised volatility of the close-to-close returns.
days_in_year = 252  # number of daily observations used to annualise
data_list = []
tickers_list = []

for file_path in file_paths:
    close_prices = pd.read_csv(file_path)['Close']

    # Close-to-close simple returns; the first row has no predecessor.
    daily_returns = close_prices.pct_change().dropna()

    # Mean daily return scaled linearly to a year, expressed in percent.
    profit = daily_returns.mean() * days_in_year * 100
    # Standard deviation scales with the square root of time, so annualise with
    # sqrt(days_in_year). Kept as a fraction (not multiplied by 100).
    volatility = daily_returns.std() * np.sqrt(days_in_year)

    # Use the base name rather than stripping `folder_path` textually: glob may
    # return a different separator than the one spelled in `folder_path`.
    file_name = os.path.basename(file_path)
    # The name is sliced as: 4-character year, one separator character, then
    # the ticker followed by the extension.
    year = file_name[0:4]
    ticker = file_name[5:]
    # Strip the extension only when it is a suffix, not anywhere in the name.
    if file_extension and ticker.endswith(file_extension):
        ticker = ticker[:-len(file_extension)]
    index = f'{ticker}-{year}'

    data_list.append({'index': index, 'year': year, 'ticker': ticker,
                      'profit': profit, 'volatility': volatility})

    tickers_list.append(ticker)

df = pd.DataFrame(data_list)

In [None]:
# Pair every ticker-year with the same ticker's profit from its preceding row.
# The frame is rebuilt from `data_list` so that re-running this cell gives the
# same result instead of stripping one more year per ticker each time.
df = pd.DataFrame(data_list).dropna()
df = df.sort_values(by=['ticker', 'year'])

# Shift within each ticker so a value never leaks from one ticker into the
# next. NOTE(review): "previous" means the previous available row for that
# ticker; if a year is missing, the value comes from an earlier, non-adjacent year.
df['previous_profit'] = df.groupby('ticker')['profit'].shift(periods=1)
df['profit_ratio'] = df['profit'] / df['previous_profit']

# The first year of each ticker has no predecessor, so it cannot be used.
df = df[df['previous_profit'].notna()]

print(df.to_markdown())

In [None]:
# Rank all rows from the highest previous-year profit to the lowest and show
# the result as a markdown table.
df = df.sort_values('previous_profit', ascending=False)
ranked_table = df.to_markdown()
print(ranked_table)

In [None]:
# How did rows perform in the year after a previous-year profit of at least
# `threshold_pct`?
threshold_pct = 60
is_big_previous = df['previous_profit'] >= threshold_pct
big_profit_df = df[is_big_previous].sort_values(by=['year'])

# Derive the number of years from the data instead of hard-coding it, so the
# per-year rate stays correct when year files are added or removed.
n_values = len(big_profit_df)
n_years = df['year'].nunique()
values_per_year = n_values / n_years if n_years else float('nan')

print(f'number of values: {n_values} i.e. {values_per_year}/yr')
print(f'mean: {big_profit_df["profit"].mean()}')
print(f'median: {big_profit_df["profit"].median()}')
print(f'std dev: {big_profit_df["profit"].std()}')

# Per-year breakdown of the following-year profit.
print(big_profit_df.groupby('year')['profit'].agg(['mean', 'median']))

In [None]:
# For every year keep the `top_n` rows with the highest previous-year profit
# and summarise how they performed.
top_n = 12
# Sort + groupby().head() instead of groupby().apply(nlargest): apply() on a
# frame that still contains the grouping column is deprecated in recent pandas,
# and once the column is excluded the later groupby('year') would fail.
top_n_profit_df = (
    df.sort_values(by=['year', 'previous_profit'], ascending=[True, False])
    .groupby('year')
    .head(top_n)
    .reset_index(drop=True)
)

print(f'mean: {top_n_profit_df["profit"].mean()}')
print(f'median: {top_n_profit_df["profit"].median()}')
print(f'std dev: {top_n_profit_df["profit"].std()}')

# Per-year breakdown of the following-year profit.
print(top_n_profit_df.groupby('year')['profit'].agg(['mean', 'median']))

In [None]:
# Scatter of profit against previous-year profit for every row in `df`,
# coloured by volatility.
plt.rcParams["figure.figsize"] = (15, 15)

fig, ax = plt.subplots()
points = ax.scatter(x=df['previous_profit'], y=df['profit'], marker='2',
                    c=df['volatility'], cmap='brg')
# The colour channel carries data, so show its scale.
fig.colorbar(points, ax=ax, label='volatility')
ax.set_title('profit vs previous_profit (colour: volatility)')
ax.set_xlabel('previous_profit')
ax.set_ylabel('profit')
# Heavy zero lines split the plane into gain/loss quadrants.
ax.axhline(0, color='black', linewidth=2)
ax.axvline(0, color='black', linewidth=2)

ax.grid()
plt.show()

In [None]:
# Same scatter as the full-range plot, restricted to rows where both profit
# and previous_profit lie within [-limit1, limit1].
limit1 = 200
out_of_range = (df['profit'].abs() > limit1) | (df['previous_profit'].abs() > limit1)
df1 = df[~out_of_range]

plt.rcParams["figure.figsize"] = (15, 15)

fig, ax = plt.subplots()
points = ax.scatter(x=df1['previous_profit'], y=df1['profit'], marker='2',
                    c=df1['volatility'], cmap='brg')
# The colour channel carries data, so show its scale.
fig.colorbar(points, ax=ax, label='volatility')
ax.set_title(f'profit vs previous_profit, both within ±{limit1} (colour: volatility)')
ax.set_xlabel('previous_profit')
ax.set_ylabel('profit')
# Heavy zero lines split the plane into gain/loss quadrants.
ax.axhline(0, color='black', linewidth=2)
ax.axvline(0, color='black', linewidth=2)

ax.set_xlim([-limit1, limit1])
ax.set_ylim([-limit1, limit1])
ax.grid()
plt.show()