In [4]:
import findspark
import pandas as pd
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkConf

# for shared metastore (shared across all users)
spark = SparkSession.builder.appName("List available databases and tables").config("hive.metastore.uris", "thrift://bialobog:9083", conf=SparkConf()).getOrCreate() \

# for local metastore (your private, invidivual database) add the following config to spark session

spark.catalog.listDatabases()

[Database(name='2022_10_22', catalog='spark_catalog', description='FactSet data version for the day', locationUri='hdfs://bialobog.cs.ucl.ac.uk:8020/user/hive/warehouse'),
 Database(name='2023_04_01', catalog='spark_catalog', description='FactSet data version for the day', locationUri='hdfs://bialobog.cs.ucl.ac.uk:8020/user/hive/warehouse'),
 Database(name='default', catalog='spark_catalog', description='Default Hive database', locationUri='hdfs://bialobog.cs.ucl.ac.uk:8020/user/hive/warehouse')]

In [59]:
from pyspark.sql.functions import regexp_replace

spark.sql("USE 2023_04_01")
    # Assuming that 'ticker' is a valid Python variable

query = f"""SELECT ticker_region FROM sym_ticker_region WHERE ticker_region LIKE "%-US" AND ticker_region NOT LIKE '%.%' """
df = spark.sql(query)
df = df.withColumn("ticker_region", regexp_replace("ticker_region", "-US$", ""))
ticker_list = [row.ticker_region for row in df.collect()]
print(len(ticker_list))



155276


In [60]:
print(ticker_list[:100])

['AMTCQ', 'AACJWXX', 'FKLBDX', 'FCTFHX', 'FGLVJX', 'ESMT', 'SCEGX', 'ISSSF', 'WNGPF', 'MS5044160', 'SPROW', 'CUVATX', 'RBLTX', 'GIBBF', 'ENIC', 'FSSEX', 'JMPC', 'SFR', 'CCSZX', 'UBS4967480', 'EOBK', 'PDKFX', 'GWLLF', 'GECC', 'UBS5042838', 'VGRQX', 'FLMFX', 'EMCMF', 'PTICX', 'CEUHIX', 'IAPCX', 'LSAVX', 'AZNAX', 'BGGNF', 'PKTEX', 'FSFSX', 'ERND20', 'AVCWQ', 'TSBK', 'KIIIU', 'GMOXX', 'PRFU', 'FOCT', 'ACLLF', 'MPLD', 'ICMPX', 'RBSPF', 'OSIS', 'USWG', 'PSCD', 'PROS38', 'LSCVX', 'MDMXF', 'EGSIX', 'EIVDX', 'EGPCU', 'PRCS', 'FDPXX', 'ACGBF', 'SUSMF', 'AGLFQ', 'MGAWF', 'APWD', 'WELL78', 'ANORX', 'GNWTP', 'IRBGY', 'RCACX', 'ISPISX', 'NDGPY', 'UBQPBX', 'SBROF', 'OILFF', 'PGVTX', 'EFRMF', 'ECBDX', 'CNACX', 'JPYOZ', 'SSHZ', 'FAUIX', 'PKRDQ', 'AICIX', 'XCADX', 'BZFDW', 'SIVVU', 'PDVG', 'FSUVX', 'SWNM', 'AACPRXX', 'SPSS', 'WFHRX', 'MBGCF', 'BDH', 'YYCIV', 'HYNLZ', 'THLIX', 'FHNEAX', 'YQUFF', 'BIIVF', 'BYL923']


In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error
from datetime import datetime
from sklearn.linear_model import LinearRegression
from scipy.stats import norm
from pyspark.sql import Row
from pyspark.sql.functions import col, to_date, lit
from datetime import timedelta
from pyspark.sql.types import StructType, StructField, StringType, DateType
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
import os
import csv

imploded_stocks = pd.read_csv('filtered_tickers.csv')
imploded_stocks = imploded_stocks['Ticker'].tolist()

sp500_stocks = pd.read_csv('constituents.csv', usecols=['Symbol'])
sp500_stocks = sp500_stocks['Symbol'].tolist()

start_date = '2009-01-01'
end_date = datetime.now().strftime('%Y-%m-%d')

def get_stock_price_weekly(ticker):
    # Suppress the progress message from yfinance
    temp_df = yf.download(ticker, start=start_date, end=end_date, progress=False)
    if temp_df.empty:
        #print("No data available for the specified date range.")
        return None
    weekly_data = temp_df['Adj Close'].resample('W').last()
    return weekly_data

def check_implosion(idx, firm_price, imp_thresh):
    i = idx
    start_price=firm_price.iloc[idx]
    i+=1
    period=0
    while i < len(firm_price):
        current_date = firm_price.index[i]
        current_price = firm_price.iloc[i]
        if (current_price-start_price)/start_price > -imp_thresh/2:
            return period
        period+=1
        i+=1
    return period

def get_crash_dates(firm_price, price_drop_thresh=-0.6, period_thresh=52):
    crash_dates = []
    imp_dates = []
    i = 52
    while i < len(firm_price):
        current_date = firm_price.index[i]
        current_price = firm_price.iloc[i]
        prev_year_data = firm_price.iloc[i-52:i]
        if len(prev_year_data) != 0:
            mean_price = prev_year_data.mean()
            if (current_price - mean_price)/mean_price < price_drop_thresh:
                imp_dates.append(current_date)
        i+=1
    return imp_dates

def get_implosion_dates(firm_price, price_drop_thresh=-0.6, period_thresh=52):
    crash_dates = []
    imp_dates = []
    i = 52
    while i < len(firm_price):
        current_date = firm_price.index[i]
        current_price = firm_price.iloc[i]
        prev_year_data = firm_price.iloc[i-52:i]
        if len(prev_year_data) != 0:
            mean_price = prev_year_data.mean()
            if (current_price - mean_price)/mean_price < price_drop_thresh:
                imp_period = check_implosion(i, firm_price,  price_drop_thresh)
                if imp_period > period_thresh:
                    imp_dates.append((current_date, firm_price.index[i+imp_period]))
                i+=imp_period
        i+=1
    return imp_dates

def plot_implosions(stock_series, imp_dates, ticker):
    plt.figure(figsize=(15, 5))
    plt.plot(stock_series.index, stock_series, label=ticker)
    for i in imp_dates:
        plt.axvspan(i[0], i[1], alpha=0.5, color='blue')
    plt.legend()
    plt.show()

def run_imps(stocks_list):
    num_imp = 0
    j = 0
    for t in stocks_list:
        stock_series = get_stock_price_weekly(t)
        if stock_series is not None:
            imp_dates = get_implosion_dates(stock_series)
            # if j % 10 == 0:
            #     plot_implosions(stock_series, imp_dates, t)    
            j+=1
            if len(imp_dates) >= 1:
                num_imp+=1
            # if len(imp_dates) ==0:
            #     plot_implosions(stock_series, imp_dates, t)
    print(f"{num_imp} out of {j} imploded")
    return num_imp

def plot_crashes(ticker):
    stock_series = get_stock_price_weekly(ticker)
    crash_dates = get_crash_dates(stock_series)
    plt.figure(figsize=(15, 5))
    plt.plot(stock_series.index, stock_series, label=ticker)
    for c in crash_dates:
        plt.axvspan(c,c, alpha=0.5, color='blue')
    plt.legend()
    plt.show()

def create_imploded_df(ticker_list):
    schema = StructType([StructField("Ticker", StringType(), True),
                     StructField("Implosion_Date", StringType(), True)])
    df = spark.createDataFrame([], schema=schema)
    for t in ticker_list:
        stock_series = get_stock_price_weekly(t)
        if stock_series is not None:
            imp_dates = get_implosion_dates(stock_series)
            if len(imp_dates)!=0:
                for date in imp_dates:
                    
                    date_str = pd.to_datetime(date[0]).strftime('%Y-%m-%d')
                    #date = to_date(date[0],'yyyy-MM-dd')
                    row = Row(Ticker=t, Implosion_Date=date_str)
                    df = df.union(spark.createDataFrame([row],schema=schema))
            # if len(imp_dates)!= 0: 
            #     for d in imp_dates:
            #         date = d[0]
            #         df = df.withColumn('Imploded',when((col('Ticker') == t) & (year('Date') == date.year) & (month('Date') == date.month),1
            #             ).otherwise(col('Imploded'))) 
    print(df.show(10))           
    df.toPandas().to_csv('imploded_tickers_dates.csv', index='False')
    

create_imploded_df(ticker_list)
#run_imps(['A'])
#add_labels_to_df('imploded_only.csv')
#plot_crashes('SEAC')
#APPN,CPS, FOSL, GRPN,PRLB, SEAC
#APPN has not imploded
#CPS has not imploded
#imploded: 377/433, sp500:  russell: 243/1754 imploded


1 Failed download:
- AMTCQ: No data found for this date range, symbol may be delisted

1 Failed download:
- FKLBDX: No data found for this date range, symbol may be delisted

1 Failed download:
- FCTFHX: No data found for this date range, symbol may be delisted

1 Failed download:
- FGLVJX: No data found for this date range, symbol may be delisted

1 Failed download:
- MS5044160: No data found for this date range, symbol may be delisted

1 Failed download:
- SPROW: No data found for this date range, symbol may be delisted

1 Failed download:
- CUVATX: No data found for this date range, symbol may be delisted

1 Failed download:
- JMPC: No data found for this date range, symbol may be delisted

1 Failed download:
- UBS4967480: No data found for this date range, symbol may be delisted

1 Failed download:
- EOBK: No data found for this date range, symbol may be delisted

1 Failed download:
- UBS5042838: No data found for this date range, symbol may be delisted

1 Failed download:
- PTICX

  if (current_price - mean_price)/mean_price < price_drop_thresh:
  if (current_price - mean_price)/mean_price < price_drop_thresh:



1 Failed download:
- SZQ220: No data found for this date range, symbol may be delisted

1 Failed download:
- KTXAX: No data found for this date range, symbol may be delisted

1 Failed download:
- FBESTX: No data found for this date range, symbol may be delisted

1 Failed download:
- JPMGX: No data found for this date range, symbol may be delisted

1 Failed download:
- BLLMF: No data found for this date range, symbol may be delisted

1 Failed download:
- JLGUX: No data found for this date range, symbol may be delisted

1 Failed download:
- SDRDW: No data found for this date range, symbol may be delisted

1 Failed download:
- SKMT: No data found for this date range, symbol may be delisted

1 Failed download:
- NEGS: No data found for this date range, symbol may be delisted

1 Failed download:
- QUATU: No data found for this date range, symbol may be delisted

1 Failed download:
- VCGH: No data found for this date range, symbol may be delisted

1 Failed download:
- CWLTQ: No data found f

In [None]:
# import csv

# def save_weekly_prices():
#     filtered_tickers = []  # A list to store the ticker names

#     for ticker in russell_stocks:
#         t_df = get_stock_price_weekly(ticker)
#         if t_df is not None:
#             max_price = max(t_df)  # Replace 'price' with the actual column name
#             if max_price >= 100:
#                 filtered_tickers.append(ticker)

#     csv_file_name = 'filtered_russell_tickers.csv'

#     # Open the CSV file in write mode
#     with open(csv_file_name, mode='w', newline='') as file:
#         writer = csv.writer(file)  # Create a CSV writer object

#         # Write the ticker names to the CSV file
#         writer.writerow(['Ticker'])  # Write a header row
#         for ticker in filtered_tickers:
#             writer.writerow([ticker])

# save_weekly_prices()
