In [12]:
import findspark
import pandas as pd
findspark.init()

from pyspark.sql import SparkSession

# Session against the shared metastore (shared across all users).
spark = (
    SparkSession.builder
    .appName("List available databases and tables")
    .getOrCreate()
)

# For a local metastore (your private, individual database), add the following
# config to the Spark session builder:
# from pyspark import SparkConf
# .config("hive.metastore.uris", "thrift://bialobog:9083", conf=SparkConf()) \
# NOTE(review): Builder.config() appears to ignore the key/value pair when
# `conf=` is passed as well -- confirm before enabling the line above as written.

# Last expression of the cell: displays the databases visible to this session.
spark.catalog.listDatabases()

[Database(name='2022_10_22', catalog='spark_catalog', description='FactSet data version for the day', locationUri='hdfs://bialobog.cs.ucl.ac.uk:8020/user/hive/warehouse'),
 Database(name='2023_04_01', catalog='spark_catalog', description='FactSet data version for the day', locationUri='hdfs://bialobog.cs.ucl.ac.uk:8020/user/hive/warehouse'),
 Database(name='default', catalog='spark_catalog', description='Default Hive database', locationUri='hdfs://bialobog.cs.ucl.ac.uk:8020/user/hive/warehouse')]

In [7]:
# Print the first ten tables registered in the 2023_04_01 database.
(
    spark
    .sql("SHOW TABLES IN 2023_04_01")
    .show(10)
)

+----------+--------------------+-----------+
| namespace|           tableName|isTemporary|
+----------+--------------------+-----------+
|2023_04_01|  affiliate_type_map|      false|
|2023_04_01|     asset_class_map|      false|
|2023_04_01|      audit_type_map|      false|
|2023_04_01|ca_div_freq_qual_map|      false|
|2023_04_01|     ca_div_type_map|      false|
|2023_04_01|   ca_event_type_map|      false|
|2023_04_01| ce_audio_source_map|      false|
|2023_04_01|   ce_event_type_map|      false|
|2023_04_01|ce_fiscal_period_map|      false|
|2023_04_01|  ce_market_time_map|      false|
+----------+--------------------+-----------+
only showing top 10 rows



In [17]:
# Load both ticker lists straight into plain Python lists, so each name is
# only ever bound to a list (never to an intermediate DataFrame).
imploded_stocks = pd.read_csv('filtered_tickers.csv')['Ticker'].tolist()
sp500_stocks = pd.read_csv('spx500.csv', usecols=['Symbol'])['Symbol'].tolist()

In [19]:
import random

# Seed the global RNG so the shuffle below yields the same order for the
# same input lists.
random_seed = 42
random.seed(random_seed)

# Concatenate both ticker lists (a ticker present in both appears twice),
# then shuffle the combined list in place.
combined_stocks = [*imploded_stocks, *sp500_stocks]
random.shuffle(combined_stocks)


In [26]:
import yfinance as yf
from datetime import datetime

def pull_stock_monthly(stock_list, start_date='2009-01-01', end_date=None,
                       output_path='imploded_sp500.csv'):
    """Download month-end adjusted closes for each ticker and save them as one CSV.

    Each ticker is downloaded with ``yf.download``; its ``'Adj Close'`` column
    is resampled to monthly frequency keeping the last observation of each
    month, tagged with a ``'Ticker'`` column, and stacked row-wise with the
    other tickers.

    Parameters
    ----------
    stock_list : iterable of str
        Ticker symbols to download.
    start_date : str, default '2009-01-01'
        Start date passed to ``yf.download`` (``YYYY-MM-DD``).
    end_date : str or None, default None
        End date passed to ``yf.download``. ``None`` means "today", resolved
        each time the function is called.
    output_path : str, default 'imploded_sp500.csv'
        Where the combined frame is written as CSV.

    Returns
    -------
    pandas.DataFrame
        The combined monthly frame that was written to ``output_path``.

    Raises
    ------
    ValueError
        If no ticker produced any data (including an empty ``stock_list``);
        nothing is written in that case.
    """
    if end_date is None:
        # Resolve "today" per call: a default computed in the signature is
        # evaluated once at definition time and goes stale in a long-lived kernel.
        end_date = datetime.now().strftime('%Y-%m-%d')

    all_series = []
    for t in stock_list:
        df = yf.download(t, start=start_date, end=end_date, progress=False)
        if df.empty:
            # Nothing came back for this ticker: skip it.
            continue
        monthly_df = df['Adj Close'].resample('M').last().to_frame()
        monthly_df['Ticker'] = t
        all_series.append(monthly_df)

    if not all_series:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError('No price data was downloaded for any ticker; nothing to save')

    combined_df = pd.concat(all_series, axis=0)
    combined_df.to_csv(output_path)
    return combined_df
        

# Download monthly adjusted closes for the shuffled, combined ticker list and
# write them to imploded_sp500.csv (per-ticker download failures show up in the
# cell output below).
pull_stock_monthly(combined_stocks)


1 Failed download:
- DISCK: No data found for this date range, symbol may be delisted

1 Failed download:
- FISV: No data found for this date range, symbol may be delisted

1 Failed download:
- YVR: No data found for this date range, symbol may be delisted

1 Failed download:
- DRE: No data found for this date range, symbol may be delisted

1 Failed download:
- FB: No data found for this date range, symbol may be delisted

1 Failed download:
- FBHS: No data found for this date range, symbol may be delisted

1 Failed download:
- INFO: No data found for this date range, symbol may be delisted

1 Failed download:
- CTXS: No data found for this date range, symbol may be delisted

1 Failed download:
- NLSN: No data found for this date range, symbol may be delisted

1 Failed download:
- CFMS: No data found for this date range, symbol may be delisted

1 Failed download:
- PBCT: No data found for this date range, symbol may be delisted

1 Failed download:
- DISCA: No data found for this date 

In [10]:
def pull_fundamental(ticker, database='2023_04_01'):
    """Show and return the total-assets history (``ff_assets``) for one US ticker.

    Switches the session to ``database``, then selects ``ticker_region``,
    ``date`` and ``ff_assets`` from ``FF_BASIC_QF`` joined to
    ``sym_ticker_region`` on ``fsym_id``, filtered to ``"<ticker>-US"`` and
    ordered by date. The first 100 rows are printed untruncated.

    Parameters
    ----------
    ticker : str
        Ticker symbol without the region suffix, e.g. ``'ABEO'``. Only
        letters, digits, ``.``, ``_``, ``/`` and ``-`` are accepted, because
        the value is interpolated into the SQL text.
    database : str, default '2023_04_01'
        Database to ``USE`` before querying. Only letters, digits and ``_``
        are accepted.

    Returns
    -------
    DataFrame
        The Spark DataFrame produced by the query.

    Raises
    ------
    ValueError
        If ``ticker`` or ``database`` contains characters outside the allowed
        sets; raised before any statement is sent to Spark.
    """
    import re  # function-scope import keeps this cell runnable on its own

    # Allow-list validation: both values end up inside SQL text, so anything
    # that could close the string literal or inject SQL is rejected up front.
    if not isinstance(ticker, str) or not re.fullmatch(r"[A-Za-z0-9._/\-]+", ticker):
        raise ValueError(f"Invalid ticker: {ticker!r}")
    if not isinstance(database, str) or not re.fullmatch(r"[A-Za-z0-9_]+", database):
        raise ValueError(f"Invalid database name: {database!r}")

    # Side effect: changes the session's current database for later queries too.
    spark.sql(f"USE {database}")

    # The WHERE clause filters on a column of the right-hand table, so rows of
    # FF_BASIC_QF without a match are dropped and the LEFT JOIN behaves like
    # an inner join.
    query = f"""SELECT d.ticker_region, a.date, a.ff_assets 
                FROM FF_BASIC_QF a 
                LEFT JOIN sym_ticker_region d ON d.fsym_id = a.fsym_id 
                WHERE d.ticker_region = "{ticker}-US" 
                ORDER BY a.date"""

    df = spark.sql(query)

    # Print up to 100 rows without truncating cell contents.
    df.show(100, False)
    return df
    
# Smoke test: print the ff_assets history for a single ticker.
pull_fundamental('ABEO')

# TODO notes (presumably the planned pipeline -- confirm):
#   - get S&P 500 stocks
#   - get imploded stocks
#   - randomise order
#   - get monthly dates
#   - compute returns
    

+-------------+----------+---------+
|ticker_region|date      |ff_assets|
+-------------+----------+---------+
|ABEO-US      |1998-03-31|1.054    |
|ABEO-US      |1998-06-30|3.046    |
|ABEO-US      |1998-09-30|3.046    |
|ABEO-US      |1998-12-31|2.351    |
|ABEO-US      |1999-03-31|1.707    |
|ABEO-US      |1999-06-30|1.135    |
|ABEO-US      |1999-09-30|5.367    |
|ABEO-US      |1999-12-31|4.6      |
|ABEO-US      |2000-03-31|18.229   |
|ABEO-US      |2000-06-30|17.195   |
|ABEO-US      |2000-09-30|31.638   |
|ABEO-US      |2000-12-31|31.571   |
|ABEO-US      |2001-03-31|30.32    |
|ABEO-US      |2001-06-30|28.851   |
|ABEO-US      |2001-09-30|27.498   |
|ABEO-US      |2001-12-31|26.532   |
|ABEO-US      |2002-03-31|26.068   |
|ABEO-US      |2002-06-30|23.927   |
|ABEO-US      |2002-09-30|21.076   |
|ABEO-US      |2002-12-31|20.532   |
|ABEO-US      |2003-03-31|17.788   |
|ABEO-US      |2003-06-30|17.385   |
|ABEO-US      |2003-09-30|14.074   |
|ABEO-US      |2003-12-31|12.856   |
|