In [2]:
import findspark
import pandas as pd
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkConf

# for shared metastore (shared across all users)
spark = SparkSession.builder.appName("Building dataset").config("hive.metastore.uris", "thrift://bialobog:9083", conf=SparkConf()).getOrCreate() \

# for local metastore (your private, invidivual database) add the following config to spark session
spark.sql("USE 2023_04_01")

DataFrame[]

In [6]:
# import pyspark.pandas as ps
# from pyspark.sql.functions import lit
import pandas as pd
from fredapi import Fred

def get_macro_features():
    fred_key = 'bdfdde3b7a21b7d528011d17996b0b8e'
    fred = Fred(api_key=fred_key)
    cpi = fred.get_series(series_id='CPIAUCSL')
    cpi_change = cpi.pct_change()
    unemp = fred.get_series(series_id='UNRATE')
    gdp = fred.get_series(series_id='GDP')
    gdp_change = gdp.pct_change()
    df = pd.DataFrame({'CPI_change': cpi_change,'Unemployment_Rate': unemp,'GDP_change': gdp_change})
    df.to_csv('macro.csv')

def get_all_stocks():
    query = f"""SELECT s.ticker_region, sc.fref_listing_exchange FROM sym_ticker_region s 
                LEFT JOIN FF_SEC_COVERAGE c ON c.fsym_id = s.fsym_id
                LEFT JOIN sym_coverage sc ON sc.fsym_id = s.fsym_id
                WHERE s.ticker_region LIKE "%-US" AND s.ticker_region NOT LIKE '%.%' AND c.CURRENCY = "USD"
                AND (sc.fref_listing_exchange = "NAS" OR sc.fref_listing_exchange = "NYS")"""
    df = spark.sql(query)
    df = df.withColumn("ticker_region", regexp_replace("ticker_region", "-US$", ""))
    ticker_list = [row.ticker_region for row in df.collect()]
    return ticker_list

def get_stocks_query():
    query = f"""SELECT s.ticker_region, s.fsym_id FROM sym_ticker_region s 
                LEFT JOIN FF_SEC_COVERAGE c ON c.fsym_id = s.fsym_id
                LEFT JOIN sym_coverage sc ON sc.fsym_id = s.fsym_id
                WHERE s.ticker_region LIKE "%-US" AND s.ticker_region NOT LIKE '%.%' AND c.CURRENCY = "USD"
                AND (sc.fref_listing_exchange = "NAS" OR sc.fref_listing_exchange = "NYS")"""
    df = spark.sql(query)
    print("stocks obtained")
    return df


def get_implosion_df(filename):
    df = pd.read_csv(filename, index_col=False)
    df['Implosion_Date'] = pd.to_datetime(df['Implosion_Date'])
    return df

def get_features_for_imploded_stocks(df, big_string):
    df=spark.createDataFrame(df)
    df.createOrReplaceTempView("temp_table")
    # query = """SELECT t.Ticker, t.Implosion_Date, t.Implosion_Next_Year, a.date, a.ff_gross_inc, b.date, b.ff_gross_inc, c.date, c.ff_gross_inc
    #             FROM temp_table t 
    #             LEFT JOIN sym_ticker_region s ON s.ticker_region = CONCAT(t.Ticker, '-US')
    #             LEFT JOIN FF_BASIC_AF a ON s.fsym_id = a.fsym_id AND YEAR(a.date) = YEAR(t.Implosion_Date)-1
    #             LEFT JOIN FF_BASIC_AF b ON s.fsym_id = b.fsym_id AND YEAR(b.date) = YEAR(t.Implosion_Date)-2
    #             LEFT JOIN FF_BASIC_AF c ON s.fsym_id = c.fsym_id AND YEAR(c.date) = YEAR(t.Implosion_Date)-3
    #             ORDER BY t.Ticker, a.date
    # """
    query = f"""SELECT t.Ticker, a.date, {big_string}, t.Implosion_Next_Year FROM temp_table t
                    LEFT JOIN sym_ticker_region s ON s.ticker_region = CONCAT(t.Ticker, '-US')
                    LEFT JOIN FF_ADVANCED_AF a ON a.fsym_id = s.fsym_id AND YEAR(a.date) = t.Year
                    ORDER BY t.Ticker, a.date
    """
    df2 = spark.sql(query)
    return df2
    
    
def get_features_for_non_imploded(metric_string, metric_string2):
    df = get_stocks_query()
    df.createOrReplaceTempView("temp_table")
    query = f"""WITH RankedData AS (
    SELECT
        t.ticker_region,
        a.date,
        {metric_string},
        ROW_NUMBER() OVER (PARTITION BY t.ticker_region ORDER BY a.date DESC) AS row_num
    FROM
        temp_table t
        LEFT JOIN FF_ADVANCED_AF a ON a.fsym_id = t.fsym_id
)
SELECT
    ticker_region AS Ticker,
    date,
    {metric_string2}
FROM
    RankedData r
WHERE
    row_num <= 4
ORDER BY
    ticker_region,
    date"""
    new_df = spark.sql(query)
    return new_df


def create_non_imploded_ds():
    df_metrics = ps.DataFrame(spark.sql("SELECT * FROM FF_ADVANCED_AF LIMIT 10")) #get all the metrics
    cols = []
    for c in df_metrics.columns:
        if df_metrics[c].dtype=='float64':#get all the metrics we can calculate correlations with
            cols.append(c)
    metric_string = ', '.join('a.' + item for item in cols)
    metric_string2 = ', '.join('r.' + item for item in cols)
    df = get_features_for_non_imploded(metric_string, metric_string2)
    df = df.withColumn("Implosion_Next_Year", lit(0))
    return df

def create_imploded_df():
    df = get_implosion_df('imploded_stocks.csv')
    df = df.drop(df.columns[0], axis=1)
    df['Implosion_Year'] = df['Implosion_Date'].dt.year
    df['Implosion_Next_Year'] = 1
    additional_rows_1 = df.copy()
    additional_rows_1['Implosion_Year'] = df['Implosion_Year'] - 1
    additional_rows_1['Implosion_Next_Year'] = 0
    additional_rows_2 = df.copy()
    additional_rows_2['Implosion_Year'] = df['Implosion_Year'] - 2
    additional_rows_2['Implosion_Next_Year'] = 0
    additional_rows_3 = df.copy()
    additional_rows_3['Implosion_Year'] = df['Implosion_Year'] - 3
    additional_rows_3['Implosion_Next_Year'] = 0
    df = pd.concat([df, additional_rows_1, additional_rows_2, additional_rows_3])
    df = df.sort_values(by=['Ticker', 'Implosion_Year'])
    df = df.reset_index(drop=True)
    df =df.rename({'Implosion_Year' : 'Year'},axis=1)
    
    df_metrics = ps.DataFrame(spark.sql("SELECT * FROM FF_ADVANCED_AF LIMIT 10")) #get all the metrics
    cols = []
    for c in df_metrics.columns:
        if df_metrics[c].dtype=='float64':#get all the metrics we can calculate correlations with
            cols.append(c)
    metric_string = ', '.join('a.' + item for item in cols)
    
    df = get_features_for_imploded_stocks(df, metric_string)
    
    return df
    
    

def create_dataset():
    # df = get_implosion_df('imploded_stocks.csv')
    # df = df.drop(df.columns[0], axis=1)
    # df['Implosion_Year'] = df['Implosion_Date'].dt.year
    # df['Implosion_Next_Year'] = 1
    # get_features_for_imploded_stocks(df)
    #print(df.head())
    #df=spark.createDataFrame(df)
    #df.createOrReplaceTempView("temp_table")
    non_imp_df =create_non_imploded_ds().toPandas()
    imp_df = create_imploded_df().toPandas()
    result_df = pd.concat([non_imp_df,imp_df], ignore_index=True)
    result_df=result_df.sort_values(by=['Ticker','date'])
    print(result_df.isnull().sum())
    # result_df=pd.DataFrame(result_df)
    #result_df.to_csv('Advanced_AF_Dataset.csv', index=False)
    
    

    
#create_dataset()
get_macro_features()

                 CPI  Unemployment Rate       GDP
2023-06-01  0.001804                3.6       NaN
2023-07-01  0.001669                3.5  0.020712
2023-08-01  0.006312                3.8       NaN
2023-09-01  0.003957                3.8       NaN
2023-10-01  0.000449                3.9       NaN
