## Building the dataset that will be input into the model

In [1]:
import findspark
import pandas as pd
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkConf

# Shared metastore (shared across all users).
# Builder.config() only reads `conf` when it is supplied and ignores the
# key/value pair passed in the same call, so the SparkConf defaults and the
# metastore URI are applied in two separate config() calls.
spark = (
    SparkSession.builder
    .appName("Building dataset")
    .config(conf=SparkConf())
    .config("hive.metastore.uris", "thrift://bialobog:9083")
    .getOrCreate()
)

# For a local metastore (your private, individual database) a different
# config would be added to the session builder; it is not shown in this notebook.
# Select the database that every later query in this notebook runs against.
spark.sql("USE 2023_04_01")

DataFrame[]

In [7]:
import pyspark.pandas as ps
from pyspark.sql.functions import lit,col
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
#from boruta import BorutaPy
#from fredapi import Fred
from sklearn.linear_model import Lasso
from sklearn.model_selection import TimeSeriesSplit

def get_macro_features():
    """Load the macro-economic feature table from ``macro.csv``.

    The file is read from the current working directory with pandas'
    default CSV settings.

    Returns:
        pandas.DataFrame: the contents of ``macro.csv``.
    """
    # Disabled snippet, kept for reference, that wrote a 'macro.csv' from FRED
    # series. The API key that used to be hard-coded here has been redacted;
    # provide it through the environment instead of committing it.
    #   fred = Fred(api_key=os.environ['FRED_API_KEY'])
    #   cpi_change = fred.get_series(series_id='CPIAUCSL').pct_change()
    #   unemp = fred.get_series(series_id='UNRATE')
    #   gdp_change = fred.get_series(series_id='GDP').pct_change()
    #   pd.DataFrame({'CPI_change': cpi_change, 'Unemployment_Rate': unemp,
    #                 'GDP_change': gdp_change}).to_csv('macro.csv')
    # NOTE(review): create_dataset reads 'Date' and 'GDP' columns from this
    # file, which the snippet above would not produce -- confirm how the
    # current macro.csv is actually generated.
    return pd.read_csv('macro.csv')

def get_all_stocks():
    """Return the tickers of USD-denominated securities listed on NAS or NYS.

    Queries ``sym_ticker_region`` joined to ``FF_SEC_COVERAGE`` and
    ``sym_coverage`` through the module-level ``spark`` session, keeping rows
    whose ``ticker_region`` ends in ``-US``, contains no ``.``, has currency
    ``USD`` and is listed on ``NAS`` or ``NYS``.

    Returns:
        list[str]: ``ticker_region`` values with the trailing ``-US`` removed,
        one entry per row returned by the query.
    """
    # Imported here because the notebook's import cell only brings in `lit`
    # and `col`; without this the call below raises NameError.
    from pyspark.sql.functions import regexp_replace

    # Plain string: the query has no placeholders, so no f-string is needed.
    query = """SELECT s.ticker_region, sc.fref_listing_exchange FROM sym_ticker_region s
                LEFT JOIN FF_SEC_COVERAGE c ON c.fsym_id = s.fsym_id
                LEFT JOIN sym_coverage sc ON sc.fsym_id = s.fsym_id
                WHERE s.ticker_region LIKE "%-US" AND s.ticker_region NOT LIKE '%.%' AND c.CURRENCY = "USD"
                AND (sc.fref_listing_exchange = "NAS" OR sc.fref_listing_exchange = "NYS")"""
    df = spark.sql(query)
    # Strip the region suffix so callers get bare tickers.
    df = df.withColumn("ticker_region", regexp_replace("ticker_region", "-US$", ""))
    return [row.ticker_region for row in df.collect()]



def get_non_imp_stocks_query():
    """Select NAS/NYS USD tickers that are absent from ``imploded_stocks.csv``.

    The imploded-stock CSV is loaded with ``get_implosion_df`` and registered
    as the temp view ``imp_table``; the query then keeps ``-US`` tickers whose
    ``ticker_region`` does not equal any ``imp_table.Ticker`` with ``-US``
    appended.

    Returns:
        Spark DataFrame with columns ``ticker_region`` and ``fsym_id``.
    """
    imploded_pdf = get_implosion_df('imploded_stocks.csv')
    spark.createDataFrame(imploded_pdf).createOrReplaceTempView("imp_table")

    non_imploded_sql = """SELECT s.ticker_region, s.fsym_id FROM sym_ticker_region s 
                LEFT JOIN FF_SEC_COVERAGE c ON c.fsym_id = s.fsym_id
                LEFT JOIN sym_coverage sc ON sc.fsym_id = s.fsym_id
                WHERE s.ticker_region LIKE "%-US" AND s.ticker_region NOT LIKE '%.%' AND c.CURRENCY = "USD"
                AND (sc.fref_listing_exchange = "NAS" OR sc.fref_listing_exchange = "NYS")
                AND NOT EXISTS (
                SELECT 1
                FROM imp_table
                WHERE s.ticker_region = CONCAT(imp_table.Ticker, '-US') )    
                """
    non_imploded = spark.sql(non_imploded_sql)
    print("got non imploded stocks")
    return non_imploded


def get_implosion_df(filename):
    """Read an implosion CSV and parse its ``Implosion_Date`` column.

    Args:
        filename: path of the CSV file; it must contain an
            ``Implosion_Date`` column.

    Returns:
        pandas.DataFrame: the CSV contents (read with ``index_col=False``)
        with ``Implosion_Date`` converted by ``pd.to_datetime``.
    """
    return pd.read_csv(filename, index_col=False).assign(
        Implosion_Date=lambda frame: pd.to_datetime(frame['Implosion_Date'])
    )

def get_features_for_imploded_stocks(df, big_string, table):
    """Fetch feature rows for the imploded stocks listed in ``df``.

    ``df`` is registered as the temp view ``temp_table`` (replacing any
    existing view of that name). Each row is matched to ``sym_ticker_region``
    via ``Ticker`` + ``-US``, then left-joined to ``table`` and to
    ``FF_BASIC_AF`` on ``fsym_id`` and on the calendar year of their ``date``
    equalling the row's ``Year``.

    Args:
        df: pandas DataFrame; the query reads its ``Ticker``, ``Year`` and
            ``Implosion_Next_Year`` columns.
        big_string: comma-separated select-list fragment, interpolated
            verbatim into the query (the caller in this file prefixes each
            column with the alias ``a.``).
        table: name of the feature table aliased as ``a``; interpolated
            verbatim into the SQL.

    Returns:
        Spark DataFrame ordered by ``t.Ticker, a.date``.
    """
    # A disabled earlier query joined FF_BASIC_AF three times for the one, two
    # and three years before the implosion date; only the variant below runs.
    spark.createDataFrame(df).createOrReplaceTempView("temp_table")

    feature_sql = f"""SELECT t.Ticker, a.date, b.FF_PRICE_CLOSE_FP, {big_string}, t.Implosion_Next_Year FROM temp_table t
                    LEFT JOIN sym_ticker_region s ON s.ticker_region = CONCAT(t.Ticker, '-US')
                    LEFT JOIN {table} a ON a.fsym_id = s.fsym_id AND YEAR(a.date) = t.Year
                    LEFT JOIN FF_BASIC_AF b ON b.fsym_id = s.fsym_id AND YEAR(b.date) = t.Year
                    ORDER BY t.Ticker, a.date
    """
    imploded_features = spark.sql(feature_sql)
    print("imploded query done")
    return imploded_features
    
    
def get_features_for_non_imploded(metric_string, metric_string2, table, cutoff_year=2023):
    """Fetch the most recent feature row for every non-imploded stock.

    The result of ``get_non_imp_stocks_query`` is registered as the temp view
    ``temp_table`` (replacing any existing view of that name). For each
    ticker, rows of ``table`` dated before ``cutoff_year`` are ranked by date
    descending and only the top-ranked one is kept. That row is then
    left-joined to ``FF_BASIC_AF`` on ``fsym_id`` and matching calendar year
    to pick up ``FF_PRICE_CLOSE_FP``.

    Args:
        metric_string: comma-separated select-list fragment for the inner
            query (the caller in this file prefixes columns with ``a.``).
        metric_string2: the matching fragment for the outer query (prefixed
            with ``r.`` by the caller in this file).
        table: name of the feature table aliased as ``a``; interpolated
            verbatim into the SQL.
        cutoff_year: only rows with ``YEAR(a.date)`` strictly below this
            value are considered. Defaults to 2023, the value that was
            previously hard-coded.

    Returns:
        Spark DataFrame with ``Ticker``, ``date``, ``FF_PRICE_CLOSE_FP`` and
        the requested metric columns, ordered by ticker and date.

    Raises:
        ValueError / TypeError: if ``cutoff_year`` cannot be converted to int.
    """
    # Coerce to int so only a number can ever be interpolated into the SQL.
    cutoff_year = int(cutoff_year)

    df = get_non_imp_stocks_query()
    df.createOrReplaceTempView("temp_table")
    query = f"""WITH RankedData AS (
    SELECT
        t.ticker_region, t.fsym_id,
        a.date,
        {metric_string},
        ROW_NUMBER() OVER (PARTITION BY t.ticker_region ORDER BY a.date DESC) AS row_num
        FROM temp_table t
        LEFT JOIN {table} a ON a.fsym_id = t.fsym_id
        WHERE YEAR(a.date) < {cutoff_year} )
    SELECT
        r.ticker_region AS Ticker, r.date,  b.FF_PRICE_CLOSE_FP, {metric_string2}
        FROM RankedData r
        LEFT JOIN FF_BASIC_AF b ON b.fsym_id = r.fsym_id AND YEAR(b.date) = YEAR(r.date)
        WHERE row_num <= 1
        ORDER BY ticker_region, date"""
    new_df = spark.sql(query)
    print("non imploded query done")
    return new_df


def create_non_imploded_ds(table):
    """Build the negative-class (non-imploded) feature set from ``table``.

    Requests a fixed list of metric columns through
    ``get_features_for_non_imploded`` and labels every returned row with
    ``Implosion_Next_Year = 0``.

    Args:
        table: name of the feature table to query.

    Returns:
        Spark DataFrame of features plus the constant label column.
    """
    # The earlier `SELECT * ... LIMIT 10` probe was removed: its result was
    # never used, so it only cost an extra Spark query on every call.
    cols = ['ff_debt_entrpr_val', 'ff_tot_debt_tcap_std', 'ff_fix_assets_com_eq', 'ff_debt_eq', 'ff_debt_com_eq', 'ff_inven_curr_assets', 'ff_ltd_com_eq', 'ff_liabs_lease', 'ff_ltd_tcap', 'ff_sales_wkcap',
           'ff_bps_gr', 'ff_oper_inc_tcap', 'ff_assets_gr', 'ff_fcf_yld', 'ff_mkt_val_gr', 'ff_earn_yld', 'ff_pbk_tang', 'ff_zscore', 'ff_entrpr_val_sales', 'ff_psales_dil'] #advanced_der_qf

    # The same columns are needed twice: qualified with `a.` for the inner
    # ranking query and with `r.` for the outer select.
    metric_string = ', '.join('a.' + item for item in cols)
    metric_string2 = ', '.join('r.' + item for item in cols)
    df = get_features_for_non_imploded(metric_string, metric_string2, table)
    return df.withColumn("Implosion_Next_Year", lit(0))

def create_imploded_df(table):
    """Build the positive-class (imploded) feature set from ``table``.

    Loads ``imploded_stocks.csv``, drops its first column, derives ``Year``
    as the implosion year minus one, labels every row with
    ``Implosion_Next_Year = 1`` and fetches the matching features through
    ``get_features_for_imploded_stocks``.

    Args:
        table: name of the feature table to query.

    Returns:
        Spark DataFrame returned by ``get_features_for_imploded_stocks``.
    """
    events = get_implosion_df('imploded_stocks.csv')
    leading_column = events.columns[0]

    # A disabled earlier variant also added zero-labelled rows for one, two and
    # three years further back; only the single labelled year is used now.
    labelled = (
        events.drop(columns=leading_column)
        .assign(
            Implosion_Year=lambda frame: frame['Implosion_Date'].dt.year - 1,
            Implosion_Next_Year=1,
        )
        .sort_values(by=['Ticker', 'Implosion_Year'])
        .reset_index(drop=True)
        .rename(columns={'Implosion_Year': 'Year'})
    )

    feature_columns = [
        'ff_debt_entrpr_val', 'ff_tot_debt_tcap_std', 'ff_fix_assets_com_eq',
        'ff_debt_eq', 'ff_debt_com_eq', 'ff_inven_curr_assets', 'ff_ltd_com_eq',
        'ff_liabs_lease', 'ff_ltd_tcap', 'ff_sales_wkcap', 'ff_bps_gr',
        'ff_oper_inc_tcap', 'ff_assets_gr', 'ff_fcf_yld', 'ff_mkt_val_gr',
        'ff_earn_yld', 'ff_pbk_tang', 'ff_zscore', 'ff_entrpr_val_sales',
        'ff_psales_dil',
    ]
    # Columns are qualified with the alias `a` used for `table` in the query.
    select_fragment = ', '.join(f'a.{name}' for name in feature_columns)

    return get_features_for_imploded_stocks(labelled, select_fragment, table)


def boruta_fs():
    """Run Boruta feature selection with an XGBoost classifier.

    Reads ``Advanced_AF_DER_Dataset.csv``, uses every column except
    ``Ticker``, ``date`` and ``Implosion_Next_Year`` as features, holds out
    20% of the rows (``random_state=42``), standardises the training features
    and fits ``BorutaPy`` on them. The selector's ``support_`` mask and
    ``ranking_`` array are printed; nothing is returned.

    Raises:
        ImportError: if the ``boruta`` package is not installed.
    """
    # Imported lazily: the notebook-level `from boruta import BorutaPy` is
    # commented out, so the name would otherwise be undefined here.
    from boruta import BorutaPy

    result_df = pd.read_csv('Advanced_AF_DER_Dataset.csv', index_col=None)
    X = result_df.drop(['Ticker', 'date', 'Implosion_Next_Year'], axis=1)
    Y = result_df['Implosion_Next_Year']

    # Split first, then fit the scaler on the training rows only, so the
    # held-out rows do not leak into the scaling statistics.
    X_train, _X_test, y_train, _y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)

    model = xgb.XGBClassifier()
    feat_selector = BorutaPy(model, n_estimators='auto', verbose=2, random_state=1)
    feat_selector.fit(X_train, y_train)

    print(feat_selector.support_)
    print(feat_selector.ranking_)
    
    
def lasso_fs():
    """Print Lasso coefficient rankings for each time-series training fold.

    Reads ``Advanced_AF_DER_Dataset.csv``, derives ``Year``, ``Month`` and
    ``DayOfWeek`` from ``date`` and uses every column except ``Ticker``,
    ``date`` and ``Implosion_Next_Year`` as features. For each of the five
    ``TimeSeriesSplit`` folds it standardises the training rows, fits
    ``Lasso(alpha=0.01)`` and prints the coefficients sorted by absolute
    value. Nothing is returned.
    """
    result_df = pd.read_csv('Advanced_AF_DER_Dataset.csv', index_col=None)
    result_df['date'] = pd.to_datetime(result_df['date'])
    # TimeSeriesSplit treats row order as time order, but create_dataset sorts
    # by Ticker and then date before writing the CSV. Re-sort chronologically
    # (stable, so ties keep their file order) so each fold trains on earlier rows.
    result_df = result_df.sort_values(by='date', kind='stable').reset_index(drop=True)
    result_df['Year'] = result_df['date'].dt.year
    result_df['Month'] = result_df['date'].dt.month
    result_df['DayOfWeek'] = result_df['date'].dt.dayofweek
    X = result_df.drop(['Ticker', 'date', 'Implosion_Next_Year'], axis=1)
    cols = X.columns
    print(X.head())
    Y = result_df['Implosion_Next_Year']

    tscv = TimeSeriesSplit(n_splits=5)
    alpha = 0.01

    # Only the training part of each split is used; the test indices are ignored.
    for fold, (train_index, _test_index) in enumerate(tscv.split(X), start=1):
        X_train = X.iloc[train_index]
        y_train = Y.iloc[train_index]

        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)

        lasso_model = Lasso(alpha=alpha)
        lasso_model.fit(X_train_scaled, y_train)

        feature_importance_df = pd.DataFrame({'Feature': cols, 'Importance': lasso_model.coef_})
        feature_importance_df['Absolute Importance'] = feature_importance_df['Importance'].abs()
        feature_importance_df = feature_importance_df.sort_values(by='Absolute Importance', ascending=False).drop('Absolute Importance', axis=1)
        # Label with the current fold number, not the total number of splits.
        print(f'Feature Importances for Fold {fold}:')
        print(feature_importance_df)
    

def create_dataset(table):
    """Assemble the modelling dataset and write it to ``Advanced_AF_DER_Dataset.csv``.

    Combines the imploded and non-imploded feature sets for ``table``, joins
    the macro features from ``get_macro_features`` on calendar month, drops
    columns that are more than half null, drops remaining rows with nulls and
    writes the result without the index. Progress and class counts are
    printed; nothing is returned.

    Args:
        table: name of the feature table passed to both dataset builders.
    """
    # Imploded set is built first; both Spark results are pulled into pandas.
    imploded = create_imploded_df(table).toPandas()
    survivors = create_non_imploded_ds(table).toPandas()
    # NOTE(review): non-imploded tickers come from `ticker_region` (ending in
    # '-US') while imploded ones come from the CSV `Ticker` column, which the
    # query joins by appending '-US' -- confirm the mixed format is intended.
    combined = pd.concat([survivors, imploded], ignore_index=True)
    combined['date'] = pd.to_datetime(combined['date'], format='%Y-%m-%d')
    combined = combined.sort_values(by=['Ticker', 'date'])

    # Macro rows are matched to feature rows by calendar month.
    macro = get_macro_features().reset_index()
    macro['Date'] = pd.to_datetime(macro['Date'], format='%d/%m/%Y')
    combined['month_year'] = combined['date'].dt.to_period("M")
    macro['Month_year'] = macro['Date'].dt.to_period("M")
    combined = combined.merge(macro, left_on='month_year', right_on='Month_year', how='left')
    combined = combined.drop(columns=['Date', 'index', 'month_year', 'Month_year', 'GDP'])

    print(combined.head())

    # Discard columns in which more than half of the rows are null.
    null_fraction = combined.isnull().sum() / len(combined)
    sparse_columns = null_fraction[null_fraction > 0.5].index.tolist()
    combined = combined.drop(columns=sparse_columns)
    print("dropped cols: ", sparse_columns)

    print("before dropping nulls: ", len(combined))
    complete = combined.dropna()
    print("after dropping nulls: ", len(complete))

    labels = complete['Implosion_Next_Year']
    print("number of implosions: ", int((labels == 1).sum()))
    print("number of non-implosions: ", int((labels == 0).sum()))

    complete.to_csv('Advanced_AF_DER_Dataset.csv', index=False)
    print("dataset written")
    
#     result_df = ps.DataFrame(result_df)
#     X = result_df.drop(['Ticker', 'date', 'Implosion_Next_Year'], axis=1)
#     y = result_df['Implosion_Next_Year']
#     scaler = StandardScaler()
#     scaler.fit(X)
#     X=scaler.transform(X)
#     X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2, random_state=42)
    
#     model = xgb.XGBClassifier()
#     feat_selector = BorutaPy(model, n_estimators='auto', verbose=2, random_state=1)
#     feat_selector.fit(X_train, y_train)
    
#     print(feat_selector.support_)
#     print(feat_selector.ranking_)
    
    
    
    
    

    
# Build the dataset from the FF_ADVANCED_DER_AF table; this writes
# 'Advanced_AF_DER_Dataset.csv', which the feature-selection steps below read.
create_dataset('FF_ADVANCED_DER_AF')
# Boruta feature selection is currently disabled.
#boruta_fs()
# Print the per-fold Lasso coefficient rankings for the dataset written above.
lasso_fs()

imploded query done
got non imploded stocks
non imploded query done
    Ticker       date  FF_PRICE_CLOSE_FP  ff_debt_entrpr_val  \
0     A-US 2022-10-31          138.35001            0.067346   
1    AA-US 2022-12-31           45.47000            0.189013   
2  AAAP-US 2016-12-31           26.76000            0.019984   
3   AAB-US 1998-12-31           20.68800            0.003673   
4  AABC-US 2004-12-31           14.51000            0.550329   

   ff_tot_debt_tcap_std  ff_fix_assets_com_eq  ff_debt_eq  ff_debt_com_eq  \
0             35.509361             23.562677   55.061263       55.061263   
1             27.194492            127.915682   37.352246       37.352246   
2              5.167838             21.343347    5.449458        5.449458   
3              1.672403             37.411357    1.700848        1.700848   
4             41.452296             39.513874   70.800875       70.800875   

   ff_inven_curr_assets  ff_ltd_com_eq  ...  ff_fcf_yld  ff_mkt_val_gr  \
0         