In [4]:
import findspark
import pandas as pd
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkConf

# for shared metastore (shared across all users)
# NOTE: `builder.config()` only applies `key`/`value` when `conf` is not given,
# so the SparkConf and the metastore URI are set in two separate calls. The URI
# is set last so it is not overridden by anything carried in the SparkConf.
spark = (
    SparkSession.builder
    .appName("List available databases and tables")
    .config(conf=SparkConf())
    .config("hive.metastore.uris", "thrift://bialobog:9083")
    .getOrCreate()
)

# for local metastore (your private, individual database) add the following config to spark session
# NOTE(review): no local-metastore config actually follows this comment in the visible cell.

spark.catalog.listDatabases()

[Database(name='2022_10_22', catalog='spark_catalog', description='FactSet data version for the day', locationUri='hdfs://bialobog.cs.ucl.ac.uk:8020/user/hive/warehouse'),
 Database(name='2023_04_01', catalog='spark_catalog', description='FactSet data version for the day', locationUri='hdfs://bialobog.cs.ucl.ac.uk:8020/user/hive/warehouse'),
 Database(name='default', catalog='spark_catalog', description='Default Hive database', locationUri='hdfs://bialobog.cs.ucl.ac.uk:8020/user/hive/warehouse')]

In [5]:
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import when

# Switch the session's current database to 2023_04_01 so that the unqualified
# table names used by the queries in later cells resolve against it.
spark.sql("USE 2023_04_01")


DataFrame[]

In [6]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import when
import pyspark.pandas as ps
import numpy as np
from scipy.stats import zscore



def query(ticker):
    """Return the FF_ADVANCED_AF report dates available for one ticker.

    Joins ``FF_ADVANCED_AF`` to ``sym_ticker_region`` on ``fsym_id``, keeps the
    rows whose ``ticker_region`` equals ``"<ticker>-US"`` and orders them by
    date.  The trailing ``-US`` is stripped from ``ticker_region`` before the
    Spark DataFrame is returned.

    Parameters
    ----------
    ticker : str
        Ticker symbol without the region suffix; it is interpolated directly
        into the SQL text.

    Returns
    -------
    Spark DataFrame with the columns ``ticker_region`` and ``date``.
    """
    sql_text = f"""SELECT d.ticker_region, a.date
                FROM FF_ADVANCED_AF a 
                LEFT JOIN sym_ticker_region d ON d.fsym_id = a.fsym_id 
                WHERE d.ticker_region = "{ticker}-US"
                ORDER BY a.date
                """

    # Drop the region suffix so callers get the bare ticker back.
    bare_ticker = regexp_replace("ticker_region", "-US$", "")
    return spark.sql(sql_text).withColumn("ticker_region", bare_ticker)

def get_top_bottom_ten(df):
    """Return the metric names at both ends of the ``Value`` ranking.

    Rows are sorted by ``Value`` in ascending order and any row containing a
    missing value is dropped.  The first ten rows (smallest values) and the
    last ten rows (largest values) are printed, and their ``Metric`` entries
    are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Metric`` and ``Value``.

    Returns
    -------
    tuple[list, list]
        ``(metrics of the first ten rows, metrics of the last ten rows)``.
    """
    ranked = df.sort_values(by='Value').dropna()
    first_ten, last_ten = ranked.iloc[:10], ranked.iloc[-10:]
    print(first_ten, last_ten)
    return first_ten['Metric'].tolist(), last_ten['Metric'].tolist()

def avg_change_df(df, big_string, big_string2, avg_string):
    """Summarise how each averaged metric differs before vs. after implosion.

    ``df`` is registered as the Spark temp view ``temp_table`` and joined to
    ``sym_ticker_region`` / ``FF_ADVANCED_AF`` twice: once for rows dated
    before ``Implosion_Date`` and once for rows dated after it.  Both queries
    aggregate per ticker with ``avg_string`` and are ordered by ticker, so the
    two result frames line up row by row.

    For every numeric column the per-ticker percentage difference
    ``(before - after) / after * 100`` is computed.  Infinite values and
    values more than three standard deviations from the column mean are
    discarded, and the mean of the remaining values is reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the ``Ticker`` and ``Implosion_Date`` columns referenced
        by the SQL.
    big_string, big_string2 : str
        Not used by the queries; kept so existing callers keep working.
    avg_string : str
        Comma-separated aggregate expressions over the ``a`` alias, e.g.
        ``"AVG(a.x), AVG(a.y)"``.

    Returns
    -------
    pandas.DataFrame
        One row per metric with the columns ``Metric`` and ``Value``.
    """
    null_threshold = 200      # drop metrics missing for more than this many tickers
    z_score_threshold = 3.0   # outlier cut-off, in standard deviations

    spark.createDataFrame(df).createOrReplaceTempView("temp_table")

    # Both queries must select the same aggregate expressions: bare columns are
    # not valid next to GROUP BY t.Ticker, and the "after" frame is indexed
    # below with the column names of the "before" frame.
    query1 = f"""
                SELECT t.Ticker, {avg_string}
                FROM temp_table t  
                LEFT JOIN sym_ticker_region s ON t.Ticker = SUBSTRING(s.ticker_region, 1, LENGTH(s.ticker_region)-3) AND SUBSTRING(s.ticker_region, -2, 3) = 'US'
                LEFT JOIN FF_ADVANCED_AF a ON s.fsym_id = a.fsym_id AND a.date < t.Implosion_Date
                GROUP BY t.Ticker
                ORDER BY t.Ticker
            """
    query2 = f"""
                SELECT {avg_string}
                FROM temp_table t  
                LEFT JOIN sym_ticker_region s ON t.Ticker = SUBSTRING(s.ticker_region, 1, LENGTH(s.ticker_region)-3) AND SUBSTRING(s.ticker_region, -2, 3) = 'US'
                LEFT JOIN FF_ADVANCED_AF a ON s.fsym_id = a.fsym_id AND a.date > t.Implosion_Date
                GROUP BY t.Ticker
                ORDER BY t.Ticker
            """
    df1 = spark.sql(query1)
    df2 = spark.sql(query2)
    # show() prints the preview itself and returns None, so it is not wrapped in print().
    df1.show(5)

    df1 = df1.toPandas()
    df2 = df2.toPandas()

    # Keep only numeric metric columns; this also drops the Ticker label column
    # regardless of which string dtype pandas assigns to it.
    numeric_columns = df1.select_dtypes(include='number').columns
    df1 = df1[numeric_columns]
    df2 = df2[numeric_columns]

    columns_to_drop = df1.columns[df1.isnull().sum() > null_threshold]
    df1 = df1.drop(columns=columns_to_drop)
    df2 = df2.drop(columns=columns_to_drop)

    percentage_change_df = ((df1 - df2) / df2) * 100

    metric_dict = {}
    for column in percentage_change_df.columns:
        # A zero denominator produces +/-inf; treat those as missing.
        new_col = percentage_change_df[column].replace([np.inf, -np.inf], np.nan).dropna()
        mean_val = new_col.mean()
        stddev_val = new_col.std()
        within_band = (
            (new_col >= mean_val - z_score_threshold * stddev_val)
            & (new_col <= mean_val + z_score_threshold * stddev_val)
        )
        metric_dict[column] = new_col[within_band].mean()

    return pd.DataFrame(list(metric_dict.items()), columns=['Metric', 'Value'])

def pct_change_df(df, big_string, big_string2):
    """Summarise the percentage change of each metric between two years.

    ``df`` is registered as the Spark temp view ``temp_table``.  One query
    selects ``big_string`` from ``FF_ADVANCED_AF`` rows whose year matches
    ``Implosion_Date``; the other selects ``big_string2`` from rows whose year
    matches ``Implosion_Prev4Years``.  Both are ordered by ticker and then
    combined position by position as ``(current - previous) / previous * 100``.

    For every non-object column, infinite values and values more than three
    standard deviations from the column mean are discarded before the mean of
    the remaining values is taken.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the ``Ticker``, ``Implosion_Date`` and
        ``Implosion_Prev4Years`` columns referenced by the SQL.
    big_string : str
        Select list over the ``a`` alias (implosion year).
    big_string2 : str
        Select list over the ``b`` alias (earlier year); its result must carry
        the same column names as the first query.

    Returns
    -------
    pandas.DataFrame
        One row per metric with the columns ``Metric`` and ``Value``.
    """
    spark.createDataFrame(df).createOrReplaceTempView("temp_table")

    query1 = f"""
                SELECT {big_string}
                FROM temp_table t  
                LEFT JOIN sym_ticker_region s ON t.Ticker = SUBSTRING(s.ticker_region, 1, LENGTH(s.ticker_region)-3) AND SUBSTRING(s.ticker_region, -2, 3) = 'US'
                LEFT JOIN FF_ADVANCED_AF a ON s.fsym_id = a.fsym_id AND YEAR(a.date) = YEAR(t.Implosion_Date)
                ORDER BY t.Ticker
            """
    query2 = f"""
                SELECT {big_string2}
                FROM temp_table t  
                LEFT JOIN sym_ticker_region s ON t.Ticker = SUBSTRING(s.ticker_region, 1, LENGTH(s.ticker_region)-3) AND SUBSTRING(s.ticker_region, -2, 3) = 'US'
                LEFT JOIN FF_ADVANCED_AF b ON s.fsym_id = b.fsym_id AND YEAR(b.date) = YEAR(t.Implosion_Prev4Years)
                ORDER BY t.Ticker
            """
    spark_frames = [spark.sql(text) for text in (query1, query2)]
    current_pdf, previous_pdf = [frame.toPandas() for frame in spark_frames]

    # Restrict both frames to the non-object columns of the current-year frame.
    kept_columns = current_pdf.select_dtypes(exclude=['object']).columns
    current_pdf = current_pdf[kept_columns]
    previous_pdf = previous_pdf[kept_columns]

    # Metrics that are missing for too many rows are removed from both sides.
    null_threshold = 200
    sparse_columns = current_pdf.columns[current_pdf.isnull().sum() > null_threshold]
    current_pdf = current_pdf.drop(columns=sparse_columns)
    previous_pdf = previous_pdf.drop(columns=sparse_columns)

    percentage_change_df = ((current_pdf - previous_pdf) / previous_pdf) * 100

    z_score_threshold = 3.0
    metric_dict = {}
    for name in percentage_change_df.columns:
        # A zero denominator yields +/-inf; those entries count as missing.
        observed = percentage_change_df[name].replace([np.inf, -np.inf], np.nan).dropna()
        centre = observed.mean()
        spread = observed.std()
        not_too_low = observed >= centre - z_score_threshold * spread
        not_too_high = observed <= centre + z_score_threshold * spread
        metric_dict[name] = observed[not_too_low & not_too_high].mean()

    return pd.DataFrame(list(metric_dict.items()), columns=['Metric', 'Value'])
    
    # df['pct_change'] = (df[metric_curr] - df[metric_prev])/df[metric_prev]
    # df['pct_change'] = df['pct_change'].replace([np.inf, -np.inf], np.nan) 
    # df=df.dropna(axis=0)
    # mean_val = df['pct_change'].mean()
    # stddev_val = df['pct_change'].std()
    # z_score_threshold = 3.0
    # df = df[
    # (df['pct_change'] >= mean_val - z_score_threshold * stddev_val) &
    # (df['pct_change'] <= mean_val + z_score_threshold * stddev_val)]
    # new_mean = df['pct_change'].mean()
    # return new_mean




def get_metric_changes(filename):
    """Rank FF_ADVANCED_AF metrics by their change around each implosion date.

    Reads the CSV at ``filename``, converts ``Implosion_Date`` to plain dates,
    and takes the first five column names of ``FF_ADVANCED_AF``.  From those it
    builds the ``a.``-prefixed, ``b.``-prefixed and ``AVG(...)`` select lists,
    passes them to ``avg_change_df`` and ranks the result with
    ``get_top_bottom_ten``.

    Parameters
    ----------
    filename : str
        Path to a CSV with an ``Implosion_Date`` column.

    Returns
    -------
    tuple[list, list]
        Two lists, each holding one list of metric names per pass of the loop
        (the loop currently runs exactly once).
    """
    implosions = pd.read_csv(filename, index_col=False)
    implosions['Implosion_Date'] = pd.to_datetime(implosions['Implosion_Date']).dt.date

    # Only the column names are needed from this query.
    metric_names = spark.sql("SELECT * FROM FF_ADVANCED_AF LIMIT 10").columns[:5]
    result_string = ', '.join(['a.' + name for name in metric_names])
    result_string2 = ', '.join(['b.' + name for name in metric_names])
    # Splitting on ',' keeps the leading space of every piece after the first,
    # so the generated text reads "AVG(a.c1), AVG( a.c2), ...".
    avg_string = ', '.join(['AVG(' + piece + ')' for piece in result_string.split(',')])

    top10s, bottom10s = [], []
    for _ in range(1, 2):
        metric_changes = avg_change_df(implosions, result_string, result_string2, avg_string)
        first_ten, last_ten = get_top_bottom_ten(metric_changes)
        top10s.append(first_ten)
        bottom10s.append(last_ten)
    return top10s, bottom10s


# tops, bottoms = get_metric_changes('imploded_tickers_dates.csv')




In [14]:
# NOTE(review): in the visible cells `tops` is only assigned by the
# `get_metric_changes(...)` call that is commented out in the previous cell, so
# this cell relies on kernel state left over from an earlier run.
tops

[['avg(ff_amort_dfd_chrg)',
  'avg(ff_amort_intang)',
  'avg(ff_amort_cf)',
  'avg(ff_assets_intl)']]

In [39]:
def consistent_changes(ticker, big_string):
    """Correlate the selected metrics with the closing price for one ticker.

    Selects ``b.FF_PRICE_CLOSE_FP`` plus ``big_string`` from ``FF_ADVANCED_AF``
    joined to ``sym_ticker_region`` and ``FF_BASIC_QF`` (matched on date and
    ``fsym_id``) for ``"<ticker>-US"`` from 2009-01-01 onwards.  The result is
    wrapped in a pandas-on-Spark DataFrame, object-typed columns are removed,
    and the ``FF_PRICE_CLOSE_FP`` column of the correlation matrix is returned.
    Three progress messages are printed along the way.

    Parameters
    ----------
    ticker : str
        Ticker symbol without the region suffix; interpolated into the SQL.
    big_string : str
        Extra select-list expressions appended after the price column.

    Returns
    -------
    The ``FF_PRICE_CLOSE_FP`` column of ``DataFrame.corr()``.
    """
    start_date = pd.to_datetime("2009-01-01")
    query1 = f"""
                SELECT b.FF_PRICE_CLOSE_FP, {big_string}
                FROM FF_ADVANCED_AF a
                LEFT JOIN sym_ticker_region d ON d.fsym_id = a.fsym_id
                LEFT JOIN FF_BASIC_QF b ON b.date=a.date AND d.fsym_id=b.fsym_id
                WHERE d.ticker_region = "{ticker}-US" 
                AND a.date >= "{start_date}"
                ORDER BY a.date
            """
    spark_frame = spark.sql(query1)
    print("query done")

    metrics = ps.DataFrame(spark_frame)
    metrics = metrics[metrics.select_dtypes(exclude=['object']).columns]
    print("filtering done")

    # A null-count based column filter was sketched here earlier; it is not applied.
    price_correlations = metrics.corr()['FF_PRICE_CLOSE_FP']
    print("corr done")
    return price_correlations

def corr_query(implosion_df, col_string):
    """Average the per-ticker aggregates in ``col_string`` across all tickers.

    ``implosion_df`` is registered as the Spark temp view ``temp_table`` and
    joined to ``sym_ticker_region`` (on ``Ticker`` + ``'-US'``),
    ``FF_ADVANCED_QF`` and ``FF_BASIC_QF`` for rows dated 2009-01-01 or later.
    The query groups by ticker and evaluates ``col_string`` per group.  The
    column means of that result are returned in ascending order.

    Parameters
    ----------
    implosion_df : pandas.DataFrame
        Must provide the ``Ticker`` column referenced by the SQL.
    col_string : str
        Comma-separated aggregate expressions placed in the select list.

    Returns
    -------
    The sorted column means of the pandas-on-Spark result frame.
    """
    start_date = pd.to_datetime("2009-01-01")
    spark.createDataFrame(implosion_df).createOrReplaceTempView("temp_table")

    # An earlier variant of this query joined FF_BASIC_QF directly as `a`,
    # without going through FF_ADVANCED_QF.
    query1 = f"""
                SELECT t.Ticker, {col_string}
                FROM temp_table t
                LEFT JOIN sym_ticker_region s ON s.ticker_region = CONCAT(t.Ticker, '-US')
                LEFT JOIN FF_ADVANCED_QF a ON s.fsym_id = a.fsym_id
                LEFT JOIN FF_BASIC_QF b ON b.date=a.date AND s.fsym_id=b.fsym_id
                WHERE a.date >= "{start_date}"
                GROUP BY t.Ticker
                ORDER BY t.Ticker
            """
    per_ticker = ps.DataFrame(spark.sql(query1))
    return per_ticker.mean().sort_values()

def corr_analysis(filename):
    """Print the ten lowest and ten highest mean price correlations.

    Reads the CSV at ``filename``, converts ``Implosion_Date`` to plain dates,
    and looks at a ten-row sample of ``FF_ADVANCED_QF`` to find its
    ``float64`` columns.  A ``CORR(a.<column>, FF_PRICE_CLOSE_FP)`` expression
    is generated for each of them and handed to ``corr_query``.  Missing
    results are dropped before the first and last ten entries are printed.

    Parameters
    ----------
    filename : str
        Path to a CSV with an ``Implosion_Date`` column.

    Returns
    -------
    None
    """
    implosions = pd.read_csv(filename, index_col=False)
    implosions['Implosion_Date'] = pd.to_datetime(implosions['Implosion_Date']).dt.date

    # The sample rows are only used to read column names and dtypes.
    sample = ps.DataFrame(spark.sql("SELECT * FROM FF_ADVANCED_QF LIMIT 10"))
    float_columns = [name for name in sample.columns if sample[name].dtype == 'float64']

    # One CORR(...) aggregate per float column, joined into a single select list.
    corr_string = ', '.join(['CORR(a.' + name + ', FF_PRICE_CLOSE_FP)' for name in float_columns])

    # corr_query returns the means sorted ascending, so head() holds the lowest values.
    mean_vals = corr_query(implosions, corr_string).dropna()
    print("Top 10: ", mean_vals.head(10))
    print("Bottom 10: ", mean_vals.tail(10))
    
    
        
    
    
    
# Run the correlation ranking for the tickers listed in imploded_stocks.csv;
# the function prints its results and returns nothing.
corr_analysis('imploded_stocks.csv')

Top 10:  corr(ff_loan_for, FF_PRICE_CLOSE_FP)               -1.000000
corr(ff_ins_rsrv, FF_PRICE_CLOSE_FP)               -0.767037
corr(ff_shs_repurch_auth_shs, FF_PRICE_CLOSE_FP)   -0.605042
corr(ff_com_eq_unearn_comp, FF_PRICE_CLOSE_FP)     -0.591294
corr(ff_pol_claims, FF_PRICE_CLOSE_FP)             -0.469525
corr(ff_com_eq_apic, FF_PRICE_CLOSE_FP)            -0.372273
corr(ff_com_shs_out_secs, FF_PRICE_CLOSE_FP)       -0.362745
corr(ff_accel_dep, FF_PRICE_CLOSE_FP)              -0.357423
corr(ff_ppe_dep, FF_PRICE_CLOSE_FP)                -0.329758
corr(ff_intang_oth_amort, FF_PRICE_CLOSE_FP)       -0.303841
dtype: float64
Bottom 10:  corr(ff_dps_secs, FF_PRICE_CLOSE_FP)            0.379162
corr(ff_loan_brkr, FF_PRICE_CLOSE_FP)           0.394672
corr(ff_bps_secs, FF_PRICE_CLOSE_FP)            0.440450
corr(ff_loan_bk, FF_PRICE_CLOSE_FP)             0.489035
corr(ff_int_inc_deps, FF_PRICE_CLOSE_FP)        0.495577
corr(ff_tot_invest_ret, FF_PRICE_CLOSE_FP)      0.497328
corr(ff_asse

In [13]:
def get_total_annual_returns(ticker):
    """Show the first ten dividend rows recorded for one ticker since 2009.

    Selects ``P_DIVS_EXDATE`` and ``P_DIVS_PD`` from ``fp_basic_dividends``
    joined to ``sym_ticker_region`` for ``"<ticker>-US"``, restricted to
    ex-dates on or after 2009-01-01 and ordered by ex-date, then prints the
    first ten rows with ``show``.

    NOTE(review): despite the name, nothing is computed or returned here; the
    function only displays the query result.

    Parameters
    ----------
    ticker : str
        Ticker symbol without the region suffix; interpolated into the SQL.

    Returns
    -------
    None
    """
    start_date = pd.to_datetime("2009-01-01")
    query1 = f"""
                SELECT P_DIVS_EXDATE, P_DIVS_PD
                FROM fp_basic_dividends a
                LEFT JOIN sym_ticker_region d ON d.fsym_id = a.fsym_id
                WHERE d.ticker_region = "{ticker}-US" 
                AND a.P_DIVS_EXDATE >= "{start_date}"
                ORDER BY a.P_DIVS_EXDATE
            """
    spark.sql(query1).show(10)

def get_ticker_df(filename):
    """Load a ticker CSV and convert ``Implosion_Date`` to plain dates.

    Parameters
    ----------
    filename : str
        Path to a CSV containing an ``Implosion_Date`` column.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with ``Implosion_Date`` holding ``datetime.date``
        objects.
    """
    tickers = pd.read_csv(filename, index_col=False)
    parsed_dates = pd.to_datetime(tickers['Implosion_Date'])
    tickers['Implosion_Date'] = parsed_dates.dt.date
    return tickers

def get_all_stocks():
    """Return the bare tickers of USD securities listed on NAS or NYS.

    Queries ``sym_ticker_region`` joined to ``FF_SEC_COVERAGE`` and
    ``sym_coverage`` for ``ticker_region`` values that end in ``-US``, contain
    no ``.``, are priced in ``USD`` and whose listing exchange is ``NAS`` or
    ``NYS``.  The ``-US`` suffix is stripped and all rows are collected to the
    driver.

    Returns
    -------
    list
        One ``ticker_region`` value per result row.
    """
    sql_text = """SELECT s.ticker_region, sc.fref_listing_exchange FROM sym_ticker_region s 
                LEFT JOIN FF_SEC_COVERAGE c ON c.fsym_id = s.fsym_id
                LEFT JOIN sym_coverage sc ON sc.fsym_id = s.fsym_id
                WHERE s.ticker_region LIKE "%-US" AND s.ticker_region NOT LIKE '%.%' AND c.CURRENCY = "USD"
                AND (sc.fref_listing_exchange = "NAS" OR sc.fref_listing_exchange = "NYS")"""
    listings = spark.sql(sql_text).withColumn(
        "ticker_region", regexp_replace("ticker_region", "-US$", "")
    )

    ticker_list = []
    for row in listings.collect():
        ticker_list.append(row.ticker_region)
    return ticker_list

# df = get_ticker_df('imploded_stocks.csv')
# t_list = get_all_stocks()


    

+-------------+------------+
|P_DIVS_EXDATE|   P_DIVS_PD|
+-------------+------------+
|   2009-02-11|0.0649999976|
|   2009-03-13|0.0649999976|
|   2009-04-14|0.0649999976|
|   2009-05-13|0.0649999976|
|   2009-06-12|0.0649999976|
|   2009-07-15|0.0649999976|
|   2009-08-13|0.0649999976|
|   2009-09-14|0.0649999976|
|   2009-10-14|0.0649999976|
|   2009-11-12|0.0649999976|
+-------------+------------+
only showing top 10 rows

+-------------+---------+
|P_DIVS_EXDATE|P_DIVS_PD|
+-------------+---------+
+-------------+---------+

+-------------+---------+
|P_DIVS_EXDATE|P_DIVS_PD|
+-------------+---------+
|   2009-06-15|     1.75|
+-------------+---------+

+-------------+-----------+
|P_DIVS_EXDATE|  P_DIVS_PD|
+-------------+-----------+
|   2009-03-09|0.189999998|
|   2009-05-28|0.189999998|
|   2009-08-28|0.189999998|
|   2009-11-27|0.189999998|
|   2010-03-05|0.209999993|
|   2010-05-27|0.104999997|
|   2010-08-30|0.104999997|
|   2010-11-29|0.104999997|
|   2011-03-01|0.1099999

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
def get_top_bottom_five(df):
    """Print both ends of the ``Value`` ranking and the number of ranked rows.

    Rows are sorted by ``Value`` in ascending order and any row containing a
    missing value is dropped.  The first ten rows, the last ten rows and the
    remaining row count are printed in that order.

    NOTE(review): the name says "five" but ten rows are taken from each end.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``Value`` column.

    Returns
    -------
    None
    """
    ranked = df.sort_values(by='Value').dropna()
    for piece in (ranked.iloc[:10], ranked.iloc[-10:], len(ranked)):
        print(piece)


# Load the saved correlation table and print both ends of its `Value` ranking.
df = pd.read_csv('Avg_Correlations.csv', index_col=None)
get_top_bottom_five(df)
# TODO: the worst changes are done; next, find out which metrics decrease consistently.
# TODO: also compute the means before and after the period using the quarterly data and compare the difference.

In [None]:
# NOTE(review): in the visible cells `metric_df` only exists as a local variable
# inside avg_change_df / pct_change_df, so this cell appears to depend on
# kernel state that a fresh run would not have.
metric_df

In [None]:
# NOTE(review): in the visible cells `metric_dict` only exists as a local variable
# inside avg_change_df / pct_change_df, so this cell appears to depend on
# kernel state that a fresh run would not have.
metric_dict