In [1]:
import findspark
import pandas as pd
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkConf

# for shared metastore (shared across all users)
# NOTE: SparkSession.Builder.config() ignores `key`/`value` whenever `conf` is
# passed in the same call, so the SparkConf and the metastore URI are applied
# in two separate .config() calls; otherwise the URI is silently dropped.
spark = (
    SparkSession.builder
    .appName("List available databases and tables")
    .config(conf=SparkConf())
    .config("hive.metastore.uris", "thrift://bialobog:9083")
    .getOrCreate()
)

# for local metastore (your private, individual database) add the following config to spark session

# Last expression of the cell: displays the databases visible through the metastore.
spark.catalog.listDatabases()

[Database(name='2022_10_22', catalog='spark_catalog', description='FactSet data version for the day', locationUri='hdfs://bialobog.cs.ucl.ac.uk:8020/user/hive/warehouse'),
 Database(name='2023_04_01', catalog='spark_catalog', description='FactSet data version for the day', locationUri='hdfs://bialobog.cs.ucl.ac.uk:8020/user/hive/warehouse'),
 Database(name='default', catalog='spark_catalog', description='Default Hive database', locationUri='hdfs://bialobog.cs.ucl.ac.uk:8020/user/hive/warehouse')]

In [2]:
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import when

# Make the 2023_04_01 database the current one so later queries can use
# unqualified table names.
spark.sql("USE 2023_04_01")
# NOTE: no `ticker` variable is defined in this cell; the exploratory query below is disabled.

# query = f"""SELECT ticker_region FROM sym_ticker_region WHERE ticker_region LIKE "%-US" """
# df = spark.sql(query)
# df = df.withColumn("ticker_region", regexp_replace("ticker_region", "-US$", ""))
# ticker_list = df.collect()
# print(len(ticker_list))


DataFrame[]

In [139]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import when
import pyspark.pandas as ps
import numpy as np
from scipy.stats import zscore


#fund_df = fund_df.withColumn("ticker_region", regexp_replace("ticker_region", "-US$", ""))

def query(ticker):
    """Return the FF_ADVANCED_AF dates recorded for one US-listed ticker.

    Args:
        ticker: Ticker symbol without the region suffix; "-US" is appended
            when filtering ``sym_ticker_region.ticker_region``.

    Returns:
        A Spark DataFrame with columns ``ticker_region`` (with the trailing
        "-US" stripped again) and ``date``, ordered by ``date``.
    """
    # The ticker is interpolated directly into the SQL text, so it must come
    # from a trusted source.
    sql_text = f"""SELECT d.ticker_region, a.date
                FROM FF_ADVANCED_AF a 
                LEFT JOIN sym_ticker_region d ON d.fsym_id = a.fsym_id 
                WHERE d.ticker_region = "{ticker}-US"
                ORDER BY a.date
                """

    # Strip the region suffix so callers get the bare ticker back.
    return spark.sql(sql_text).withColumn(
        "ticker_region",
        regexp_replace("ticker_region", "-US$", ""),
    )

def get_top_bottom_ten(df):
    """Return the metric names at both ends of the 'Value' ranking.

    Args:
        df: pandas DataFrame with at least the columns 'Metric' and 'Value'.

    Returns:
        A tuple ``(lowest, highest)``: the 'Metric' names of the first ten and
        the last ten rows after sorting ascending by 'Value' and dropping rows
        that contain NaN. Both slices are also printed.
    """
    ranked = df.sort_values(by='Value').dropna()
    lowest_rows, highest_rows = ranked.head(10), ranked.tail(10)
    print(lowest_rows, highest_rows)
    return lowest_rows['Metric'].tolist(), highest_rows['Metric'].tolist()

def _outlier_trimmed_mean(values, z_score_threshold):
    """Mean of ``values`` after dropping NaNs and points outside mean +/- z * std."""
    observed = values.dropna()
    center = observed.mean()
    spread = observed.std()
    # With fewer than two observations ``spread`` is NaN, both comparisons are
    # False, and the result is NaN.
    within_band = observed[
        (observed >= center - z_score_threshold * spread)
        & (observed <= center + z_score_threshold * spread)
    ]
    return within_band.mean()


def pct_change_df(df, big_string, big_string2, null_threshold=200, z_score_threshold=3.0):
    """Average percentage change of every numeric metric between two years.

    The pandas frame ``df`` is registered as the Spark temp view
    ``temp_table`` and joined twice against ``FF_ADVANCED_AF``: once on the
    year of ``Implosion_Date`` (alias ``a``) and once on the year of
    ``Implosion_Prev4Years`` (alias ``b``). The per-row percentage change
    ``(current - previous) / previous * 100`` is then averaged per metric
    after removing infinities, NaNs and z-score outliers.

    Args:
        df: pandas DataFrame with the columns ``Ticker``, ``Implosion_Date``
            and ``Implosion_Prev4Years``.
        big_string: SELECT list for the implosion-year query (columns
            prefixed with ``a.``).
        big_string2: SELECT list for the earlier-year query (columns prefixed
            with ``b.``); it must yield the same column names as ``big_string``.
        null_threshold: Columns with more than this many nulls in the
            implosion-year result are dropped from both results.
        z_score_threshold: Observations further than this many standard
            deviations from the column mean are excluded from the average.

    Returns:
        pandas DataFrame with the columns ``Metric`` and ``Value`` (the
        trimmed mean percentage change of that metric).
    """
    spark.createDataFrame(df).createOrReplaceTempView("temp_table")

    query1 = f"""
                SELECT {big_string}
                FROM temp_table t
                LEFT JOIN sym_ticker_region s ON t.Ticker = SUBSTRING(s.ticker_region, 1, LENGTH(s.ticker_region)-3) AND SUBSTRING(s.ticker_region, -2, 3) = 'US'
                LEFT JOIN FF_ADVANCED_AF a ON s.fsym_id = a.fsym_id AND YEAR(a.date) = YEAR(t.Implosion_Date)
                ORDER BY t.Ticker
            """
    query2 = f"""
                SELECT {big_string2}
                FROM temp_table t
                LEFT JOIN sym_ticker_region s ON t.Ticker = SUBSTRING(s.ticker_region, 1, LENGTH(s.ticker_region)-3) AND SUBSTRING(s.ticker_region, -2, 3) = 'US'
                LEFT JOIN FF_ADVANCED_AF b ON s.fsym_id = b.fsym_id AND YEAR(b.date) = YEAR(t.Implosion_Prev4Years)
                ORDER BY t.Ticker
            """
    # NOTE(review): the two results are paired purely by row position after
    # ORDER BY t.Ticker. If a ticker matches a different number of rows in the
    # two years (or tickers repeat), the rows will not line up - confirm.
    df1 = spark.sql(query1).toPandas()
    df2 = spark.sql(query2).toPandas()

    # Keep only the columns that are non-object (numeric) in the current-year
    # frame and apply the same selection to the earlier-year frame.
    non_string_columns = df1.select_dtypes(exclude=['object']).columns
    df1 = df1[non_string_columns]
    df2 = df2[non_string_columns]

    # Sparsity is judged on the current-year frame only.
    columns_to_drop = df1.columns[df1.isnull().sum() > null_threshold]
    df1 = df1.drop(columns=columns_to_drop)
    df2 = df2.drop(columns=columns_to_drop)

    # Division by a zero baseline yields +/-inf; treat those as missing.
    percentage_change_df = (((df1 - df2) / df2) * 100).replace([np.inf, -np.inf], np.nan)

    metric_dict = {
        column: _outlier_trimmed_mean(percentage_change_df[column], z_score_threshold)
        for column in percentage_change_df.columns
    }
    metric_df = pd.DataFrame(list(metric_dict.items()), columns=['Metric', 'Value'])
    #metric_df.to_csv('ChangesBeforeImplosionA4yrs.csv', index=False)
    return metric_df
    
    # df['pct_change'] = (df[metric_curr] - df[metric_prev])/df[metric_prev]
    # df['pct_change'] = df['pct_change'].replace([np.inf, -np.inf], np.nan) 
    # df=df.dropna(axis=0)
    # mean_val = df['pct_change'].mean()
    # stddev_val = df['pct_change'].std()
    # z_score_threshold = 3.0
    # df = df[
    # (df['pct_change'] >= mean_val - z_score_threshold * stddev_val) &
    # (df['pct_change'] <= mean_val + z_score_threshold * stddev_val)]
    # new_mean = df['pct_change'].mean()
    # return new_mean




def get_metric_changes(filename):
    """Compare which metrics change most over 1- to 4-year look-back windows.

    For each look-back of 1, 2, 3 and 4 years the average percentage change
    of every FF_ADVANCED_AF metric is computed with ``pct_change_df`` and the
    first ten metrics of the ascending ranking are collected with
    ``get_top_bottom_ten``. The overlap between every pair of look-backs is
    printed in the order (1,2), (1,3), (1,4), (2,3), (2,4), (3,4).

    Args:
        filename: CSV file with at least the columns ``Ticker`` and
            ``Implosion_Date``.

    Returns:
        A list with one sorted list of shared metric names per pair of
        look-backs, in the same order as they are printed.
    """
    from itertools import combinations  # local import: only needed here

    df = pd.read_csv(filename, index_col=False)
    df['Implosion_Date'] = pd.to_datetime(df['Implosion_Date']).dt.date

    # Only the column names are needed to build the two SELECT lists.
    df_metrics = spark.sql("SELECT * FROM FF_ADVANCED_AF LIMIT 10").columns
    result_string = ', '.join('a.' + item for item in df_metrics)
    result_string2 = ', '.join('b.' + item for item in df_metrics)

    top10s = []
    for y in range(1, 5):
        # The column keeps the name 'Implosion_Prev4Years' for every look-back
        # because pct_change_df's SQL refers to it by that name.
        df['Implosion_Prev4Years'] = df['Implosion_Date'] - pd.DateOffset(years=y)
        new_df = pct_change_df(df, result_string, result_string2)
        top10, _ = get_top_bottom_ten(new_df)
        top10s.append(top10)

    intersections = []
    for first, second in combinations(range(len(top10s)), 2):
        shared = sorted(set(top10s[first]).intersection(top10s[second]))
        print(shared)
        intersections.append(shared)
    return intersections

#get_metric_changes('imploded_tickers_dates.csv')




In [None]:
def consistent_changes(ticker, big_string):
    """Correlate FF_BASIC_QF metrics with the closing price for one ticker.

    Args:
        ticker: Ticker symbol without the region suffix; "-US" is appended
            when filtering ``sym_ticker_region.ticker_region``.
        big_string: Comma-separated SELECT list of additional columns
            (prefixed with ``a.``) to correlate against the price.

    Returns:
        The 'FF_PRICE_CLOSE_FP' column of the correlation matrix of all
        non-object columns, as a pandas-on-Spark Series. It is also printed.
    """
    window_start = pd.to_datetime("2009-01-01")
    query1 = f"""
                SELECT FF_PRICE_CLOSE_FP, {big_string}
                FROM FF_BASIC_QF a
                LEFT JOIN sym_ticker_region d ON d.fsym_id = a.fsym_id 
                WHERE d.ticker_region = "{ticker}-US" 
                AND a.date >= "{window_start}"
                ORDER BY a.date
            """
    quarterly = ps.DataFrame(spark.sql(query1))

    # Restrict to non-object columns before correlating. No null-count
    # filtering is applied here.
    numeric_columns = quarterly.select_dtypes(exclude=['object']).columns
    correlations = quarterly[numeric_columns].corr()['FF_PRICE_CLOSE_FP']
    print(correlations)
    return correlations

def corr_analysis(filename):
    """Run ``consistent_changes`` for every distinct ticker listed in a CSV file.

    Args:
        filename: CSV file with at least a ``Ticker`` column.

    Returns:
        A list with one price-correlation Series per distinct ticker, in the
        order the tickers first appear in the file. The list is also printed.
    """
    df = pd.read_csv(filename, index_col=False)

    # Only the first ten column names of FF_BASIC_QF are used for the SELECT list.
    df_metrics = spark.sql("SELECT * FROM FF_BASIC_QF LIMIT 10").columns[:10]
    result_string = ', '.join('a.' + item for item in df_metrics)
    print(len(df_metrics))

    corr_matrices = [
        consistent_changes(t, result_string)
        for t in df['Ticker'].unique().tolist()
    ]
    print(corr_matrices)
    return corr_matrices

# Bind the result so the notebook does not render the returned list a second
# time and so it stays available to later cells.
corr_matrices = corr_analysis('imploded_tickers_dates.csv')

10
<class 'pyspark.pandas.series.Series'>
<class 'pyspark.pandas.series.Series'>
<class 'pyspark.pandas.series.Series'>
<class 'pyspark.pandas.series.Series'>
<class 'pyspark.pandas.series.Series'>
<class 'pyspark.pandas.series.Series'>
<class 'pyspark.pandas.series.Series'>
<class 'pyspark.pandas.series.Series'>
<class 'pyspark.pandas.series.Series'>
<class 'pyspark.pandas.series.Series'>


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [116]:
def get_top_bottom_five(df, n=10):
    """Print both ends of the ascending 'Value' ranking and the row count.

    Despite its name, the function has always shown ten rows per end; ``n``
    makes that explicit and adjustable while keeping ten as the default.

    Args:
        df: pandas DataFrame with a 'Value' column to rank by.
        n: Number of rows to show from each end of the ranking.

    Returns:
        None. Prints the ``n`` lowest rows, the ``n`` highest rows, and the
        number of rows left after dropping rows that contain NaN.
    """
    ranked = df.sort_values(by='Value').dropna()
    print(ranked.head(n))
    print(ranked.tail(n))
    print(len(ranked))


# Load the saved per-metric changes (presumably the file written by the
# currently commented-out to_csv call in pct_change_df - confirm) and show
# both ends of the ranking.
df = pd.read_csv('ChangesBeforeImplosionA4yrs.csv')
get_top_bottom_five(df)
# TODO: the worst changes are done; next, find out which metrics decrease consistently.
# TODO: also work out the means before and after the period using the quarterly data and compare the difference.

               Metric       Value
12      ff_rsrv_noneq -100.000000
21    ff_misc_net_oth  -98.798886
13     ff_adj_net_oth  -98.798886
19     ff_loan_chg_cf  -92.649567
17     ff_restate_ind  -84.210526
0     ff_oper_exp_oth  -77.852357
1       ff_eq_aff_inc  -52.947766
9   ff_price_close_fp  -43.754766
15  ff_div_pay_out_ps  -39.184313
2          ff_div_pfd  -36.114092
                   Metric       Value
8                  ff_dps  -22.186697
4            ff_prov_risk  -21.725654
5         ff_debt_lt_conv  -15.283030
11          ff_invest_aff  -11.906867
20             ff_dps_all   -5.756437
10      ff_fy_length_days   -0.055046
18                 ff_fyr    0.195667
7               ff_div_cf   62.467390
6   ff_sale_assets_bus_cf  131.847897
3        ff_bk_invest_tot  211.896266
22


In [None]:
# NOTE(review): in the visible cells `metric_df` is only assigned as a local
# inside pct_change_df, so this cell raises NameError on a fresh kernel unless
# the name is also defined at top level elsewhere - confirm.
metric_df

In [None]:
# NOTE(review): in the visible cells `metric_dict` is only assigned as a local
# inside pct_change_df, so this cell raises NameError on a fresh kernel unless
# the name is also defined at top level elsewhere - confirm.
metric_dict