In [1]:
import findspark

findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col,isnan, when, count
import pyspark.sql.functions as F

from configparser import ConfigParser

config = ConfigParser()
# create your own config.ini in root of project folder to store project configurations
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed. Check that list so a missing config.ini fails with a
# clear message instead of a confusing NoSectionError on the lookup below.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "config.ini could not be read from the current working directory; "
        "create it with a [main] section that defines 'dirty_csv'"
    )

# Path stored under [main] / dirty_csv in config.ini.
pathfile = config.get('main', 'dirty_csv')

# Get (or reuse) the Spark session for this notebook, requesting 15g of driver memory.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "15g")
    .appName("SparkFlight")
    .getOrCreate()
)



In [2]:
# Load the preprocessed data set (about 15 million rows) into a Spark DataFrame.
# The header row supplies the column names and column types are inferred.
preproc_data = spark.read.csv(
    'preprocessed_data.csv',
    inferSchema='true',
    header='true',
    mode='PERMISSIVE',
    encoding='ISO-8859-1',
)

In [3]:
from math import factorial
from itertools import combinations, product

def gen_col_combs(col_names, max_lhs):
    """Build the column combinations to evaluate, grouped by left-hand-side size.

    Args:
        col_names: sequence of column names.
        max_lhs: largest left-hand-side size to generate (inclusive).

    Returns:
        Dict with one key 'lhs_<n>' for every n in 1..max_lhs. Each value is a
        list of tuples of column names:
          * 'lhs_1': every unordered pair of columns (2-combinations).
          * 'lhs_<n>', n > 1: every n-combination of columns followed by one
            additional column that is not part of that combination.
    """
    combs_per_size = {}
    for lhs_size in range(1, max_lhs + 1):
        key = f'lhs_{lhs_size}'
        if lhs_size == 1:
            # With a single lhs column A=>B and B=>A are treated as equal for the
            # contingency table, so plain combinations of 2 are enough.
            combs_per_size[key] = list(combinations(col_names, 2))
            continue
        # For larger lhs: pair every lhs combination with each possible rhs column,
        # skipping rhs columns that already occur in the lhs (trivial dependency).
        combs_per_size[key] = [
            lhs + (rhs,)
            for lhs in combinations(col_names, lhs_size)
            for rhs in col_names
            if rhs not in lhs
        ]
    return combs_per_size

def gen_contin_cells(x, lhs_size):
    """Emit one count-1 record per column combination for a single row.

    Reads the combinations for the given lhs size from the global broadcast
    variable `broadcast_col_names` (key 'lhs_<lhs_size>').

    Args:
        x: a data row; it is indexed with the column names of each combination
           (presumably a pyspark Row -- confirm against the caller).
        lhs_size: number of left-hand-side columns of the combinations to use.

    Returns:
        List of records shaped ((column names, values of x for those columns), 1).
    """
    column_combs = broadcast_col_names.value[f'lhs_{lhs_size}']
    return [
        ((names, tuple(x[name] for name in names)), 1)
        for names in column_combs
    ]

def calc_combinations(x):
    """Return the number of unordered pairs that can be drawn from x items.

    Uses the closed form x * (x - 1) / 2 for "x choose 2" rather than building
    factorial(x): row counts here can be in the millions, where evaluating
    factorials is prohibitively slow. The result equals the factorial formula.

    Args:
        x: item count (an integer).

    Returns:
        x * (x - 1) / 2 as a float when x >= 2, otherwise 0.
    """
    if x < 2:
        # fewer than two items cannot form a pair
        return 0
    return x * (x - 1) / 2

def map_value_to_combs_count(rdd_row):
    """Turn an occurrence count into a pair count and drop the rhs value from the key.

    Args:
        rdd_row: record shaped ((column names, values), count).

    Returns:
        ((column names, values without the last item), calc_combinations(count)).
    """
    key = rdd_row[0]
    occurrences = rdd_row[1]
    column_names = key[0]
    # The rhs value is always the last item; strip it so the key only depends
    # on the lhs value(s).
    lhs_values = key[1][:-1]
    return ((column_names, lhs_values), calc_combinations(occurrences))
        
def calc_percentage(x, y):
    """Return the smaller argument divided by the larger one, with 1 encoded as 1.5.

    The order of the two arguments does not matter. A ratio of exactly 1 is
    returned as the sentinel 1.5, so that it cannot be confused with a total
    pair count of 1 (pair counts are always whole numbers).
    """
    if x > y:
        ratio = y / x
    else:
        ratio = x / y
    if ratio == 1:
        return 1.5
    return ratio

def map_total_comb_to_zero_perc(rdd_row):
    """Convert a per-lhs-value record into (column names, percentage in [0, 1]).

    The incoming value is one of:
      * 1.5 -- the sentinel calc_percentage uses for a ratio of exactly 1 (100%).
        It is converted back to 1.0 here; otherwise the sentinel would leak into
        the mean computed downstream and inflate it beyond 1.
      * a whole-number total pair count (>= 1) that was never normalized,
        because no row with equal lhs + rhs pairs existed for this lhs value
        to reduce with. Such an lhs value supports the dependency for 0%.
      * an already normalized ratio below 1, which is passed through unchanged.

    Args:
        rdd_row: record shaped ((column names, lhs values), value).

    Returns:
        (column names, percentage); the lhs values are dropped from the key.
    """
    lhs_plus_rhs = rdd_row[0][0]
    value = rdd_row[1]
    if value == 1.5:
        # sentinel for 100%
        value = 1.0
    elif value >= 1:
        # un-normalized total pair count: no equal lhs + rhs pairs for this lhs
        value = 0
    return (lhs_plus_rhs, value)
    
def rem_rhs_value_from_key(rdd_row):
    """Drop the rhs value from a record's key, leaving the record's value untouched.

    Args:
        rdd_row: record shaped ((column names, values), value).

    Returns:
        ((column names, values without the last item), value).
    """
    key, payload = rdd_row[0], rdd_row[1]
    # The rhs value is always the last item of the values tuple.
    return ((key[0], key[1][:-1]), payload)
    
        

In [4]:
# # Draw a small sample for local testing only. A real run should use the whole preprocessed dataset.
# len_sample = 16_000
# len_preproc_data = 15_000_000
# cont_sample_data = preproc_data.sample(True, fraction=len_sample/len_preproc_data).cache()

In [None]:
max_lhs = 3
col_combs = gen_col_combs(preproc_data.columns, max_lhs)
# broadcast column combinations for efficient copying of immutable column combs list to nodes
broadcast_col_names = spark.sparkContext.broadcast(col_combs)
perc_threshold = 0.7
# Soft FDs found per lhs size ({lhs_size: [column tuple, ...]}), so the results of
# earlier iterations survive when soft_fd_comb_list is reassigned.
soft_fds_per_lhs_size = {}

for i in range(1, max_lhs+1):
    # For local testing only, swap preproc_data for the sampled data:
    #     flat_columns = cont_sample_data.rdd.flatMap(lambda x: gen_contin_cells(x, lhs_size=i)) # 1
    # create all column combs per row
    flat_columns = preproc_data.rdd.flatMap(lambda x: gen_contin_cells(x, lhs_size=i)) # 1
    # count occurrences per (columns, values); cached because it feeds both the
    # equal (lhs + rhs) branch and the total (lhs) branch below
    c_flat_columns = flat_columns.reduceByKey(lambda x,y: x+y).cache() # 2
    # unique lhs + rhs rows are not needed to calculate possible combinations of 2 rows with equal lhs.
    f_c_flat_columns = c_flat_columns.filter(lambda x: x[1] >= 2) # 3
    # map identical lhs + rhs occurrences to possible combs of 2 (this also strips the rhs value from the key)
    calc_combs = f_c_flat_columns.map(lambda x: map_value_to_combs_count(x)) # 4
    # sum the number of possible combinations of 2 equal lhs + rhs per lhs
    reduce_combs_by_lhs = calc_combs.reduceByKey(lambda x,y: x+y) # 5

    # make use of already cached reduced rdd from step #2
    # make sure to only reduce on lhs as we want total 2 comb count for lhs
    row_c_for_lhs = c_flat_columns.map(lambda x: rem_rhs_value_from_key(x))
    red_c_for_lhs = row_c_for_lhs.reduceByKey(lambda x,y: x+y)
    # Filter out rows with unique lhs as they cannot match with another equal lhs row
    filt_red_c_for_lhs = red_c_for_lhs.filter(lambda x: x[1] >= 2)
    # calculate total number of 2 row combs per lhs.
    # The rhs value was already removed from the key by rem_rhs_value_from_key, so only
    # the value is mapped here; map_value_to_combs_count would strip one more item off
    # the lhs values and the keys would no longer match those of reduce_combs_by_lhs.
    map_total_combs_lhs = filt_red_c_for_lhs.mapValues(calc_combinations)

    # now union the per (lhs, rhs) comb count rdd and per (lhs) comb count rdd
    # (the total rdd must be the second operand; a union of the equal-count rdd with
    # itself would turn every ratio into x/x)
    total_and_eq_combs = reduce_combs_by_lhs.union(map_total_combs_lhs) # 8
    # calc percentage per lhs per FD
    # this could be bottleneck as now every key only has 2 rows so low chance of being able to combine locally?
    percentage_per_lhs = total_and_eq_combs.reduceByKey(lambda x,y: calc_percentage(x,y)) # 9

    mapped_percentages = percentage_per_lhs.map(lambda x: map_total_comb_to_zero_perc(x))
    # reduce by value (percentage) per lhs. Keep separate count to calculate the mean over the percentages of one FD combination (mapValues).
    means_percentages = mapped_percentages.mapValues(lambda value: (value, 1)) \
                                            .reduceByKey(lambda x,y: (x[0]+y[0], x[1]+y[1])) \
                                            .mapValues(lambda value: value[0]/value[1])

    filter_threshold_fds = means_percentages.filter(lambda x: x[1] >= perc_threshold)
    # map to only Soft FD comb tuple
    final_soft_fds = filter_threshold_fds.map(lambda x: x[0])
    # form: [("lhs column", "lhs column", "rhs column"), ....] depending on value of i in loop for amount of lhs columns
    soft_fd_comb_list = final_soft_fds.collect()
    soft_fds_per_lhs_size[i] = soft_fd_comb_list

    # the cached counts are specific to this lhs size; release them before the next iteration
    c_flat_columns.unpersist()
    
# for testing locally
#     if i == 1:
#         print(len(soft_fd_comb_list))
#         print(soft_fd_comb_list)
#         break
