In [1]:
from pyspark.ml.feature import (RegexTokenizer, Tokenizer, HashingTF, IDF,
                                StopWordsRemover, CountVectorizer, StopWordsRemover, StringIndexer, OneHotEncoder)
from pyspark.ml.evaluation import (BinaryClassificationEvaluator,
                                  MulticlassClassificationEvaluator)
from pyspark.sql.types import (LongType ,StringType, IntegerType,
                               FloatType, DoubleType, ArrayType)
from pyspark.sql.functions import col, udf, avg
from pyspark.sql.types import IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from sklearn.model_selection import train_test_split
from pyspark.ml.clustering import LDA
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import countDistinct
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import HiveContext
from pyspark.sql import functions as F

# Full Document

In [3]:
# Load the full-text documents table and discard rows containing null values.
df = spark.sql("SELECT * FROM taxiarchis.fulldocument_4").dropna()

# Cell output: number of documents that remain.
df.count()

# Sections for each document

In [5]:
# Load the per-section documents table and discard rows containing null values.
df1 = spark.sql("SELECT * FROM taxiarchis.persection_4_1").dropna()

# Cell output: number of section rows that remain.
df1.count()

# Oracle Predictions

In [7]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score


# Result table filled by the oracle evaluation; 'Classes' holds the row labels
# (the overall score first, then one row per class).
per_cla = {'Classes': ['all', '0', '1', '2', '3']}

# Class labels (as floats) and the short names of the evaluated classifiers.
cate = [float(class_id) for class_id in range(4)]
names = ['nb', 'lr', 'ovr', 'dtc', 'rfc']




def take_f1_scores(true, predict):
    '''
    Compute the weighted-average F1-score of the predictions.

    true:    sequence of true labels.
    predict: sequence of predicted labels, aligned with `true`.
    '''
    weighted_f1 = f1_score(true, predict, average="weighted")
    return weighted_f1
  



def oracle(label, prediction, list_predictions):
    '''
    Oracle (upper-bound) prediction for a single document.

    If at least one of the section predictions equals the true label, the
    document is counted as correctly classified; otherwise it keeps its
    original prediction.

    label:            true class of the document.
    prediction:       class predicted for the document.
    list_predictions: collection of the classes predicted for the document's
                      sections; None is treated as "no section predictions".

    Returns the selected class as an int.
    '''
    # A null column value reaches a Python UDF as None, and `x in None`
    # raises TypeError, so guard before the membership test.
    if list_predictions is not None and label in list_predictions:
        return int(label)
    return int(prediction)

def oracle_predictions(method_combination):
    '''
    Explore the possibilities of the section-based methodology by
    specifying the upper bound of this approach.

    For every algorithm in `names`, the saved table
    `taxiarchis.<algo><method_combination>` is loaded, each row gets an
    oracle prediction (see `oracle`), and the weighted F1-score is computed
    over all rows and then over the rows of each true class in `cate`.

    method_combination: table-name suffix, e.g. '_tfidf_simplemajority'.

    Returns the module-level `per_cla` dict with one added entry per
    algorithm: a list of F1 percentages ordered as the 'Classes' rows
    (all, then one value per class).
    '''
    # The UDF wrapper does not depend on the algorithm, so build it once.
    oracle_prediction = udf(oracle, IntegerType())

    def f1_percentage(rows):
        # Weighted F1 of the oracle predictions in `rows`, as a percentage.
        true_values = [int(row.label) for row in rows]
        predicted_values = [int(row.orc_predict) for row in rows]
        # Scale before rounding so the result keeps two clean decimals
        # instead of float noise such as 85.32000000000001.
        return round(take_f1_scores(true_values, predicted_values) * 100, 2)

    for algo in names:
        print(str(algo))

        # SQL query for retrieving the saved predictions of this algorithm (Databricks).
        query = 'SELECT * FROM taxiarchis.' + algo + method_combination
        test_results = spark.sql(query)

        oracle_df = test_results.withColumn(
            'orc_predict',
            oracle_prediction(col("label"), col("prediction"), col("list_categ")))

        # Collect once and reuse the rows for the per-class scores, so the
        # UDF is evaluated by a single Spark job per algorithm.
        rows = oracle_df.select(['label', 'orc_predict']).collect()

        overall = f1_percentage(rows)
        print(overall)
        pred = [overall]

        # Score restricted to the documents whose true label is each class.
        for category in cate:
            print('Category {} ......'.format(category))
            class_rows = [row for row in rows if row.label == category]
            pred.append(f1_percentage(class_rows))

        per_cla[algo] = pred

    return per_cla



In [8]:
# Compute the oracle upper bound for every algorithm.
classes_oracle = oracle_predictions('_tfidf_simplemajority')
# classes_oracle = oracle_predictions('_lda_simplemajority') #UNCOMMENT IF YOU WANT TO FIND THE ORACLE UPPER BOUND FOR LDA MODELLING

# Cell output: one row per entry of 'Classes', one column per algorithm.
pd.DataFrame(classes_oracle).set_index('Classes')

## TP-FP Analysis Based on the Average document length and the Average number of sections

In [10]:
from pyspark.sql.functions import count

# Number of sections per document: count the `Section` values for each DOI.
doc_numSect = (
    df1.groupby('DOI')
       .agg(count(df1.Section).alias('number_sections'))
)

# Pre-processing stages for the full documents:
#   Categories    -> label    (StringIndexer)
#   Full_Document -> tokens   (RegexTokenizer with pattern "\\W")
#   tokens        -> filtered (StopWordsRemover)
int_category = StringIndexer(
    inputCol='Categories',
    outputCol='label',
)
regexTokenizer = RegexTokenizer(
    inputCol="Full_Document",
    outputCol="tokens",
    pattern="\\W",
)
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered',
)

# Fit the pipeline on the full documents and apply it to the same frame.
prep_pipeline = Pipeline(stages=[int_category, regexTokenizer, remover])
pre_processing = prep_pipeline.fit(df)
pre_processed_data = pre_processing.transform(df)


# Plain Python function that is wrapped as a Spark UDF
def document_length(doc_vec):
    '''Return the number of elements in `doc_vec` as an int.'''
    n_items = len(doc_vec)
    return int(n_items)


# Spark UDF around `document_length`, returning an integer column.
length_f = udf(document_length, IntegerType())

# Document length = number of entries in the `filtered` token column.
final_prediction = pre_processed_data.withColumn(
    'Document_length',
    length_f(col('filtered')),
)
doc_length = final_prediction.select('DOI', 'label', 'Document_length')

# Combine both per-document features (length and number of sections) on DOI,
# then keep only the DOI and the two feature columns.
final_features = (
    doc_length
    .join(doc_numSect, on='DOI', how='inner')
    .dropDuplicates()
    .dropna()
    .select('DOI', 'Document_length', 'number_sections')
)
display(final_features)

## Analysis TP - FP

In [12]:
from pyspark.sql.functions import mean,avg

# Class labels as floats.
cate = [float(class_id) for class_id in range(4)]

# Row labels of the TP/FP table: for each class, the average length and the
# average number of sections of the true positives, then of the false positives.
d = {
    "Features": [
        '{} Avg {} {}'.format(outcome, feature, class_id)
        for class_id in range(4)
        for outcome in ('TP', 'FP')
        for feature in ('Length', 'Sections')
    ]
}

# Short names of the classifiers to analyse.
names = ['nb', 'lr', 'ovr', 'dtc', 'rfc']
# names = [ "nb", 'lr',"dtc",'rfc']

def tp_fp_analysis(method_combination):
    '''
    Analysis of true positives and false positives based on the average
    document length and the average number of sections.

    For every algorithm in `names`, the saved table
    `taxiarchis.<alg><method_combination>` is loaded. For each class in
    `cate`, the true positives (label == class and prediction == class) and
    the false positives (prediction == class and label != class) are joined
    with `final_features` on DOI, and the averages of `Document_length` and
    `number_sections` are taken.

    method_combination: table-name suffix, e.g. '_tfidf_full'.

    Returns the module-level `d` dict with one added entry per algorithm:
    a list ordered like d["Features"] (TP length, TP sections, FP length,
    FP sections for each class). An average over an empty subset is NaN;
    other averages are truncated to int.
    '''
    def _int_or_nan(average):
        # avg() over zero rows yields None; report that as NaN rather than
        # hiding every possible error behind a bare `except`.
        if average is None:
            return float('nan')
        return int(average)

    for alg in names:
        print(str(alg))
        feat = []

        # SQL query for retrieving the saved predictions in Databricks.
        s = 'SELECT * FROM taxiarchis.' + alg + method_combination
        test_results = spark.sql(s)
        test = test_results.select(['DOI', 'label', 'prediction'])

        for i in cate:
            print('Category {} ......'.format(i))

            TP = test.filter((test.label == i) & (test.prediction == i))
            FP = test.filter((test.prediction == i) & (test.label != i))

            # True positives first, then false positives, to match d["Features"].
            for subset in (TP, FP):
                matched = (final_features
                           .join(subset.select('DOI'), on=['DOI'], how='inner')
                           .dropDuplicates()
                           .dropna())

                # Both averages in one Spark job; a global aggregate always
                # returns exactly one row.
                averages = matched.select(
                    avg(col('Document_length')),
                    avg(col('number_sections')),
                ).collect()[0]

                feat.append(_int_or_nan(averages[0]))  # average text length
                feat.append(_int_or_nan(averages[1]))  # average number of sections

        # Store the results of this algorithm.
        d[alg] = feat

    # Return only after every algorithm has been processed.
    return d

In [13]:
tp_fn = tp_fp_analysis('_tfidf_full')

#UNCOMMENT ANY OF THE FOLLOWING TO TEST THE REST OF THE METHODS AND TEXT MODELLINGS
# tp_fn = tp_fp_analysis('_lda_full')
# tp_fn = tp_fp_analysis('_tfidf_simplemajority')
# tp_fn = tp_fp_analysis('_lda_simplemajority')
# tp_fn = tp_fp_analysis('_tfidf_probabilisticmajority')
# tp_fn = tp_fp_analysis('_lda_probabilisticmajority')

# Cell output: the TP/FP table, which is the dict that carries the 'Features' column.
pd.DataFrame(tp_fn).set_index('Features')

## Common predicted documents (intersection) between the methods

In [15]:
import pandas as pd

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score



def find_common_docs(first_method, second_method, simpleMajority =True):
    '''
    Compare two methods by building, for each algorithm, a confusion matrix
    of the documents on which both methods agree.

    For every (true class, predicted class) pair, the rows of the first
    method with that pair are inner-joined on DOI with the rows of the
    second method with the same pair, and the matching rows are counted.

    first_method, second_method: table-name suffixes, e.g. '_tfidf_full'.
    simpleMajority: set False if one of the methods is the probabilistic
                    majority (the 'ovr' algorithm is then left out);
                    True by default.

    Returns a dict mapping each algorithm name to a pandas DataFrame indexed
    by true class ('Classes'), with one column per predicted class.
    '''
    comparison_matices = {}

    if simpleMajority:
        names = ["nb", 'lr', 'ovr', "dtc", 'rfc']
    else:
        names = ["nb", 'lr', "dtc", 'rfc']

    cate = [0.0, 1.0, 2.0, 3.0]

    for algo in names:
        print(str(algo))

        # Compared methods - SQL queries for retrieving the saved tables in Databricks.
        first_representation = 'SELECT * FROM taxiarchis.' + algo + first_method
        second_representation = 'SELECT * FROM taxiarchis.' + algo + second_method

        first_df = spark.sql(first_representation)
        second_df = spark.sql(second_representation)

        per_cla = {'Classes': ['0', '1', '2', '3']}

        for i in cate:  # predicted class: one column of the matrix
            print(i)
            temp = []
            for j in cate:  # true class: one row of the matrix
                first_filtered_df = first_df.filter(
                    (first_df.label == j) & (first_df.prediction == i))
                second_filtered_df = second_df.filter(
                    (second_df.label == j) & (second_df.prediction == i))

                # Intersection: documents present in both filtered frames,
                # matched on their DOI.
                compare_df = (first_filtered_df
                              .join(second_filtered_df.select(['DOI']), on=['DOI'], how='inner')
                              .dropDuplicates()
                              .dropna())
                temp.append(compare_df.count())

            per_cla[str(i)] = temp

        # Store the matrix of this algorithm.
        comparison_matices[algo] = pd.DataFrame(per_cla).set_index('Classes')

    return comparison_matices

In [16]:
union_confMatrix =  find_common_docs('_tfidf_full', '_tfidf_simplemajority')

#UNCOMMENT IF YOU WANT TO TEST MORE COMPARISONS

# union_confMatrix =  find_common_docs('_lda_full', '_lda_simplemajority') 
# union_confMatrix =  find_common_docs('_tfidf_full', '_tfidf_probabilisticmajority',False)
# union_confMatrix =  find_common_docs('_lda_full', '_lda_probabilisticmajority',False)
# union_confMatrix =  find_common_docs('_lda_simplemajority', '_lda_probabilisticmajority', False)                                     
# union_confMatrix =  find_common_docs('_tfidf_simplemajority', '_tfidf_probabilisticmajority',False)   


# Display names of the algorithms, keyed by their short names.
dict_algo={'nb':'Multinomial Naive Bayes',
           'lr':'Multinomial Logistic Regression',
           'dtc':'Decision Tree Classifier',
           'rfc':'Random Forest Classifier',
           'ovr':'One vs Rest Linear SVM'}

#Print all the matrices in latex format
for i  in  dict_algo.keys():
    # Comparisons run with simpleMajority=False contain no 'ovr' matrix.
    if i not in union_confMatrix:
        continue
    print(str(dict_algo[i]))
    print(union_confMatrix[i].to_latex())
