In [1]:
from pyspark.ml.feature import (RegexTokenizer, Tokenizer, HashingTF, IDF,
                                StopWordsRemover, CountVectorizer, StopWordsRemover, StringIndexer, OneHotEncoder)
from pyspark.ml.evaluation import (BinaryClassificationEvaluator,
                                  MulticlassClassificationEvaluator)
from pyspark.sql.types import (LongType ,StringType, IntegerType,
                               FloatType, DoubleType, ArrayType)
from pyspark.sql.functions import col, udf, avg
from pyspark.sql.types import IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from sklearn.model_selection import train_test_split
from pyspark.ml.clustering import LDA
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import countDistinct
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import HiveContext
from pyspark.sql import functions as F



This notebook consists of PySpark code. It must first be uploaded to Databricks, and all the required tables must already have been created. To create the required tables from the CSV files stored in the S3 bucket, use the **Create_appropriate_tables.ipynb** notebook in Databricks. The path of the CSV files in the Elsevier S3 bucket is **"---------------------------"** (the path has been removed for confidentiality reasons).

## Load DOIs and Datasets

In [3]:
from pyspark.ml.feature import StringIndexer
from sklearn.model_selection import train_test_split

# Load the DOI -> category mapping for the 4 categories and prepare a
# StringIndexer that encodes the "Categories" column as a numeric "label".
df_doiCateg = spark.sql("SELECT * FROM taxiarchis.doi_categories_4")
indexer = StringIndexer(inputCol="Categories", outputCol="label")

# FOR THE ENSEMBLE EXPERIMENT ONLY: use this table/column pair instead of the
# two statements above.
# df_doiCateg = spark.sql("SELECT * FROM taxiarchis.doi_categories_4_4_features")
# indexer = StringIndexer(inputCol="Category", outputCol="label")


# `indexed` (mapping + "label" column) is reused later by the majority-vote
# functions, which join their per-DOI predictions against it.
indexed = indexer.fit(df_doiCateg).transform(df_doiCateg)

# Bring the mapping to the driver as a pandas DataFrame so scikit-learn can split it.
df_cat = indexed.toPandas()

# Split the DOIs (not the individual rows of the text tables) 70% / 30% with a
# fixed random_state=42. X_train / X_test are read as globals by split_train_test().
X_train, X_test, y_train, y_test = train_test_split(df_cat['DOI'],df_cat['label'] , test_size=0.3, random_state=42)


# Report the number of training and test DOIs.
print(len(X_train))
print(len(X_test))


In [4]:
# Exactly one of the three loaders below should be active; comment out the
# other two. All of them bind the result to the global `df`.

# FULL DOCUMENT ----- uncomment to run the full-document experiments

# df = spark.sql("SELECT * FROM taxiarchis.fulldocument_4")
# df = df.dropna()
# df.count()


# PER SECTION DOCUMENTS ----- currently active: one row per section

df = spark.sql("SELECT * FROM taxiarchis.persection_4_1")
# Drop rows that contain a null in any column.
df = df.dropna()
# Number of remaining rows (shown as the cell output).
df.count()

# FOR ENSEMBLE ----- uncomment to run the ensemble experiment

# df = spark.sql("SELECT * FROM taxiarchis.section_4_4_ensemble").dropna()
# df = df.dropna()
# df.count()


In [5]:
# Preview the DataFrame loaded in the previous cell.
display(df)

## Term frequency - inverse document frequency (TF-IDF)

In [7]:
def tfidf_fullDocument(df):
    '''
    Build a TF-IDF representation of full documents.

    `df` must provide the columns "Full_Document" (text), "Categories"
    and "DOI". The text is tokenized on non-word characters, stop words
    are removed, terms are counted and the counts are IDF-weighted.

    Returns a dataframe with the columns
    Categories, uni_id, DOI, label and features.
    '''
    # Encoders: category -> numeric label, DOI -> numeric document id
    encoder_stages = [
        StringIndexer(inputCol='Categories', outputCol='label'),
        StringIndexer(inputCol='DOI', outputCol='uni_id'),
    ]

    # Text stages: tokenize -> drop stop words -> term counts (TF) -> IDF weighting
    text_stages = [
        RegexTokenizer(inputCol="Full_Document", outputCol="tokens", pattern="\\W"),
        StopWordsRemover(inputCol='tokens', outputCol='filtered'),
        CountVectorizer(inputCol='filtered', outputCol='count_vec'),
        IDF(inputCol='count_vec', outputCol='features'),
    ]

    # Fit the whole pipeline on df and apply it to the same data
    fitted_pipeline = Pipeline(stages=encoder_stages + text_stages).fit(df)
    vectorized = fitted_pipeline.transform(df)

    return vectorized.select(['Categories', 'uni_id', 'DOI', 'label', 'features'])


# final_data = tfidf_fullDocument(df)
# final_data.count()

In [8]:
def prepare_tfidf_section(df):
    '''
    Build a TF-IDF representation of individual sections.

    `df` must provide the columns "Section" (text), "Category",
    "doc_id" and "DOI". Terms that occur in fewer than 2 rows are
    ignored by the IDF stage (minDocFreq=2).

    Returns a dataframe with the columns
    doc_id, uni_sec_id, DOI, label and features.
    '''
    # Encoders: category -> numeric label, section id -> numeric section id
    encoder_stages = [
        StringIndexer(inputCol='Category', outputCol='label'),
        StringIndexer(inputCol='doc_id', outputCol='uni_sec_id'),
    ]

    # Text stages operate on the "Section" column (the full-document variant
    # of this function reads "Full_Document" instead)
    text_stages = [
        RegexTokenizer(inputCol="Section", outputCol="tokens", pattern="\\W"),
        StopWordsRemover(inputCol='tokens', outputCol='filtered'),
        CountVectorizer(inputCol='filtered', outputCol='count_vec'),
        IDF(inputCol='count_vec', outputCol='features', minDocFreq=2),
    ]

    # Fit the whole pipeline on df and apply it to the same data
    fitted_pipeline = Pipeline(stages=encoder_stages + text_stages).fit(df)
    vectorized = fitted_pipeline.transform(df)

    return vectorized.select(['doc_id', 'uni_sec_id', 'DOI', 'label', 'features'])


## Latent Dirichlet allocation (LDA)

In [10]:
def LDA_fullDocument(df, numTopics=30):
    '''
    Represent full documents by their LDA topic distribution.

    `df` must provide the columns "Full_Document" (text), "Categories"
    and "DOI". An LDA model with `numTopics` topics is trained on the
    term counts of the de-duplicated documents (one row per DOI id).

    Returns a dataframe with the columns DOI, label and features,
    where features holds the topic distribution.
    '''
    # Encode label and document id, then turn the text into term counts
    stages = [
        StringIndexer(inputCol='Categories', outputCol='label'),
        StringIndexer(inputCol='DOI', outputCol='id'),  # one numeric id per unique DOI
        RegexTokenizer(inputCol="Full_Document", outputCol="tokens", pattern="\\W"),
        StopWordsRemover(inputCol='tokens', outputCol='filtered'),
        CountVectorizer(inputCol='filtered', outputCol='features'),
    ]
    counted = Pipeline(stages=stages).fit(df).transform(df)

    # Keep a single row per document id and only the columns needed below
    corpus = (counted
              .dropDuplicates(['id'])
              .select(['DOI', 'id', 'features', 'Full_Document', 'Categories', 'label']))

    # Train the topic model on (id, term counts)
    topic_model = LDA(k=numTopics, maxIter=50, optimizer="em").fit(corpus.select('id', 'features'))

    # Attach the per-document topic distribution and expose it as "features"
    with_topics = topic_model.transform(corpus)
    return (with_topics
            .select(['DOI', 'label', 'topicDistribution'])
            .withColumnRenamed("topicDistribution", "features"))

# final_data = LDA_fullDocument(df)
# final_data.count()

In [11]:
# `final_data` is only assigned by the experiment cells further down (the calls
# in the cells above are commented out), so on a fresh "Run All" it does not
# exist yet. Guard the preview instead of stopping the run with a NameError.
if 'final_data' in globals():
    display(final_data)

In [12]:
def lda_representation_sections(df, numTopics=2):
    '''
    Represent sections by their LDA topic distribution.

    `df` must provide the columns "Section" (text), "Category",
    "doc_id" and "DOI". An LDA model with `numTopics` topics is trained
    on the term counts of the de-duplicated sections (one row per id).

    Returns a dataframe with the columns DOI, label and features,
    where features holds the topic distribution.
    '''
    # Text stages operate on the "Section" column (the full-document variant
    # of this function reads "Full_Document" instead)
    text_stages = [
        RegexTokenizer(inputCol='Section', outputCol="tokens", pattern="\\W"),
        StopWordsRemover(inputCol='tokens', outputCol='filtered'),
        CountVectorizer(inputCol='filtered', outputCol='features'),
    ]

    # Encoders: category -> label, doc_id -> id (the id column is what is
    # passed to LDA together with the term counts)
    encoder_stages = [
        StringIndexer(inputCol='Category', outputCol='label'),
        StringIndexer(inputCol='doc_id', outputCol='id'),
    ]

    counted = Pipeline(stages=encoder_stages + text_stages).fit(df).transform(df)

    # Keep a single row per section id
    corpus = counted.dropDuplicates(['id'])

    # Train the topic model on (id, term counts)
    topic_model = LDA(k=numTopics, maxIter=50, optimizer="em").fit(corpus.select('id', 'features'))

    # Attach the per-section topic distribution and expose it as "features"
    with_topics = topic_model.transform(corpus)
    return (with_topics
            .select(['DOI', 'label', 'topicDistribution'])
            .withColumnRenamed("topicDistribution", "features"))


## Evaluation Measures

In [14]:
def evaluation_measures(new_df_p):
    '''
    Compute, print and return the accuracy of a prediction dataframe.

    Only accuracy is evaluated here; precision, recall and F-score are
    obtained separately through the micro(), macro() and weighted()
    helpers of this notebook.

    new_df_p: dataframe with the true class in column "label" and the
              predicted class in column "prediction".

    Returns the accuracy value produced by the evaluator.
    '''
    acc_eval = MulticlassClassificationEvaluator(labelCol="label",
                                                 predictionCol="prediction",
                                                 metricName='accuracy')
    accuracy = acc_eval.evaluate(new_df_p)
    print("Accuracy = %g" % accuracy)

    return accuracy

In [15]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

def micro(true, predict):
    '''
    Micro-averaged precision, recall and F-score, as percentages.

    true:    sequence of true labels.
    predict: sequence of predicted labels, aligned with `true`.

    Returns a tuple (precision, recall, fscore) in percent with two decimals.
    '''
    scores = precision_recall_fscore_support(true, predict, average='micro')

    # The fraction is rounded to 4 decimals as before; the percentage is rounded
    # again because multiplying by 100 can reintroduce float noise
    # (e.g. 0.29 * 100 == 28.999999999999996).
    return tuple(round(round(score, 4) * 100, 2) for score in scores[:3])
  
def macro(true, predict):
    '''
    Macro-averaged precision, recall and F-score, as percentages.

    true:    sequence of true labels.
    predict: sequence of predicted labels, aligned with `true`.

    Returns a tuple (precision, recall, fscore) in percent with two decimals.
    '''
    scores = precision_recall_fscore_support(true, predict, average='macro')

    # The fraction is rounded to 4 decimals as before; the percentage is rounded
    # again because multiplying by 100 can reintroduce float noise
    # (e.g. 0.29 * 100 == 28.999999999999996).
    return tuple(round(round(score, 4) * 100, 2) for score in scores[:3])

def weighted(true, predict):
    '''
    Weighted-average precision, recall and F-score, as percentages.

    true:    sequence of true labels.
    predict: sequence of predicted labels, aligned with `true`.

    Returns a tuple (precision, recall, fscore) in percent with two decimals.
    '''
    scores = precision_recall_fscore_support(true, predict, average='weighted')

    # The fraction is rounded to 4 decimals as before; the percentage is rounded
    # again because multiplying by 100 can reintroduce float noise
    # (e.g. 0.29 * 100 == 28.999999999999996).
    return tuple(round(round(score, 4) * 100, 2) for score in scores[:3])

## Machine Learning Algorithms  - with their hyper-parameters tuned

In [17]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier,
                                      DecisionTreeClassifier)
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.classification import LogisticRegression

# The estimators below are module-level objects; the experiment cells iterate
# over them and call .fit() on each one.

# Naive Bayes with default settings
nb = NaiveBayes()

# Decision Tree (single tree, depth limited to 12)
dtc = DecisionTreeClassifier(maxDepth = 12)

# Random Forest (200 trees, depth limited to 15)
rfc = RandomForestClassifier(numTrees = 200, maxDepth = 15)

# Linear SVM wrapped in One-vs-Rest so it can be used with more than two classes
lsvc = LinearSVC(maxIter=10, regParam=0.2)
ovr = OneVsRest(classifier=lsvc  )

# Logistic Regression without regularisation (regParam=0.0)
lr = LogisticRegression(maxIter=15, regParam=0.0, elasticNetParam=0.0 )



## Split Training - Test sets

In [19]:
def split_train_test(final_data):
    '''
    Partition a dataframe by DOI into a training and a test part.

    Membership is decided by the global DOI collections X_train and
    X_test (the 70% / 30% split computed when the DOIs were loaded), so
    every row of a given document ends up on the same side of the split.

    Returns the tuple (train, test).
    '''
    train_dois = list(X_train)
    test_dois = list(X_test)

    train_part = final_data.filter(final_data.DOI.isin(train_dois))
    test_part = final_data.filter(final_data.DOI.isin(test_dois))

    return (train_part, test_part)


## Probabilistic Majority Vote

In [21]:
import numpy as np
import random 
from pyspark.sql.functions import udf, max
from pyspark.sql.types import LongType,StringType,IntegerType, FloatType,DoubleType
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


def find_maximum(test_results):

    '''
    Prepare classifier output for the probabilistic majority vote.

    For every row the largest entry of the "probability" column is
    stored in a new column "WF_Var"; rows containing nulls are dropped.
    The rows are then grouped by DOI.

    test_results: dataframe with the columns "DOI", "prediction" and
                  "probability".

    Returns a dataframe with one row per DOI and the columns
    "list_categ" (collected predictions) and "prob_categ" (collected
    maximum probabilities).
    '''
    # Cast to a plain Python float: under Python 3 round() keeps the NumPy
    # scalar type, and Spark cannot serialize NumPy scalars returned by a UDF.
    udf_wf_var = udf(lambda x: float(round(np.max(np.array(x)), 10)), returnType=FloatType())
    df = test_results.withColumn('WF_Var', udf_wf_var('probability')).dropna()

    return df.groupby("DOI").agg(F.collect_list('prediction').alias('list_categ'),
                                 F.collect_list('WF_Var').alias('prob_categ'))


def majority_vote_percent(pre,prob):
    '''
    Probability-weighted majority vote.

    Every prediction in `pre` votes for its class with the weight given
    at the same position in `prob`; the class with the largest summed
    weight wins. Ties are broken by a random choice among the winners.

    pre:  sequence of predicted classes.
    prob: sequence of weights (probabilities), aligned with `pre`.

    Returns the winning class, or None when there are no votes.
    '''
    maj_vote = {}

    for prdi, prba in zip(pre, prob):
        if prdi in maj_vote:
            maj_vote[prdi] += prba
        else:
            maj_vote[prdi] = prba

    # Nothing to vote on
    if not maj_vote:
        return None

    # The dict view is turned into a list first: under Python 3, np.max() on a
    # dict view does not reduce over the values, which left `winners` empty.
    # np.max is kept (instead of the builtin) because this notebook imports
    # `max` from pyspark.sql.functions.
    max_count = np.max(list(maj_vote.values()))
    winners = [vote for vote, count in maj_vote.items() if count == max_count]

    return random.choice(winners)
  
def max_probability_class(pre,prob):
    '''
    Pick the class whose section has the highest probability.

    Walks `pre` and `prob` in parallel and keeps the class of the first
    entry with the strictly largest probability. When no probability is
    greater than 0.0 (or the inputs are empty) the result is 0.
    '''
    best_class = 0
    best_prob = 0.0

    for candidate, candidate_prob in zip(pre, prob):
        # Only a strictly larger probability replaces the current best
        if not candidate_prob > best_prob:
            continue
        best_prob, best_class = candidate_prob, candidate

    return best_class
  
    
def majority_final_prediction(test_results):
    '''
    Document-level prediction via the probabilistic majority vote.

    The section-level results are grouped per DOI with find_maximum(),
    majority_vote_percent() is applied to every group as a UDF, and the
    outcome is inner-joined with the global `indexed` dataframe on DOI.
    Duplicate rows and rows containing nulls are removed.
    '''
    # One row per DOI with the lists of predictions and maximum probabilities
    per_doi = find_maximum(test_results)

    # Wrap the weighted vote as a Spark UDF
    weighted_vote = udf(majority_vote_percent, DoubleType())

    voted = per_doi.withColumn(
        'prediction',
        weighted_vote(per_doi.list_categ, per_doi.prob_categ))

    # Attach the real categories
    with_truth = voted.join(indexed, on=['DOI'], how='inner')
    return with_truth.dropDuplicates().dropna()


## Simple Majority Vote

In [23]:
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql import functions as F
import numpy as np
import random 
from pyspark.sql.types import IntegerType,DoubleType
from pyspark.sql.functions import col


def prepare_rand_majority(df):
    '''
    Group classifier predictions per document.

    Returns a dataframe with one row per DOI whose column
    "list_categ" holds the list of values collected from the
    "prediction" column, ready for the simple majority vote.
    '''
    collected_votes = F.collect_list("prediction").alias('list_categ')
    per_doi = df.groupby("DOI")
    return per_doi.agg(collected_votes)


def majority_vote_random(votes):
    '''
    Simple (unweighted) majority vote.

    Counts how often every class occurs in `votes` and returns the most
    frequent one. Ties are broken by a random choice among the winners.

    votes: sequence of predicted classes.

    Returns the winning class, or None when `votes` is empty.
    '''
    vote_counts = {}

    for vote in votes:
        if vote in vote_counts:
            vote_counts[vote] += 1
        else:
            vote_counts[vote] = 1

    # Nothing to vote on
    if not vote_counts:
        return None

    # The dict view is turned into a list first: under Python 3, np.max() on a
    # dict view does not reduce over the values, which left `winners` empty.
    # np.max is kept (instead of the builtin) because this notebook imports
    # `max` from pyspark.sql.functions.
    max_count = np.max(list(vote_counts.values()))
    winners = [vote for vote, count in vote_counts.items() if count == max_count]

    return random.choice(winners)

  
def final_prediction_maj_random(df):
    '''
    Document-level prediction via the simple majority vote.

    The section-level predictions are grouped per DOI with
    prepare_rand_majority(), majority_vote_random() is applied to every
    group as a UDF, and the outcome is inner-joined with the global
    `indexed` dataframe on DOI. Duplicate rows and rows containing
    nulls are removed.
    '''
    per_doi = prepare_rand_majority(df)

    # Wrap the vote as a Spark UDF
    simple_vote = udf(lambda z: majority_vote_random(z), DoubleType())

    voted = per_doi.withColumn('prediction', simple_vote(col('list_categ')))

    # Attach the real categories
    with_truth = voted.join(indexed, on=['DOI'], how='inner')
    return with_truth.dropDuplicates().dropna()



# All together- make predictions - Experiments

## Full Document

In [26]:
# TF-IDF REPRESENTATION of the full documents, followed by the DOI-based
# train/test split.
# NOTE(review): tfidf_fullDocument() reads the columns "Full_Document" and
# "Categories"; the loading cell currently has the per-section query active,
# so the full-document query presumably has to be the uncommented one before
# this cell is run - confirm against the table schemas.
final_data =  tfidf_fullDocument(df)
train, test = split_train_test(final_data)

In [27]:
#LDA text REPRESENTATION----- UNCOMMENT AND COMMENT OUT TF-IDF IF YOU WANT TO USE LDA

# print('Train LDA Algorithm...')
# final_data = LDA_fullDocument(df, 50)
# print('LDA Algorithm has been trained.')
# train, test = split_train_test(final_data)


In [28]:
# Full-document experiment: train every classifier on `train`, predict on
# `test`, persist the predictions and collect evaluation scores and the
# confusion matrix per classifier.

# Short name -> display name of each classifier
dict_algo={'nb':'Multinomial Naive Bayes',
           'lr':'Multinomial Logistic Regression',
           'dtc':'Decision Tree Classifier',
           'rfc':'Random Forest Classifier',
           'ovr':'One vs Rest Linear SVM'}

# `algorithms` and `names` are parallel lists (same order)
algorithms = [nb,lr,ovr,dtc,rfc]
names = [ "nb", 'lr','ovr',"dtc",'rfc']
# Row labels of the score table; the order matches the order in which the
# scores are appended to `meas` below (accuracy, macro, micro, weighted)
measures = {'Measures':['Accuracy','Macro-Precision','Macro-Recall','Macro-FScore','Micro-Precision','Micro-Recall','Micro-FScore','Weighted-Precision','Weighted-Recall','Weighted-FScore']}


conf_matices = {}    # short name -> confusion matrix (pandas crosstab)
predict_values = {}  # not filled anywhere in this cell


for algo, n in zip(algorithms,names):

    print(dict_algo[n])
    # Train on the training part and predict the test part
    paper_class_2 = algo.fit(train)
    test_results = paper_class_2.transform(test)

    # Write a table in Databricks for each algorithm (overwrites an existing one).
    # NOTE(review): the table name says "lda" although the TF-IDF cell is the
    # active representation above - confirm the intended name.
    s = 'taxiarchis.'+n+'_lda_balancedfull'
    test_results.write.mode("overwrite").saveAsTable(s)

    # Bring labels and predictions to the driver for the scikit-learn metrics
    colle = test_results.select(['label', 'prediction']).collect()

    tru_value = [int(i.label) for i in colle]  # true labels as a list
    predict_value = [int(i.prediction) for i in colle]  # predictions as a list

    # Compute evaluation scores (all expressed in percent)
    meas = []
    meas.append(round(evaluation_measures(test_results),4)*100)
    meas.extend(macro(tru_value,predict_value))
    meas.extend(micro(tru_value,predict_value))
    meas.extend(weighted(tru_value,predict_value))

    measures[n] = meas
    print(meas)

    # Confusion matrix: rows = actual class, columns = predicted class
    y_actu = pd.Series(tru_value, name='Actual')
    y_pred = pd.Series(predict_value, name='Predicted')
    df_confusion = pd.crosstab(y_actu, y_pred)  #Confusion Matrix

    conf_matices[n] = df_confusion


In [29]:
import pandas as pd

# Score table: one row per measure, one column per classifier.
# It is built from `measures` (which holds the 'Measures' labels and one score
# list per classifier); the confusion matrices in `conf_matices` have no
# 'Measures' column, so set_index('Measures') cannot work on them.
pd_df_fullDocument = pd.DataFrame(measures).set_index('Measures')
# Fix the column order (the bare selection used to be evaluated and discarded)
pd_df_fullDocument = pd_df_fullDocument[['nb','lr','ovr','dtc','rfc']]


pd_df_fullDocument

# print (pd_df_fullDocument[['nb','lr','ovr','dtc','rfc']].to_latex())

In [30]:
# For every classifier: print its display name, then its confusion matrix as LaTeX
for i, algo_title in dict_algo.items():
    print(str(algo_title))
    print(conf_matices[i].to_latex())

## Section Based - Simple Majority Vote

In [32]:
# TF-IDF REPRESENTATION of the sections, followed by the DOI-based train/test
# split (all sections of one document land on the same side of the split).
final_data =  prepare_tfidf_section(df)
train, test = split_train_test(final_data)

In [33]:
#LDA REPRESENTATION ----- UNCOMMENT AND COMMENT OUT TF-IDF IF YOU WANT TO USE LDA

# print('Train LDA Algorithm...')
# final_data = lda_representation_sections(df, 50)
# print('LDA Algorithm has been trained.')

# train, test = split_train_test(final_data)

In [34]:
# Section-based experiment with the simple majority vote: train every
# classifier on the sections in `train`, predict the sections in `test`,
# reduce the section predictions to one prediction per DOI, persist the result
# and collect evaluation scores and the confusion matrix per classifier.

# Short name -> display name of each classifier
dict_algo={'nb':'Multinomial Naive Bayes',
           'lr':'Multinomial Logistic Regression',
           'dtc':'Decision Tree Classifier',
           'rfc':'Random Forest Classifier',
           'ovr':'One vs Rest Linear SVM'}

# `algorithms` and `names` are parallel lists (same order)
algorithms = [nb, lr, ovr, dtc,rfc]
names = [ "nb", 'lr','ovr', "dtc",'rfc']

# Row labels of the score table; the order matches the order in which the
# scores are appended to `meas` below (accuracy, macro, micro, weighted)
measures = {'Measures':['Accuracy','Macro-Precision','Macro-Recall','Macro-FScore','Micro-Precision','Micro-Recall','Micro-FScore','Weighted-Precision','Weighted-Recall','Weighted-FScore']}

conf_matices = {}    # short name -> confusion matrix (pandas crosstab)
predict_values = {}  # not filled anywhere in this cell


for algo, n in zip(algorithms,names):

    print(dict_algo[n])
    # Train on the training sections and predict the test sections
    paper_class_2 = algo.fit(train)
    test_results = paper_class_2.transform(test)
    new_df = final_prediction_maj_random(test_results)#Simple Majority Vote

    # Target table for the document-level predictions; use the second name
    # when the LDA representation is active
    s = 'taxiarchis.'+n+'_tfidf_simpleMajority'
    #   s = 'taxiarchis.'+n+'_lda_simpleMajority'
    new_df.write.mode("overwrite").saveAsTable(s) 


    # Bring labels and predictions to the driver for the scikit-learn metrics
    colle = new_df.select(['label', 'prediction']).collect()

    tru_value = [int(i.label) for i in colle]  # true labels as a list
    predict_value = [int(i.prediction) for i in colle]  # predictions as a list

    # Compute evaluation measures (all expressed in percent)
    meas = []
    meas.append(round(evaluation_measures(new_df),4)*100)
    meas.extend(macro(tru_value,predict_value))
    meas.extend(micro(tru_value,predict_value))
    meas.extend(weighted(tru_value,predict_value))

    measures[n] = meas
    print(meas)

    # Confusion matrix: rows = actual class, columns = predicted class
    y_actu = pd.Series(tru_value, name='Actual')
    y_pred = pd.Series(predict_value, name='Predicted')
    df_confusion = pd.crosstab(y_actu, y_pred)  #Confusion Matrix
    print(df_confusion)
    conf_matices[n] = df_confusion
  


In [35]:
import pandas as pd

# Score table: one row per measure, one column per classifier.
# It is built from `measures` (which holds the 'Measures' labels and one score
# list per classifier); the confusion matrices in `conf_matices` have no
# 'Measures' column, so set_index('Measures') cannot work on them.
pd_df_fullDocument = pd.DataFrame(measures).set_index('Measures')
# Fix the column order (the bare selection used to be evaluated and discarded)
pd_df_fullDocument = pd_df_fullDocument[['nb','lr','ovr','dtc','rfc']]


pd_df_fullDocument

# print (pd_df_fullDocument[['nb','lr','ovr','dtc','rfc']].to_latex())

In [36]:
# For every classifier: print its display name, then its confusion matrix as LaTeX
for i, algo_title in dict_algo.items():
    print(str(algo_title))
    print(conf_matices[i].to_latex())

## Section Based - Probabilistic Majority Vote

In [38]:
# TF-IDF REPRESENTATION of the sections, followed by the DOI-based train/test
# split. This repeats the preparation of the simple-majority experiment so the
# probabilistic experiment can be run on its own.
final_data =  prepare_tfidf_section(df)
train, test = split_train_test(final_data)

In [39]:
#LDA REPRESENTATION ----- UNCOMMENT AND COMMENT OUT TF-IDF IF YOU WANT TO USE LDA

# print('Train LDA Algorithm...')
# final_data = lda_representation_sections(df, 50)
# print('LDA Algorithm has been trained.')

# train, test = split_train_test(final_data)

In [40]:
# Section-based experiment with the probabilistic majority vote: train every
# classifier on the sections in `train`, predict the sections in `test`,
# reduce the section predictions to one prediction per DOI (weighted by the
# classifier's probabilities), persist the result and collect evaluation
# scores and the confusion matrix per classifier.

# Short name -> display name of each classifier
dict_algo={'nb':'Multinomial Naive Bayes',
           'lr':'Multinomial Logistic Regression',
           'dtc':'Decision Tree Classifier',
           'rfc':'Random Forest Classifier',
           'ovr':'One vs Rest Linear SVM'}

# `algorithms` and `names` are parallel lists (same order)
algorithms = [nb, lr, ovr, dtc,rfc]
names = [ "nb", 'lr','ovr', "dtc",'rfc']

# Row labels of the score table; the order matches the order in which the
# scores are appended to `meas` below (accuracy, macro, micro, weighted)
measures = {'Measures':['Accuracy','Macro-Precision','Macro-Recall','Macro-FScore','Micro-Precision','Micro-Recall','Micro-FScore','Weighted-Precision','Weighted-Recall','Weighted-FScore']}

conf_matices = {}    # short name -> confusion matrix (pandas crosstab)
predict_values = {}  # not filled anywhere in this cell


for algo, n in zip(algorithms,names):

    print(dict_algo[n])
    # Train on the training sections and predict the test sections
    paper_class_2 = algo.fit(train)
    test_results = paper_class_2.transform(test)
    # NOTE(review): find_maximum() reads a "probability" column from
    # test_results - confirm every classifier in `algorithms` produces one.
    new_df = majority_final_prediction(test_results) #Majority Vote Based on the probabilities given by the classifier

    # Target table for the document-level predictions; use the second name
    # when the LDA representation is active
    s = 'taxiarchis.'+n+'_tfidf_probabilisticMajority'
    #   s = 'taxiarchis.'+n+'_lda_probabilisticMajority'
    new_df.write.mode("overwrite").saveAsTable(s) 


    # Bring labels and predictions to the driver for the scikit-learn metrics
    colle = new_df.select(['label', 'prediction']).collect()

    tru_value = [int(i.label) for i in colle]  # true labels as a list
    predict_value = [int(i.prediction) for i in colle]  # predictions as a list

    # Compute evaluation measures (all expressed in percent)
    meas = []
    meas.append(round(evaluation_measures(new_df),4)*100)
    meas.extend(macro(tru_value,predict_value))
    meas.extend(micro(tru_value,predict_value))
    meas.extend(weighted(tru_value,predict_value))

    measures[n] = meas
    print(meas)

    # Confusion matrix: rows = actual class, columns = predicted class
    y_actu = pd.Series(tru_value, name='Actual')
    y_pred = pd.Series(predict_value, name='Predicted')
    df_confusion = pd.crosstab(y_actu, y_pred)  #Confusion Matrix
    print(df_confusion)
    conf_matices[n] = df_confusion
  

In [41]:
import pandas as pd

# Score table: one row per measure, one column per classifier.
# It is built from `measures` (which holds the 'Measures' labels and one score
# list per classifier); the confusion matrices in `conf_matices` have no
# 'Measures' column, so set_index('Measures') cannot work on them.
pd_df_fullDocument = pd.DataFrame(measures).set_index('Measures')
# Fix the column order (the bare selection used to be evaluated and discarded)
pd_df_fullDocument = pd_df_fullDocument[['nb','lr','ovr','dtc','rfc']]


pd_df_fullDocument

# print (pd_df_fullDocument[['nb','lr','ovr','dtc','rfc']].to_latex())

In [42]:
# For every classifier: print its display name, then its confusion matrix as LaTeX
for i, algo_title in dict_algo.items():
    print(str(algo_title))
    print(conf_matices[i].to_latex())