In [1]:
from pyspark.ml.feature import (RegexTokenizer, Tokenizer, HashingTF, IDF,
                                StopWordsRemover, CountVectorizer, StopWordsRemover, StringIndexer, OneHotEncoder)
from pyspark.ml.evaluation import (BinaryClassificationEvaluator,
                                  MulticlassClassificationEvaluator)
from pyspark.sql.types import (LongType ,StringType, IntegerType,
                               FloatType, DoubleType, ArrayType)
from pyspark.sql.functions import col, udf, avg
from pyspark.sql.types import IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from sklearn.model_selection import train_test_split
from pyspark.ml.clustering import LDA
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import countDistinct
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import HiveContext
from pyspark.sql import functions as F


## Load DOIs and Datasets

In [3]:
from pyspark.ml.feature import StringIndexer
from sklearn.model_selection import train_test_split

# DOI -> category mapping for the 4-category data set.
df_doiCateg = spark.sql("SELECT * FROM taxiarchis.doi_categories_4")

# Encode the string "Categories" column as a numeric "label" column.
indexer = StringIndexer(inputCol="Categories", outputCol="label")
indexed = indexer.fit(df_doiCateg).transform(df_doiCateg)

# Pull the indexed mapping to the driver as a pandas frame so scikit-learn
# can split it.
df_cat = indexed.toPandas()

# Hold out 30% of the DOIs for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_cat['DOI'],
    df_cat['label'],
    test_size=0.3,
    random_state=42,
)



In [4]:
# Per-section corpus; rows containing a null in any column are discarded.
# (For the full-document experiments read taxiarchis.fulldocument_4 instead.)
df = spark.sql("SELECT * FROM taxiarchis.persection_4_1").dropna()

df.count()


## TF-IDF

In [6]:
def tfidf_fullDocument(df):
    '''
    Represent the ``Full_Document`` text of every row as a TF-IDF vector.

    The input frame must provide the columns ``Categories``, ``DOI`` and
    ``Full_Document``. ``Categories`` is indexed into ``label`` and ``DOI``
    into ``uni_id``; the text is tokenised with the ``\\W`` pattern,
    stop words are removed, terms are counted and the counts are IDF-weighted.

    Returns a dataframe with the columns
    ``Categories``, ``uni_id``, ``DOI``, ``label`` and ``features``.
    '''
    stages = [
        # label / id indexing
        StringIndexer(inputCol='Categories', outputCol='label'),
        StringIndexer(inputCol='DOI', outputCol='uni_id'),
        # text -> tokens -> term counts -> tf-idf
        RegexTokenizer(inputCol="Full_Document", outputCol="tokens", pattern="\\W"),
        StopWordsRemover(inputCol='tokens', outputCol='filtered'),
        CountVectorizer(inputCol='filtered', outputCol='count_vec'),
        IDF(inputCol='count_vec', outputCol='features'),
    ]

    fitted = Pipeline(stages=stages).fit(df)
    wanted = ['Categories', 'uni_id', 'DOI', 'label', 'features']
    return fitted.transform(df).select(wanted)


# final_data = tfidf_fullDocument(df)
# final_data.count()

In [7]:
def prepare_tfidf_section(df):
    '''
    Represent the ``Section`` text of every row as a TF-IDF vector.

    The input frame must provide the columns ``Category``, ``doc_id``,
    ``DOI`` and ``Section``. ``Category`` is indexed into ``label`` and
    ``doc_id`` into ``uni_sec_id``. Terms need a document frequency of at
    least 2 to receive an IDF weight (``minDocFreq=2``).

    Returns a cached dataframe with the columns
    ``doc_id``, ``uni_sec_id``, ``DOI``, ``label`` and ``features``.
    '''
    indexing_stages = [
        StringIndexer(inputCol='Category', outputCol='label'),
        StringIndexer(inputCol='doc_id', outputCol='uni_sec_id'),
    ]
    # This variant reads the per-section text column; the full-document
    # variant uses a different input column.
    text_stages = [
        RegexTokenizer(inputCol="Section", outputCol="tokens", pattern="\\W"),
        StopWordsRemover(inputCol='tokens', outputCol='filtered'),
        CountVectorizer(inputCol='filtered', outputCol='count_vec'),
        IDF(inputCol='count_vec', outputCol='features', minDocFreq=2),
    ]

    fitted = Pipeline(stages=indexing_stages + text_stages).fit(df)
    wanted = ['doc_id', 'uni_sec_id', 'DOI', 'label', 'features']
    return fitted.transform(df).select(wanted).cache()


## LDA

In [9]:
def LDA_fullDocument(df, numTopics=30):
    '''
    Represent every full document by its LDA topic distribution.

    The input frame must provide the columns ``Categories``, ``DOI`` and
    ``Full_Document``. Term counts are built from the text, one row is kept
    per indexed ``DOI``, and an LDA model with ``numTopics`` topics
    (EM optimizer, 50 iterations) is fitted on those counts.

    Returns a cached dataframe with the columns ``DOI``, ``label`` and
    ``features`` (the topic distribution).
    '''
    stages = [
        StringIndexer(inputCol='Categories', outputCol='label'),
        # numeric id for every distinct DOI
        StringIndexer(inputCol='DOI', outputCol='id'),
        RegexTokenizer(inputCol="Full_Document", outputCol="tokens", pattern="\\W"),
        StopWordsRemover(inputCol='tokens', outputCol='filtered'),
        CountVectorizer(inputCol='filtered', outputCol='features'),
    ]
    counted = Pipeline(stages=stages).fit(df).transform(df).cache()

    # One row per document id, restricted to the columns used below.
    documents = (counted
                 .dropDuplicates(['id'])
                 .select(['DOI', 'id', 'features', 'Full_Document', 'Categories', 'label']))

    # Fit the topic model on the term-count vectors.
    topic_model = LDA(k=numTopics, maxIter=50, optimizer="em").fit(
        documents.select('id', 'features'))

    # Replace the term counts by the inferred topic distribution.
    with_topics = topic_model.transform(documents)
    return (with_topics
            .select(['DOI', 'label', 'topicDistribution'])
            .withColumnRenamed("topicDistribution", "features")
            .cache())

# Build LDA topic features with the default number of topics and count the rows.
# NOTE(review): `df` is loaded above from the per-section table
# (taxiarchis.persection_4_1) while LDA_fullDocument reads the columns
# "Full_Document" and "Categories" -- confirm that this table has them, or
# load taxiarchis.fulldocument_4 before running this cell.
final_data = LDA_fullDocument(df)
final_data.count()

In [10]:
# Show the LDA feature frame (DOI, label, features) using the notebook's display() helper.
display(final_data)

# LDA - Section Representation

In [12]:
def lda_representation_sections(df, numTopics=30):
    '''
    Represent every section document by its LDA topic distribution.

    The input frame must provide the columns ``Category``, ``doc_id``,
    ``DOI`` and ``Section``. Term counts are built from the section text,
    one row is kept per indexed ``doc_id``, and an LDA model with
    ``numTopics`` topics (EM optimizer, 50 iterations) is fitted on them.

    Returns a cached dataframe with the columns ``DOI``, ``label`` and
    ``features`` (the topic distribution).
    '''
    stages = [
        StringIndexer(inputCol='Category', outputCol='label'),
        # numeric id for every distinct section document
        StringIndexer(inputCol='doc_id', outputCol='id'),
        # This variant reads the per-section text column.
        RegexTokenizer(inputCol='Section', outputCol="tokens", pattern="\\W"),
        StopWordsRemover(inputCol='tokens', outputCol='filtered'),
        CountVectorizer(inputCol='filtered', outputCol='features'),
    ]
    counted = Pipeline(stages=stages).fit(df).transform(df).cache()

    # One row per section id.
    sections = counted.dropDuplicates(['id'])

    # Fit the topic model on the term-count vectors.
    topic_model = LDA(k=numTopics, maxIter=50, optimizer="em").fit(
        sections.select('id', 'features'))

    # Replace the term counts by the inferred topic distribution.
    with_topics = topic_model.transform(sections)
    return (with_topics
            .select(['DOI', 'label', 'topicDistribution'])
            .withColumnRenamed("topicDistribution", "features")
            .cache())


## Evaluation Measures

In [14]:
def evaluation_measures(new_df_p):
    '''
    Compute and print the multiclass evaluation measures of a prediction
    dataframe (columns ``label`` and ``prediction``).

    Returns the list ``[accuracy, f1, weightedRecall, weightedPrecision]``.
    '''
    # (message printed for the metric, evaluator keyword arguments)
    metric_specs = [
        ("Accuracy = %g", dict(labelCol="label", predictionCol="prediction",
                               metricName='accuracy')),
        ("f1 = %g", dict(metricName='f1')),
        ("weightedRecall = %g", dict(metricName='weightedRecall')),
        ("weightedPrecision = %g", dict(metricName="weightedPrecision")),
    ]

    ev_m = []
    for message, evaluator_kwargs in metric_specs:
        score = MulticlassClassificationEvaluator(**evaluator_kwargs).evaluate(new_df_p)
        print(message % score)
        ev_m.append(score)

    return ev_m

## Machine Learning Algorithms Hyper-parameters Tuned

In [16]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier,
                                      DecisionTreeClassifier)
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.classification import LogisticRegression

# Classifier instances shared by the experiment cells below.

# Multinomial Naive Bayes with Spark's default parameters.
nb = NaiveBayes()

# Tree-based models: a single decision tree and a random forest.
dtc = DecisionTreeClassifier(maxDepth = 12)
# (a depth of 15 was also tried for the decision tree)
# NOTE: `rfc` is re-created with other settings in the random-forest
# cross-validation cell further down.
rfc = RandomForestClassifier(numTrees = 200, maxDepth = 18)

# Linear SVM wrapped in One-vs-Rest so it can be used on multiclass labels.
# NOTE: `lsvc` is re-bound in the Linear SVM cross-validation cell; `ovr`
# keeps the instance created here.
lsvc = LinearSVC(maxIter=10, regParam=0.2)
ovr = OneVsRest(classifier=lsvc  )

# Logistic regression without regularisation.
lr = LogisticRegression(maxIter=15, regParam=0.0, elasticNetParam=0.0 )



## Split Training - Test sets

In [18]:
def split_train_test(final_data):
    '''
    Split a feature dataframe by DOI into training and test rows.

    Rows whose ``DOI`` is in the notebook-level ``X_train`` go to the
    training frame, rows whose ``DOI`` is in ``X_test`` go to the test frame.

    Returns the tuple ``(train, test)``.
    '''
    doi = final_data.DOI
    train_rows = final_data.filter(doi.isin(list(X_train)))
    test_rows = final_data.filter(doi.isin(list(X_test)))
    return (train_rows, test_rows)

# train, test = split_train_test(final_data)
# print(train.count())
# print(test.count())

# Majority Vote Probabilities

In [20]:
import numpy as np
import random 
from pyspark.sql.functions import udf, max
from pyspark.sql.types import LongType,StringType,IntegerType, FloatType,DoubleType
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


def find_maximum(test_results):
    '''
    Finds the maximum class probability of every row and
    prepares the dataframe for the Majority Vote.

    ``test_results`` must contain the columns ``DOI``, ``prediction`` and
    ``probability``. The largest entry of ``probability`` is stored in the
    new column ``WF_Var`` and rows containing nulls are dropped.

    Returns one row per ``DOI`` with ``list_categ`` (collected predictions)
    and ``prob_categ`` (collected maximum probabilities).
    '''

    # float(...) makes the UDF return a plain Python float: under Python 3
    # round() keeps the NumPy scalar type, which does not match the declared
    # FloatType return value of the UDF.
    udf_wf_var = udf(lambda x: float(round(np.max(np.array(x)), 10)),
                     returnType=FloatType())
    scored = test_results.withColumn('WF_Var', udf_wf_var('probability')).dropna()

    return scored.groupby("DOI").agg(
        F.collect_list('prediction').alias('list_categ'),
        F.collect_list('WF_Var').alias('prob_categ'))


def majority_vote_percent(pre,prob):
    '''
    Predict the majority vote weighted by the given
    probabilities of the classifier.

    ``pre`` holds the predicted classes and ``prob`` the probability that
    belongs to each prediction (same order). The probabilities are summed
    per class and the class with the largest sum wins; when several classes
    share the largest sum one of them is returned at random.
    '''
    weight_per_class = {}
    for prdi, prba in zip(pre, prob):
        weight_per_class[prdi] = weight_per_class.get(prdi, 0.0) + prba

    # list(...) is required on Python 3, where dict.values() is a view that
    # np.max cannot reduce. np.max is used instead of the builtin max because
    # the notebook imports pyspark.sql.functions.max, which shadows it.
    max_count = np.max(list(weight_per_class.values()))
    winners = [vote for vote, count in weight_per_class.items()
               if count == max_count]

    return random.choice(winners)
  
def max_probability_class(pre,prob):
    '''
    Returns the class with the maximum
    section probability.

    ``pre`` holds the predicted classes and ``prob`` the probability that
    belongs to each prediction (same order). On ties the first class wins.
    Returns 0 when no probability is greater than 0.0 (including empty input).
    '''
    max_pro = 0.0
    max_class = 0
    for prdi, prba in zip(pre, prob):
        # The comparison belongs inside the loop body.
        if prba > max_pro:
            max_pro = prba
            max_class = prdi

    return max_class
  
    
def majority_final_prediction(test_results):
    '''
    Produce one final prediction per DOI with the probability-weighted
    majority vote.

    The per-row predictions are grouped per DOI by ``find_maximum``,
    ``majority_vote_percent`` picks the winning class, and the result is
    inner-joined on ``DOI`` with the notebook-level ``indexed`` frame, which
    carries the true label. Duplicates and rows with nulls are removed.

    Returns the cached joined dataframe.
    '''
    votes_per_doi = find_maximum(test_results)

    # Spark UDF wrapping the weighted vote.
    weighted_vote = udf(majority_vote_percent, DoubleType())
    voted = votes_per_doi.withColumn(
        'prediction',
        weighted_vote(votes_per_doi.list_categ, votes_per_doi.prob_categ))

    # Attach the real categories.
    with_truth = voted.join(indexed, on=['DOI'], how='inner')
    return with_truth.dropDuplicates().dropna().cache()



# Simple Majority Vote

In [22]:
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql import functions as F
import numpy as np
import random 
from pyspark.sql.types import IntegerType,DoubleType
from pyspark.sql.functions import col


def prepare_rand_majority(df):
    '''
    Group the classifier output per document for the majority vote.

    Returns a dataframe with one row per ``DOI`` whose ``list_categ``
    column is the list of that document's ``prediction`` values.
    '''
    collected_votes = F.collect_list("prediction").alias('list_categ')
    return df.groupby("DOI").agg(collected_votes)


def majority_vote_random(votes):
    '''
    Predict the majority vote by
    returning randomly when we have more
    than one winner class.

    ``votes`` is an iterable of predicted classes; the class that occurs
    most often is returned.
    '''
    vote_counts = {}
    for vote in votes:
        vote_counts[vote] = vote_counts.get(vote, 0) + 1

    # list(...) is required on Python 3, where dict.values() is a view that
    # np.max cannot reduce. np.max is used instead of the builtin max because
    # the notebook imports pyspark.sql.functions.max, which shadows it.
    max_count = np.max(list(vote_counts.values()))
    winners = [vote for vote, count in vote_counts.items()
               if count == max_count]

    return random.choice(winners)

  
def final_prediction_maj_random(df):
    '''
    Produce one final prediction per DOI with a simple majority vote.

    The per-row predictions are grouped per DOI, ``majority_vote_random``
    picks the winning class, and the result is inner-joined on ``DOI`` with
    the notebook-level ``indexed`` frame, which carries the true label.
    Duplicates and rows with nulls are removed.

    Returns the cached joined dataframe.
    '''
    votes_per_doi = prepare_rand_majority(df)

    # Spark UDF wrapping the vote.
    pick_winner = udf(lambda z: majority_vote_random(z), DoubleType())
    voted = votes_per_doi.withColumn('prediction', pick_winner(col('list_categ')))

    with_truth = voted.join(indexed, on=['DOI'], how='inner')
    return with_truth.dropDuplicates().dropna().cache()



# Algorithms - Cross Validation - Hyper-parameter Tuning

# Multinomial Logistic Regression

In [25]:

# Model-selection criterion: multiclass accuracy.
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')

# Grid over the regularisation strength and the elastic-net mixing parameter
# (elasticNetParam = 0 is ridge). maxIter is not tuned here.
paramGrid_lr = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.0, 0.5, 0.1, 0.15, 0.2, 0.3])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 0.1, 0.15, 0.2, 0.3])
    .build()
)

# 3-fold cross-validation over the grid.
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid_lr,
    evaluator=acc_eval,
    numFolds=3,
)

In [26]:

# Report the hyper-parameters of the best logistic-regression model.
# NOTE(review): `paper_class` is not assigned in any visible cell; it is
# presumably the result of `cv.fit(train)` -- confirm and restore that call.
trainingSummary = paper_class.bestModel
# Single-argument print(...) runs on both Python 2 and Python 3 and prints
# the same text as the former print statements.
print('Best Param (regParam):  {}'.format(trainingSummary._java_obj.getRegParam()))
print('Best Param (MaxIter):  {}'.format(trainingSummary._java_obj.getMaxIter()))
print('Best Param (elasticNetParam):  {}'.format(trainingSummary._java_obj.getElasticNetParam()))

## Linear SVM cross validation

In [28]:
# Linear SVM whose hyper-parameters are tuned below.
# NOTE(review): Spark's LinearSVC is documented as a binary classifier and
# the earlier cell wraps it in OneVsRest -- confirm that fitting it directly
# on the multi-category label works, or tune through OneVsRest instead.
lsvc = LinearSVC(maxIter=15, regParam=0.15)

acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')

# Grid over the regularisation strength and the number of iterations.
paramGrid_lsvc = (
    ParamGridBuilder()
    .addGrid(lsvc.regParam, [0.0, 0.5, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4])
    .addGrid(lsvc.maxIter, [10, 15, 20, 25])
    .build()
)

# 2-fold cross-validation over the grid.
cv_lsvc = CrossValidator(
    estimator=lsvc,
    estimatorParamMaps=paramGrid_lsvc,
    evaluator=acc_eval,
    numFolds=2,
)

# NOTE(review): `train` and `test` are only assigned in a later cell
# (split_train_test), so this cell depends on execution order.
paper_class_lsvc = cv_lsvc.fit(train)
test_results = paper_class_lsvc.transform(test)

# Per-document prediction by simple majority vote, then the metrics.
new_df = final_prediction_maj_random(test_results).cache()
pre_eval_ms = evaluation_measures(new_df)
print(pre_eval_ms)

In [29]:
# Report the hyper-parameters of the best Linear SVM model.
trainingSummary = paper_class_lsvc.bestModel
# Single-argument print(...) runs on both Python 2 and Python 3 and prints
# the same text as the former print statements.
print('Best Param (regParam):  {}'.format(trainingSummary._java_obj.getRegParam()))
print('Best Param (MaxIter):  {}'.format(trainingSummary._java_obj.getMaxIter()))


# Cross Validation For Decision Tree classifier

In [31]:
# Model-selection criterion: multiclass accuracy.
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')

# Grid over the maximum tree depth.
paramGrid_dtc = (
    ParamGridBuilder()
    .addGrid(dtc.maxDepth, [5, 10, 12, 14, 15, 20, 25])
    .build()
)

# 2-fold cross-validation over the grid.
cv_dtc = CrossValidator(
    estimator=dtc,
    estimatorParamMaps=paramGrid_dtc,
    evaluator=acc_eval,
    numFolds=2,
)

In [32]:
# Split the current feature frame into training and test rows by DOI.
train, test = split_train_test(final_data)
# Reduce both frames to 10 partitions and cache them, since the
# cross-validation cells below reuse them.
train = train.coalesce(10).cache()
test = test.coalesce(10).cache()

In [33]:
# Run the decision-tree cross-validation on the training rows and predict
# the test rows with the fitted CrossValidator model.
paper_class_dtc = cv_dtc.fit(train)
test_results = paper_class_dtc.transform(test)
# Per-document prediction with the probability-weighted majority vote.
new_df = majority_final_prediction(test_results).cache()
# Alternative: simple (unweighted) majority vote.
# new_df = final_prediction_maj_random(test_results).cache() 
# Accuracy, F1, weighted recall and weighted precision of the final predictions.
pre_eval_ms= evaluation_measures(new_df)
print(pre_eval_ms)

In [34]:
# Report the tree depth of the best decision-tree model.
trainingSummary = paper_class_dtc.bestModel
# Single-argument print(...) runs on both Python 2 and Python 3 and prints
# the same text as the former print statement.
print('Best Param (maxDepth):  {}'.format(trainingSummary._java_obj.getMaxDepth()))

# Cross Validation For Random Forest

In [36]:
# Model-selection criterion: multiclass accuracy.
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
rfc = RandomForestClassifier(numTrees=50, maxDepth=20)

# Only the number of trees is tuned; maxDepth stays at the value set above.
paramGrid_rfc = (
    ParamGridBuilder()
    .addGrid(rfc.numTrees, [200, 220, 240])
    .build()
)

# 2-fold cross-validation over the grid.
cv_rfc = CrossValidator(
    estimator=rfc,
    estimatorParamMaps=paramGrid_rfc,
    evaluator=acc_eval,
    numFolds=2,
)

paper_class_rfc = cv_rfc.fit(train)
test_results = paper_class_rfc.transform(test)

# Per-document prediction with the probability-weighted majority vote
# (use final_prediction_maj_random for the simple majority vote instead).
new_df = majority_final_prediction(test_results).cache()
pre_eval_ms = evaluation_measures(new_df)
myFormattedList = ['%.4f' % elem for elem in pre_eval_ms]
print(pre_eval_ms)


In [37]:
# Report the number of trees of the best random-forest model.
trainingSummary = paper_class_rfc.bestModel
# Single-argument print(...) runs on both Python 2 and Python 3 and prints
# the same text as the former print statement.
print('Best Param (numTrees):  {}'.format(trainingSummary._java_obj.getNumTrees()))

# LDA tuning - Number of topics

In [39]:
# Short classifier key -> name used as column in the accuracy table.
dict_algo = {'nb': 'Mult. Naive Bayes',
             'lr': 'Mult. Logistic Regression',
             'dtc': 'Decision Tree',
             'rfc': 'Random Forest',
             'ovr': 'One vs Rest Linear SVM'}

# One list per column; every topic setting appends one value to each list.
accuracy = {'LDA Topics': [],
            'Mult. Naive Bayes': [],
            'One vs Rest Linear SVM': [],
            'Mult. Logistic Regression': [],
            'Decision Tree': [],
            'Random Forest': []}

n_topics = [20, 30, 50, 70, 100]
final_l = []  # one `measures` dict per topic setting

# Classifiers and their keys, in matching order. The instances are whatever
# the names are bound to when this cell runs.
algorithms = [nb, dtc, rfc, lr, ovr]
names = ["nb", "dtc", 'rfc', 'lr', 'ovr']

for topic in n_topics:
    accuracy['LDA Topics'].append(topic)  # append with number of topics

    # Train LDA to produce the features.
    print('Train the LDA Algorithm.....')
    print('Train the LDA Algorithm for {} topics'.format(topic))
    final_data = lda_representation_sections(df, numTopics=topic)
    print('LDA Algorithm Trained')

    # Split train / test set.
    train, test = split_train_test(final_data)
    train = train.cache()
    test = test.cache()

    measures = {'Measures': ['Accuracy', 'F1-Score', 'Recall', 'Precision']}

    for algo, n in zip(algorithms, names):
        print(dict_algo[n])
        paper_class_2 = algo.fit(train)
        test_results = paper_class_2.transform(test)
        # Simple majority vote per document
        # (majority_final_prediction is the probability-weighted alternative).
        new_df = final_prediction_maj_random(test_results).cache()
        pre_eval_ms = evaluation_measures(new_df)
        myFormattedList = ['%.4f' % elem for elem in pre_eval_ms]
        measures[n] = myFormattedList
        print(pre_eval_ms)

        # Record the accuracy of THIS classifier. This must happen inside the
        # inner loop so that every column receives one value per topic
        # setting; otherwise only the last classifier would be recorded.
        accuracy[dict_algo[n]].append(myFormattedList[0])

    # Keep the full set of measures for this topic setting.
    final_l.append(measures)

# All together- make predictions - Experiments

In [41]:
# pandas is not imported anywhere else in the notebook, so import it here.
import pandas as pd

# One row per LDA topic setting, one column per classifier.
df_plot = pd.DataFrame(accuracy)
df_plot.set_index('LDA Topics')


## Visualize

In [None]:
# Accuracy per number of LDA topics, one line per classifier.
# The accuracies were stored as formatted strings, so they are cast to float
# before plotting; the cast is a no-op for values that are already numeric.
ax = (df_plot.set_index('LDA Topics')
      .astype(float)
      .plot(figsize=(10,6), grid=True, style='.-', fontsize=12))
ax.set_ylabel("Accuracy",fontsize=14)
ax.set_xlabel("LDA Topics",fontsize=14)
ax.legend(loc='right')
display(ax.figure)