In [34]:
# EXTRACTION OF DATA FROM XML FILE
import glob
import os

import numpy
import numpy as np
from lxml import etree
import pandas as pd
from nltk.corpus import stopwords
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, davies_bouldin_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelBinarizer, LabelEncoder
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from gensim.models import word2vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn import preprocessing
from keras import Sequential


dataColumns = ["headline", "text", "bip:topics", "dc.date.published", "itemID", "XML_File_Name"]
clusterDataframeList = []
rows = []
paragraph = ""
bipTopicList = []
vec = CountVectorizer(stop_words=None)
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, max_df=0.5, use_idf=True)
enhancedDFList = []


def dataExtraction():
    dir = '/Users/amruthkuppili/Downloads/Sample/'
    for file in glob.iglob(os.path.join(dir, '*/*.xml')):
        paragraph = ""
        bipTopicCode = ""
        path, fileName = os.path.split(file)  # Obtained File name
        data = etree.parse(file)
        root = data.getroot()
        itemId = data.getroot().attrib.get("itemid")  # Obtained item ID
        headline = data.find("headline").text
        textNode = data.find("text")
        for node in textNode:
            paragraph = paragraph + node.text  # Obtained text
        dcPublishedNode = root.findall("./metadata/dc[@element='dc.date.published']")
        if dcPublishedNode is not None:
            published_date = dcPublishedNode[0].attrib.get("value")  # obtained dc.date.published
        else:
            published_date = "NONE"
        bipNode = root.findall("./metadata/codes[@class='bip:topics:1.0']/code")
        text = removeStopWords(paragraph)  # removing stop words
        if bipNode is not None:
            for innercodes in bipNode:
                bipTopicCode = innercodes.attrib.get("code")  # obtained bip:topic code
                rows.append({"itemID": itemId, "XML_File_Name": fileName, "headline": headline, "text": text,
                             "dc.date.published": published_date, "bip:topics": bipTopicCode})
                uniqueBipTopics(bipTopicCode)
                break
        else:
            bipTopicCode = "NONE"
            rows.append({"itemID": itemId, "XML_File_Name": fileName, "headline": headline, "text": text,
                         "dc.date.published": published_date, "bip:topics": bipTopicCode})

    customDataFrame = pd.DataFrame(rows, columns=dataColumns)
    return customDataFrame


def uniqueBipTopics(topic):
    if topic not in bipTopicList:
        bipTopicList.append(topic)
    return bipTopicList


def removeStopWords(text):
    stop_words = set(stopwords.words('english'))
    text_tokens = word_tokenize(text)
    filtered_sentence_list = [w for w in text_tokens if w not in stop_words]
    filtered_lemmatized_list = lemmatization(filtered_sentence_list)
    filtered_stemmed_list = stemming(filtered_lemmatized_list)
    filtered_lemmatized_sentence = ' '.join(filtered_stemmed_list)
    return filtered_lemmatized_sentence


def stemming(sentence):
    ps = PorterStemmer()
    stemmed_words = []
    for w in sentence:
        stemmed_words.append(ps.stem(w))
    return stemmed_words


def lemmatization(filtered_sentence):
    lem = WordNetLemmatizer()
    lemmatized_words = []
    for w in filtered_sentence:
        lemmatized_words.append(lem.lemmatize(w))
    return lemmatized_words
rawDataFrame = dataExtraction()

In [35]:
rawDataFrame.head()


Unnamed: 0,headline,text,bip:topics,dc.date.published,itemID,XML_File_Name
0,Care Group Inc Q4 shr loss vs profit.,( 000 's omit ) year end decemb 31 ( audit ) 1...,C15,1997-03-31,476242,476242newsML.xml
1,China economy to grow at more than 10 pct in 1...,"china see econom growth 10 percent year , infl...",E11,1997-03-31,476768,476768newsML.xml
2,Africans say global debt relief plan too rigid.,african financi leader criticis monday new int...,E51,1997-03-31,476074,476074newsML.xml
3,Argentine export grain prices - March 31.,"export offer/bid price u.s. dollar per tonn , ...",M14,1997-03-31,476597,476597newsML.xml
4,Yemen says kidnappers not harming German hosta...,four german tourist kidnap mountain tribesman ...,GCAT,1997-03-31,476845,476845newsML.xml


In [36]:
# PERFORMING CLUSTERING ON DOCUMENTS
import warnings
warnings.filterwarnings('ignore')
def clustering(clusterDataFrame):
    docVecList = []
    textData = clusterDataFrame["text"]
    bipTopics = clusterDataFrame["bip:topics"]

    # The goal of doc2vec is to create a numeric representation of a document
    tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(textData)]
    max_epochs = 20
    # vec_size is used to set dimension of the vector. below vec size indicates the representation
    # of document in 20 components
    vec_size = 20
    alpha = 0.025 #Learning rate
    model = Doc2Vec(size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm=1)
    #dm defines the training algorithm. If dm=1 means ‘distributed memory’ (PV-DM) and dm =0 means ‘distributed bag of words’
    # (PV-DBOW). Distributed Memory model preserves the word order in a document whereas Distributed Bag of words just uses the bag of
    # words approach, which doesn’t preserve any word order.
    model.build_vocab(tagged_data)
    for epoch in range(max_epochs):
        model.train(tagged_data,
                    total_examples=model.corpus_count,
                    epochs=model.iter)
        # decrease the learning rate
        model.alpha -= 0.0002
        # fix the learning rate, no decay
        model.min_alpha = model.alpha
    for i in range((len(tagged_data))):
        docVecList.append(model.docvecs[i])


    featureDataFrame = pd.DataFrame(data=docVecList)
    min_max_scaler = preprocessing.MinMaxScaler()
    scaledFeatureArray = min_max_scaler.fit_transform(featureDataFrame)
    scaledFeatureDataFrame = pd.DataFrame(data=scaledFeatureArray)
    km = KMeans(n_clusters=10) #TAKEN 10 CLUSTERS
    km.fit(scaledFeatureDataFrame)
    clusters = km.labels_
    scaledFeatureDataFrame['cluster_ID'] = clusters
    scaledFeatureDataFrame['labels'] = bipTopics
    # clusterQuality(clusters,featureData,featureDataFrame)
    # return FeatureDataFrame
    return scaledFeatureDataFrame,clusters,featureDataFrame

receivedClusterDataframe, receivedClusters, receivedFeatureData = clustering(rawDataFrame)



In [37]:
receivedClusterDataframe.head() # PRINING THE DATAFRAME AFTER CLUSTERING WITH cluster_id INCLUDED AS COLUMN

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,cluster_ID,labels
0,0.384479,0.54566,0.544152,0.606544,0.519817,0.432251,0.430371,0.369594,0.463116,0.589322,...,0.626902,0.520464,0.361071,0.555195,0.587612,0.581672,0.092237,0.234211,9,C15
1,0.553251,0.345231,0.626031,0.628948,0.430824,0.735184,0.547479,0.436808,0.487193,0.82558,...,0.693457,0.606522,0.24726,0.462505,0.497276,0.423199,0.338295,0.522921,3,E11
2,0.462176,0.434969,0.626863,0.565658,0.632039,0.600619,0.482731,0.537192,0.322891,0.665864,...,0.616268,0.508939,0.196305,0.592415,0.403272,0.473848,0.240483,0.582853,6,E51
3,0.355043,0.37001,0.691268,0.447398,0.359245,0.484694,0.596686,0.448343,0.385433,0.757725,...,0.584557,0.685953,0.316508,0.493998,0.577861,0.373372,0.323239,0.227906,0,M14
4,0.481164,0.427268,0.782236,0.713602,0.543709,0.779958,0.716761,0.445762,0.472132,0.691579,...,0.520577,0.689512,0.281714,0.250167,0.360557,0.611949,0.34534,0.57559,1,GCAT


In [38]:
#DIVIDING THE DATAFRAME ACCORDING TO  CLUSTERS
def clusterProcessing(receivedClusterDataframe):
    uniqueClusterIDs = receivedClusterDataframe.cluster_ID.unique()
    uniqueClusterIDs.sort()
    for id in uniqueClusterIDs:
        clusterWiseDF = receivedClusterDataframe.loc[receivedClusterDataframe['cluster_ID'] == id]
        clusterDataframeList.append(clusterWiseDF)
    return clusterDataframeList

clusterDataframeList = clusterProcessing(receivedClusterDataframe)
frameListforEnhance = clusterDataframeList[:]


In [39]:
clusterDataframeList # Cluster data is maintained in the respective index of the List

[             0         1         2         3         4         5         6  \
 3     0.355043  0.370010  0.691268  0.447398  0.359245  0.484694  0.596686   
 12    0.377760  0.326406  0.823562  0.564738  0.600728  0.596619  0.725821   
 13    0.577574  0.228895  0.475024  0.431306  0.414522  0.712279  0.723252   
 15    0.484375  0.551437  0.608820  0.598027  0.484617  0.688766  0.730665   
 16    0.280024  0.251981  0.534861  0.570321  0.489268  0.691137  0.561637   
 ...        ...       ...       ...       ...       ...       ...       ...   
 5078  0.512711  0.171011  0.630404  0.447109  0.599412  0.512521  0.669042   
 5088  0.421803  0.309460  0.691877  0.483705  0.378172  0.526841  0.465823   
 5099  0.282320  0.582900  0.257226  0.463995  0.386089  0.524770  0.989509   
 5111  0.339258  0.396760  0.720945  0.558241  0.446837  0.685397  0.653604   
 5112  0.367857  0.359108  0.683303  0.450575  0.417432  0.522953  0.605992   
 
              7         8         9  ...        12

In [40]:
def applyClassifier(receivedDFList):
    for df in receivedDFList:
        clusterNumber = df['cluster_ID'].unique()
        clusterNumber = clusterNumber[0]
        
        Xtr, Xte, Ytr, Yte, target = trainTestSplit(df)
        #
        if clusterNumber == 0:
            # Artificial Neural Network(ANN) uses the processing of the brain as a basis 
            # to develop algorithms that can be used to model complex patterns and prediction problems.
            print("Neural Networks")
            print("========================================")
            neuralNetwork_model = MLPClassifier(solver='lbfgs', alpha=1e-5,
                                                hidden_layer_sizes=(60,), random_state=1, max_iter=500)
            trainedClassifier = neuralNetwork_model.fit(Xtr, Ytr)
#SVM constructs a hyperplane in multidimensional space to separate different classes. 
#SVM generates optimal hyperplane in an iterative manner, which is used to minimize an error.
        elif clusterNumber == 1:
            print("SVC")
            print("========================================")
            SVC_model = SVC(kernel='sigmoid', gamma=0.1, C=0.1)
            trainedClassifier = SVC_model.fit(Xtr, Ytr)

        elif clusterNumber == 2:
            #A decision tree is a flowchart-like structure in which each internal node represents a “test” on an attribute
            # each branch represents the outcome of the test, and each leaf node represents a class label.
            #The paths from root to leaf represent classification rules.
            print("Decision Trees")
            print("========================================")
            decionTree_model = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=0.2,
                                                      min_samples_leaf=0.2)
            trainedClassifier = decionTree_model.fit(Xtr, Ytr)
            
        # Random forest is an ensemble method in which a classifier is constructed by combining several different Independent base classifiers.

        elif clusterNumber == 3:
            print("Random Forest")
            print("========================================")
            randomForest_model = RandomForestClassifier(n_estimators=10, max_depth=3, min_samples_split=0.4,
                                                        min_samples_leaf=0.2)
            trainedClassifier = randomForest_model.fit(Xtr, Ytr)
        # KNN is a non-parametric and lazy learning algorithm. Non-parametric means there is no assumption for underlying data distribution.
        elif clusterNumber == 4:
            print("KNearestNeighbors")
            print("=========================================")
            KNN_model = KNeighborsClassifier(n_neighbors=5)
            trainedClassifier = KNN_model.fit(Xtr, Ytr)
        # Naive Bayes is a classification algorithm for binary (two-class) and multi-class classification problems.
        elif clusterNumber == 5:
            print("Guassian Naive Bayes")
            print("=========================================")
            GNB_model = GaussianNB()
            trainedClassifier = GNB_model.fit(Xtr, Ytr)
        # Multinomial Naive Bayes calculates likelihood to be count of an word/token 

        elif clusterNumber == 6:
            print("Guassian Multinomial Naive Bayes")
            print("=========================================")
            GMNB_model = MultinomialNB()
            trainedClassifier = GMNB_model.fit(Xtr, Ytr)

        # We use only a single training example for calculation of gradient and update parameters.
        elif clusterNumber == 7:
            print("Stochastic Gradient Descent")
            print("=========================================")
            SGD_model = SGDClassifier(loss='modified_huber',shuffle=True,random_state=101)
            trainedClassifier = SGD_model.fit(Xtr, Ytr)

        # AdaBoost is a popular boosting technique which helps you combine multiple “weak classifiers” into a single “strong classifier”.
        elif clusterNumber == 8:
            print("ADA-Boost")
            print("=========================================")
            ADAB_model = AdaBoostClassifier(n_estimators=50,learning_rate=1)
            trainedClassifier = ADAB_model.fit(Xtr, Ytr)
        #It’s a sub-class of ensemble machine learning algorithms wherein we use multiple weak models and aggregate the predictions we get from each of them to get the final prediction. 
        elif clusterNumber == 9:
            print("Bagging")
            print("=========================================")
            Bagging_model = BaggingClassifier(n_estimators=50)
            trainedClassifier = Bagging_model.fit(Xtr, Ytr)


        else:
            print("end")
            break
            
        calculateMetrics(trainedClassifier, Xte, Yte)
    return target
        



def trainTestSplit(dataFrame):
    target = dataFrame['labels']
    splitDF = dataFrame.iloc[:,:-1]
    X_train, X_test, y_train, y_test = train_test_split(splitDF, target, test_size=0.2, random_state=101)
    return X_train, X_test, y_train, y_test, target

def calculateMetrics(trainedReceivedClassifier, XtestData, YtestData):
    predictor = trainedReceivedClassifier.predict(XtestData)
    confusionMatrix = confusion_matrix(predictor, YtestData)
    accuracy = accuracy_score(predictor, YtestData)
    ClassificationReport = classification_report(predictor, YtestData)
    print("Accuracy : ", accuracy)
    print("Classification Report :")
    print(ClassificationReport)
    
    
target = applyClassifier(frameListforEnhance)

Neural Networks
Accuracy :  0.6788990825688074
Classification Report :
              precision    recall  f1-score   support

         C11       0.00      0.00      0.00         1
         C12       0.00      0.00      0.00         0
         C13       0.29      0.67      0.40         3
         C15       0.00      0.00      0.00         0
         C21       0.38      0.25      0.30        12
         C22       0.00      0.00      0.00         0
         C24       0.38      0.38      0.38         8
         C31       0.17      0.14      0.15         7
         C33       0.00      0.00      0.00         2
         E13       0.00      0.00      0.00         1
         E51       0.67      0.50      0.57         4
        E512       0.00      0.00      0.00         0
        GCAT       1.00      1.00      1.00         1
         M12       0.00      0.00      0.00         1
         M13       0.00      0.00      0.00         0
         M14       0.91      0.90      0.91        69

    accur

Accuracy :  0.967741935483871
Classification Report :
              precision    recall  f1-score   support

        GCAT       1.00      0.97      0.98        31
         M14       0.00      0.00      0.00         0

    accuracy                           0.97        31
   macro avg       0.50      0.48      0.49        31
weighted avg       1.00      0.97      0.98        31

Bagging
Accuracy :  0.875
Classification Report :
              precision    recall  f1-score   support

         C13       0.00      0.00      0.00         0
         C15       1.00      0.90      0.95        48
         C17       0.00      0.00      0.00         2
         C18       0.00      0.00      0.00         0
         C31       0.00      0.00      0.00         0
         E21       0.93      0.93      0.93        14
         E51       0.00      0.00      0.00         0
         E71       1.00      1.00      1.00         5
         M12       0.50      1.00      0.67         1
         M14       1.00     

In [41]:
def clusterQuality(receivedclusters,receivedfeatureData,receivedfeatureDataFrame):
    receivedbipTopics = receivedfeatureDataFrame.labels
    # The knowledge of ground truth classes is known and hence the measure taking those classes into consideration is used
    # To compare both evaluation based on ground truth tables and independent of them, silhouette score is used
    # which resulted in biased scoring.
    
    
    print("Cluster Quality Scores")
    print("with ground truth labels")
    print("==========================")
    
    #The Rand Index computes a similarity measure between two clusterings by considering all pairs of samples 
    #and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings.
    print("Adjusted Rand index")
    print(metrics.adjusted_rand_score(receivedbipTopics, receivedclusters))
    
    
    #The Mutual Information is a measure of the similarity between two labels of the same data.
    #This Mutualinformation score is useful to check whether the clustering algorithm meets an important requirement:
    #a cluster should contain only samples belonging to a single class.
    print("Mutual Information based scores")
    print(metrics.adjusted_mutual_info_score(receivedbipTopics, receivedclusters))
    
    
    #A perfectly homogeneous clustering is one where each cluster has data-points belonging to the same class label. 
    #Homogeneity describes the closeness of the clustering algorithm to this perfection.
    print("Homogeneity score")
    print(metrics.homogeneity_score(receivedbipTopics, receivedclusters))
    
    
    #Completness score purpose is to provide a piece of information about the assignment of samples belonging to the same class.
    #More precisely, a good clustering algorithm should assign all samples with the same true label to the same cluster.
    print("completeness score")
    print(metrics.completeness_score(receivedbipTopics, receivedclusters))
    
    
    #The V-Measure is defined as the harmonic mean of homogeneity and completeness of the clustering
    # No assumption is made on the cluster structure: can be used to compare clustering algorithms such as k-means which assumes isotropic 
    # blob shapes with results of spectral clustering algorithms which can find cluster with “folded” shapes.
    print("V Measure Score")
    print(metrics.v_measure_score(receivedbipTopics, receivedclusters))
    
    
    #The Fowlkes-Mallows Score is an evaluation metric to evaluate the similarity among clusterings obtained after applying different clustering algorithms. 
    print("Fowlkes-Mallows scores")
    print(metrics.fowlkes_mallows_score(receivedbipTopics, receivedclusters))
    print("\n\n")
    print("without ground truth labels")
    print("============================")
    print("silhouette")
    print(metrics.silhouette_score(receivedfeatureData, receivedclusters, metric='euclidean'))
    
clusterQuality(receivedClusters, receivedFeatureData, receivedClusterDataframe)

Cluster Quality Scores
with ground truth labels
Adjusted Rand index
0.15939412457482804
Mutual Information based scores
0.2851248324630502
Homogeneity score
0.29529300393510155
completeness score
0.38985191589515333
V Measure Score
0.3360472799332894
Fowlkes-Mallows scores
0.2579530261148003



without ground truth labels
silhouette
0.06415026681464157


In [42]:
#Here Auto encoder is used for feature extraction.
#Autoencoder is an unsupervised artificial neural network that learns how to efficiently compress and encode data then learns how to reconstruct the data back from the reduced encoded representation to a representation that is as close to the original input as possible.
#Autoencoder, by design, reduces data dimensions by learning how to ignore the noise in the data.
#It consists of 4 parts , one is encoder in which model learns how to reduce the input dimensions and compress the input data into encoded representation.
#Bottle neck , this layer contains the compressed representation of nput data
#Decoder , in which model learns how to recontruct the data from encoded representation to be close to the original input as possible.
#I decided to use ReLu as the activation function for the encoding stage and Softmax for the decoding stage.
#In here I have declared 3 hidden layers in the encoded stage and 3 hidden layers in the decoded stage.

def enhancedFeatureExtraction(receivedFeatureData):
    print("Enhancing Features using AutoEncoder..........")
    x = Input(shape=(receivedFeatureData.shape[1],))
    # 3 hidden layers are implemented
    hidden_1en = Dense(2048, activation='relu')(x)
    hidden_2en = Dense(1024, activation='relu')(hidden_1en)
    hidden_3en = Dense(512, activation='relu')(hidden_2en)
    h = Dense(128, activation='relu')(hidden_3en)
    hidden_1dec = Dense(512, activation='relu')(h)
    hidden_2dec = Dense(1024, activation='relu')(hidden_1dec)
    hidden_3dec = Dense(2048, activation='relu')(hidden_2dec)
    r = Dense(receivedFeatureData.shape[1], activation='sigmoid')(hidden_3dec)
    autoencoder = Model(x, r)
    autoencoder.compile(optimizer='adam', loss='mse')
    Xtraut, Xteaut, Ytraut, Yteaut = train_test_split(receivedFeatureData, receivedFeatureData, test_size=0.3, random_state=101)
    autoencoder.fit(Xtraut, Ytraut,
                    epochs=30,
                    batch_size=200,
                    shuffle=True,
                    verbose=0,
                    validation_data=(Xteaut, Yteaut))
    compressedData = autoencoder.predict(receivedFeatureData)
    return compressedData

for lf in clusterDataframeList:
 enhancedDFrame = lf.iloc[:, :-2]
 clusterandLabel = lf.iloc[:, -2:]
 compressedFrame = enhancedFeatureExtraction(enhancedDFrame) # PERFORMING AUTO ENCODER FOR DIFFERENT CLUSTER DATA
 compressedDataFrame = pd.DataFrame(data=compressedFrame)
 clusterandLabel.reset_index(drop=True, inplace=True)
 compressedDataFrame.reset_index(drop=True, inplace=True)
 compressedDataFrame = pd.concat([compressedDataFrame,clusterandLabel],axis=1)
 enhancedDFList.append(compressedDataFrame) 
    
print(enhancedDFList)# PRINTING THE EXTRACTED FEATURES USING AUTO ENCODER

Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
[            0         1         2         3         4         5         6  \
0    0.385522  0.361760  0.664825  0.492591  0.445960  0.558124  0.647401   
1    0.379567  0.276782  0.727491  0.484896  0.448895  0.519895  0.643062   
2    0.387488  0.295412  0.709807  0.487679  0.450273  0.519474  0.635800   
3    0.382898  0.298773  0.712127  0.488030  0.449913  0.529625  0.641925   
4    0.390469  0.328210  0.686808  0.490915  0.452032  0.535644  0.634825   
..        ...       ...       ...       ...       ...       ...    

In [43]:
applyClassifier(enhancedDFList) # Comparing performance after using autoencoder

Neural Networks
Accuracy :  0.6513761467889908
Classification Report :
              precision    recall  f1-score   support

         C11       0.00      0.00      0.00         0
         C12       0.00      0.00      0.00         0
         C13       0.14      1.00      0.25         1
         C15       0.00      0.00      0.00         0
         C21       0.25      0.25      0.25         8
         C22       0.00      0.00      0.00         0
         C24       0.12      1.00      0.22         1
         C31       0.00      0.00      0.00         1
         C33       0.00      0.00      0.00         0
         E51       0.00      0.00      0.00         0
        E512       0.00      0.00      0.00         0
        GCAT       0.00      0.00      0.00         0
         M13       0.00      0.00      0.00         0
         M14       0.99      0.68      0.81        98

    accuracy                           0.65       109
   macro avg       0.11      0.21      0.11       109
weighted 

Accuracy :  0.967741935483871
Classification Report :
              precision    recall  f1-score   support

        GCAT       1.00      0.97      0.98        31
         M14       0.00      0.00      0.00         0

    accuracy                           0.97        31
   macro avg       0.50      0.48      0.49        31
weighted avg       1.00      0.97      0.98        31

Bagging
Accuracy :  0.8611111111111112
Classification Report :
              precision    recall  f1-score   support

         C13       0.00      0.00      0.00         0
         C15       0.98      1.00      0.99        42
         C17       1.00      0.67      0.80         3
         C18       0.00      0.00      0.00         0
         C21       0.00      0.00      0.00         1
         C31       0.00      0.00      0.00         0
         E14       0.00      0.00      0.00         1
         E21       0.93      0.72      0.81        18
         E51       0.00      0.00      0.00         0
         E71   

0      C15
1      C15
2      C15
3      C15
4      E71
      ... 
353    E21
354    E71
355    C15
356    C15
357    C15
Name: labels, Length: 358, dtype: object

In [47]:
# Major Differences
# Deep Neural Network using 3 layers is implemented and moreover the feature extraction is enhanced using auto encoder
# Doc2Vec is used for vectorization of documents apart from assignment one as TF-IDF generates sparse matrix which is ineffic
# Neural networks with 3 hidden layers are used to enhance the performance.
def deepNeuralNet(rXtr, rXte, rYtr, rYte):
    print("Deep Neural Network using Enhanced Features")
    le = LabelEncoder()
    rYtr = le.fit_transform(rYtr)
    rYte = le.fit_transform(rYte)
    dup = numpy.unique(rYtr)
    classifier = Sequential()
    # First Hidden Layer
    classifier.add(Dense(512, activation='relu', input_dim=rXtr.shape[1]))
    classifier.add(Dropout(0.5))
    # Second  Hidden Layer
    classifier.add(Dense(512, activation='relu', input_dim=512))
    classifier.add(Dropout(0.5))
    classifier.add(Dense(512, activation='relu', input_dim=512))
    classifier.add(Dropout(0.5))
    classifier.add(Dense(512, activation='relu', input_dim=512))
    classifier.add(Dropout(0.5))
    # Output Layer
    classifier.add(Dense(dup.size, activation='softmax'))
    classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    classifier.fit(rXtr, rYtr,validation_data=(rXte, rYte), batch_size=1024, epochs=10)
    loss,accuracy = classifier.evaluate(rXtr,rYtr)
    print("accuracy for Deep Neural Networks is")
    print(accuracy)
    print("Loss in Deep Neural Networks is")
    print(loss)
    
def trainTestSplit(dataFrame):
    targetF = dataFrame['labels']
    splitDFF = dataFrame.iloc[:,:-1]
    X_train, X_test, y_train, y_test = train_test_split(splitDFF, targetF, test_size=0.2, random_state=101)
    return X_train, X_test, y_train, y_test
    
Xtr, Xte, Ytr, Yte = trainTestSplit(enhancedDFList[0])
deepNeuralNet(Xtr, Xte, Ytr, Yte)

Deep Neural Network using Enhanced Features
Train on 434 samples, validate on 109 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
accuracy for Deep Neural Networks is
0.5921658997162147
Loss in Deep Neural Networks is
1.7848782363575175
