In [None]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


# Importing All Required Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utilities as ut
import nltk
import random

from LoadDataset import LoadReutersDataset

from nltk import pos_tag

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate, cross_val_score, KFold

from tensorflow.keras.preprocessing import text, sequence

from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense, concatenate, Flatten, Dropout, Conv1D, MaxPooling1D, AveragePooling1D
from keras.optimizers import Adam
from keras.models import load_model


# from google.colab import drive, files


# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')

# # Download GloVe embeddings
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove.6B.zip

# drive.mount('/content/drive')


# Helper functions

In [None]:
# Remove duplicate terms from a document while keeping their first-seen order
def remove_duplicates_terms(doc):
    """Return the distinct terms of ``doc`` as a list, in first-seen order.

    A plain ``set`` would also de-duplicate, but its iteration order is
    arbitrary (and changes between interpreter runs for strings), which makes
    downstream results irreproducible. ``dict.fromkeys`` keeps insertion order.

    Args:
        doc: iterable of hashable terms.

    Returns:
        list with every distinct term exactly once.
    """
    return list(dict.fromkeys(doc))

In [None]:
# Glue a token list back into one space-separated string
def join_tokens(tokens):
    """Return the tokens joined by a single space (empty input gives '')."""
    separator = " "
    joined = separator.join(tokens)
    return joined

In [None]:
# Replace topics with their indexes; topics that are not found are replaced by the sentinel 99
def replace_with_index(topic_lst, topic_names=None):
    """Map each topic to its position in the list of topics of interest.

    Args:
        topic_lst: iterable of topic labels for one document.
        topic_names: optional list of the topics of interest. When omitted the
            notebook-level ``favorite_topics`` list is used, which is the
            original behaviour.

    Returns:
        list of ints, one per input topic: the topic's index in the list of
        topics of interest, or 99 (not 0 -- 0 is a valid index) when the topic
        is not one of them.
    """
    if topic_names is None:
        topic_names = favorite_topics
    return [topic_names.index(topic) if topic in topic_names else 99 for topic in topic_lst]

In [None]:
# Drop the "topic not found" sentinel (99) from a list of topic indexes
def remove_zeros(topic_lst):
    """Return ``topic_lst`` without the entries equal to 99.

    Despite the function name, zeros are kept: 0 is a valid topic index and
    99 is the value that gets removed.
    """
    kept = []
    for topic in topic_lst:
        if topic != 99:
            kept.append(topic)
    return kept

In [None]:
def vectorize(vocab_vec, tokens_lst):
    """Translate tokens into their vocabulary values, skipping unknown tokens.

    Args:
        vocab_vec: mapping from token to its value (e.g. a vocabulary index).
        tokens_lst: iterable of tokens.

    Returns:
        list of ``vocab_vec[token]`` for every token present in ``vocab_vec``,
        in input order (duplicates are kept).
    """
    indexes = []
    for term in tokens_lst:
        if term in vocab_vec:
            indexes.append(vocab_vec[term])
    return indexes

In [None]:
def remove_high_correlated_tokens(cosine_sim_score, tokens_lst, threshold=0.9):
    """Drop tokens that are near-duplicates of a token seen earlier in the list.

    The previous implementation compared every ordered pair, so for a
    symmetric similarity matrix *both* members of a correlated pair were
    removed. Here the first token of each correlated group is kept as its
    representative and only the later ones are dropped.

    Args:
        cosine_sim_score: 2-D structure indexable as ``score[i, j]`` giving the
            similarity between tokens ``i`` and ``j``.
        tokens_lst: list of token indexes (may contain repeats).
        threshold: similarity above which two different tokens are considered
            redundant (default 0.9, the value used originally).

    Returns:
        list with the surviving tokens in their original order; repeated
        occurrences of a surviving token are all kept.
    """
    kept_order = []      # distinct surviving tokens, in first-seen order
    kept = set()
    dropped = set()
    for term in tokens_lst:
        if term in kept or term in dropped:
            continue  # decision for this token was already made
        if any(cosine_sim_score[rep, term] > threshold for rep in kept_order):
            dropped.add(term)
        else:
            kept_order.append(term)
            kept.add(term)
    return [term for term in tokens_lst if term in kept]

In [None]:
def lemmatize(tokenized_text):
    """Return one coarse part-of-speech code per token.

    Despite its name this function does not lemmatize anything: it runs the
    NLTK POS tagger and maps each tag to an integer by its first letter.

    Codes: 0 = noun (N*), 1 = verb (V*), 2 = adjective (J*), 3 = adverb (R*),
    4 = anything else.

    NOTE(review): the notebook later pads these sequences with
    ``pad_sequences``, whose default pad value is 0 -- the same value used for
    nouns here. Confirm the model can live with that ambiguity.

    Args:
        tokenized_text: list of word tokens.

    Returns:
        list of ints, same length as ``tokenized_text``.
    """
    if len(tokenized_text) == 0:
        return []  # nothing to tag; also avoids loading the tagger model

    first_letter_to_code = {'N': 0, 'V': 1, 'J': 2, 'R': 3}
    codes = []
    # `tag` (not `pos_tag`) so the imported nltk.pos_tag function is not shadowed
    for _word, tag in nltk.pos_tag(tokenized_text):
        codes.append(first_letter_to_code.get(tag[:1], 4))
    return codes

In [None]:
def white_space_splitter(text):
    """Tokenize ``text`` on runs of whitespace (used as the vectorizers' tokenizer)."""
    pieces = text.split()
    return pieces

In [None]:
def padding(lst, maximum_length):
    """Pad (at the end) or cut one sequence to exactly ``maximum_length`` items.

    The sequence is wrapped in a one-element batch for Keras'
    ``pad_sequences`` and the single padded row is returned.

    NOTE(review): only ``padding='post'`` is set; the truncation side, pad
    value and dtype are left at the Keras defaults -- confirm those are the
    intended ones.
    """
    single_item_batch = [lst]
    padded_batch = sequence.pad_sequences(single_item_batch, maxlen=maximum_length, padding='post')
    return padded_batch[0]

In [None]:
def make_tuple(lst, size):
    """Return all runs of ``size`` consecutive items of ``lst`` as tuples.

    Args:
        lst: indexable sequence.
        size: window length; only 2 and 3 are supported.

    Returns:
        list of tuples, one per window start (empty when ``lst`` is shorter
        than ``size``).

    Raises:
        ValueError: if ``size`` is neither 2 nor 3.
    """
    item_count = len(lst)
    if size != 2 and size != 3:
        raise ValueError("Error in the value of the size! Check the method!")
    windows = []
    for start in range(item_count - size + 1):
        windows.append(tuple(lst[start + offset] for offset in range(size)))
    return windows

In [None]:
def get_unique_token_pairs(lst_docs):
    """Return every distinct token tuple found across all documents.

    The result is used as the column order of the pair-count feature matrix,
    so it is returned in deterministic first-seen order rather than in the
    arbitrary iteration order of a ``set``.

    Args:
        lst_docs: iterable of documents, each an iterable of hashable tuples.

    Returns:
        list of the distinct tuples, ordered by first appearance.
    """
    seen = {}
    for doc in lst_docs:
        seen.update(dict.fromkeys(doc))
    return list(seen)

In [None]:
def get_token_pairs_count(token_pair, lst):
    """Return how many times ``token_pair`` occurs in ``lst`` (0 if absent).

    ``count`` already yields 0 for a missing item, so no separate membership
    test is needed (it only scanned the list a second time).
    """
    return lst.count(token_pair)

In [None]:
# Collapse one DataFrame row (a Series) into a plain Python list of its values
def create_list(row):
    """Return all values of ``row`` as a list; meant for ``df.apply(..., axis=1)``."""
    return row.iloc[:].tolist()

In [None]:

def feature_selection_rfe(X_train, y_train, n_features_to_select=0.1, step=100, max_iter=10000):
    """Select feature columns with Recursive Feature Elimination on a linear SVR.

    Args:
        X_train: DataFrame of features (its ``columns`` are used to name the
            selected features).
        y_train: target values, one per row of ``X_train``.
        n_features_to_select: passed to ``RFE``; default 0.1, the value this
            notebook always used.
        step: passed to ``RFE`` (features removed per iteration); default 100.
        max_iter: iteration cap for ``LinearSVR``; default 10000.

    Returns:
        The subset of ``X_train.columns`` kept by the fitted selector.
    """
    svr_estimator = LinearSVR(max_iter=max_iter)
    selector = RFE(estimator=svr_estimator, n_features_to_select=n_features_to_select, step=step)
    selector.fit(X_train, y_train)

    # support_ is the boolean mask of retained features
    return X_train.columns[selector.support_]

In [None]:
def get_number_of_tokens(df_col):
    """Count the distinct items across all the lists stored in ``df_col``.

    Args:
        df_col: iterable of iterables (e.g. a DataFrame column of token lists).

    Returns:
        int, the number of different items seen.
    """
    distinct_items = set()
    for sublist in df_col:
        for item in sublist:
            distinct_items.add(item)
    return len(distinct_items)

In [None]:
def concatenate_arrays(columns, row):
    """Flatten and join ``row[col]`` for every ``col`` into one 1-D array.

    All pieces are collected first and joined with a single ``np.concatenate``
    instead of growing the result with repeated ``np.append`` calls, each of
    which copied everything accumulated so far.

    Args:
        columns: iterable of keys to read from ``row``, in output order.
        row: mapping/Series such that ``row[col]`` is array-like or scalar.

    Returns:
        1-D numpy array; an empty float array when ``columns`` is empty.
    """
    # the leading empty float array reproduces the dtype promotion of the
    # original np.array([]) seed (integer inputs come out as float64)
    pieces = [np.array([])]
    pieces.extend(np.ravel(row[col]) for col in columns)
    return np.concatenate(pieces)

In [None]:
def GloVe_embedding(word_index_dict, vocab_size, embedding_dim, embedding_file=None):
    """Build an embedding matrix from a GloVe text file.

    Previously ``embedding_dim`` was silently overwritten with 100 and the
    file name was fixed; now the argument is honoured (calls passing 100
    behave exactly as before). The file is read as UTF-8 and only vectors of
    words that are actually needed are parsed, instead of loading the whole
    file into a dictionary first.

    Args:
        word_index_dict: mapping word -> row index in the matrix.
        vocab_size: number of rows; words whose index is >= ``vocab_size`` are
            ignored.
        embedding_dim: length of each embedding vector.
        embedding_file: path to the GloVe text file. Defaults to
            ``glove.6B.<embedding_dim>d.txt`` in the working directory.

    Returns:
        ``(vocab_size, embedding_dim)`` numpy array; rows of words that are
        missing from the file stay all-zero.
    """
    if embedding_file is None:
        embedding_file = f'glove.6B.{embedding_dim}d.txt'

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    with open(embedding_file, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # tolerate blank lines
            index = word_index_dict.get(values[0])
            if index is None or index >= vocab_size:
                continue
            embedding_matrix[index] = np.asarray(values[1:], dtype='float32')

    return embedding_matrix

# **Import Data**

## Import ag_news

In [None]:
# Read the AG News train/test parquet files and give both frames the same
# column names ('doc', 'label') and index name.
dataset_path = '/content/'
train, test = (pd.read_parquet(dataset_path + file_name) for file_name in ('train.parquet', 'test.parquet'))

for split_frame in (train, test):
    split_frame.columns = ['doc', 'label']
    split_frame.index.name = 'index'

FileNotFoundError: [Errno 2] No such file or directory: '/content/train.parquet'

In [None]:
# Split the AG News training frame into the two single-column frames the rest
# of the notebook works with: `documents` (text) and `topics` (label), both
# sharing train's index.
documents = pd.DataFrame(train['doc'], index=train.index, columns=['doc'])
topics = pd.DataFrame(train['label'], index=train.index, columns=['label'])

In [None]:
# Same split for the AG News test frame: text in `test_documents`, labels in
# `test_topics`, both sharing test's index.
test_documents = pd.DataFrame(test['doc'], index=test.index, columns=['doc'])
test_topics = pd.DataFrame(test['label'], index=test.index, columns=['label'])

## Import reuters

In [None]:
# Load Reuters-21578 through the project loader. NOTE: this cell rebinds
# `documents` and `topics`, replacing the AG News frames if that section was run.
dataset_path = '/content/drive/MyDrive/ColabNotebooks'
loader = LoadReutersDataset(data_path=dataset_path + '/reuters21578')
# load() returns seven values; only the first two (documents, topics) are used here
documents_dic, topics_dic, _, _, _, _, _ = loader.load()

# one row per dictionary key; topics gets one column per position in each value
documents = pd.DataFrame.from_dict(documents_dic, orient='index', columns=['doc'])
topics = pd.DataFrame.from_dict(topics_dic, orient='index')

# Name the index of both frames 'index'
documents.index.name = 'index'
topics.index.name = 'index'

*** opening file:  /content/drive/MyDrive/ColabNotebooks/reuters21578/reut2-000.sgm
*** opening file:  /content/drive/MyDrive/ColabNotebooks/reuters21578/reut2-001.sgm
*** opening file:  /content/drive/MyDrive/ColabNotebooks/reuters21578/reut2-002.sgm
*** opening file:  /content/drive/MyDrive/ColabNotebooks/reuters21578/reut2-003.sgm
*** opening file:  /content/drive/MyDrive/ColabNotebooks/reuters21578/reut2-004.sgm
*** opening file:  /content/drive/MyDrive/ColabNotebooks/reuters21578/reut2-006.sgm
*** opening file:  /content/drive/MyDrive/ColabNotebooks/reuters21578/reut2-005.sgm
*** opening file:  /content/drive/MyDrive/ColabNotebooks/reuters21578/reut2-007.sgm
*** opening file:  /content/drive/MyDrive/ColabNotebooks/reuters21578/reut2-008.sgm
*** opening file:  /content/drive/MyDrive/ColabNotebooks/reuters21578/reut2-009.sgm
*** opening file:  /content/drive/MyDrive/ColabNotebooks/reuters21578/reut2-011.sgm
*** opening file:  /content/drive/MyDrive/ColabNotebooks/reuters21578/reut2-

# **Filter Data**

In [None]:
# drop every document that has no topic label at all
has_any_topic = topics.notna().any(axis=1)
documents = documents[has_any_topic]
topics = topics[has_any_topic]

# then keep only the documents labelled with at least one favorite topic
# favorite_topics = ['acq', 'money-fx', 'grain', 'crude', 'trade', 'interest', 'ship', 'wheat', 'corn', 'oilseed']
favorite_topics = ['acq', 'corn', 'crude', 'earn']
has_favorite_topic = topics.isin(favorite_topics).any(axis=1)
documents = documents[has_favorite_topic]
topics = topics[has_favorite_topic]

# **Sampling**

In [None]:
# Draw a fixed-seed random sample of documents (without replacement) and keep
# the matching topic rows by index label.
# NOTE(review): with replace=False this presumably fails when fewer than
# `rand_sample` documents survive the filtering above -- confirm.
rand_sample = 2000
documents = pd.DataFrame(documents.sample(n=rand_sample, random_state=42, replace=False))
topics = pd.DataFrame(topics.loc[documents.index])

In [None]:
# num_samples = 2000
# topics_count = []
# for topic in favorite_topics:
#     related_topic_doc_index = list[topics.index[topics.applymap(lambda x: x == 'acq').any(axis=1)]]
#     num_samples * len(related_topic_doc_index) / len(topics)
#     topics_count.append()

# Save Random Dataset

In [None]:
# documents.to_hdf('/content/drive/My Drive/4topics_experiment.h5', key='documents', mode='w')
# topics.to_hdf('/content/drive/My Drive/4topics_experiment.h5', key='topics', mode='a')

# documents.to_csv('/content/drive/My Drive/sampled_documents.csv', index=True)
# topics.to_csv('/content/drive/My Drive/sampled_topics.csv', index=True)

# Load Sampled Dataset

In [None]:
# Reload the previously saved sample from the HDF5 store on Google Drive.
# This rebinds `documents` and `topics`, discarding whatever the loading /
# filtering / sampling cells above produced in this session.
documents = pd.read_hdf('/content/drive/My Drive/4topics_experiment.h5', key='documents')
topics = pd.read_hdf('/content/drive/My Drive/4topics_experiment.h5', key='topics')


# documents = pd.read_csv('/content/drive/My Drive/sampled_documents.csv')
# documents.set_index('index', inplace=True)

# topics = pd.read_csv('/content/drive/My Drive/sampled_topics.csv')
# topics.set_index('index', inplace=True)

# **Filter Dataset**

# Pre-process ag_news

In [None]:
# Work on an explicit copy so the column assignments below never target a
# slice of another frame (avoids SettingWithCopyWarning / lost writes).
documents = documents.copy()
documents['preprocess'] = documents['doc'].apply(ut.tokenize)

# keep only documents with more than 6 tokens (i.e. drop those with 6 or fewer);
# the mask is computed once and applied to both frames so they stay aligned
long_enough = documents['preprocess'].str.len() > 6
topics = topics[long_enough].copy()
documents = documents[long_enough].copy()

# AG News labels are already the integers 0..3
favorite_topics = [0, 1, 2, 3]

# remove duplicate terms from each document
# documents['preprocess'] = documents['preprocess'].apply(remove_duplicates_terms)

#join preprocced tokens to make a string. used in tf-idf and cosine scoring.
documents['joined_tokens'] = documents['preprocess'].apply(join_tokens)

KeyboardInterrupt: 

In [None]:
# tokenize the AG News test documents, then rebuild a space-joined string per document
test_tokens = test_documents['doc'].apply(ut.tokenize)
test_documents['preprocess'] = test_tokens
test_documents['joined_tokens'] = test_documents['preprocess'].map(join_tokens)

# Pre-process reuters

In [None]:
# preprocess data by tokenization
# (explicit copy first: `documents` may be a slice produced by the filtering
# cells, and assigning a column to a slice raises SettingWithCopyWarning)
documents = documents.copy()
documents['preprocess'] = documents['doc'].apply(ut.tokenize)

# keep only documents with more than 6 tokens (i.e. drop those with 6 or fewer);
# the mask is computed once and applied to both frames so they stay aligned
long_enough = documents['preprocess'].str.len() > 6
topics = topics[long_enough].copy()
documents = documents[long_enough].copy()

# remove duplicate terms from each document
# documents['preprocess'] = documents['preprocess'].apply(remove_duplicates_terms)

#join preprocced tokens to make a string. used in tf-idf and cosine scoring.
documents['joined_tokens'] = documents['preprocess'].apply(join_tokens)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  documents['joined_tokens'] = documents['preprocess'].apply(join_tokens)


# Hyper-parameters

In [None]:
# length every token / POS / weight sequence is padded or cut to (see padding())
maximum_length = 100
# vector length requested from GloVe_embedding()
embedding_dim = 100
# NOTE(review): not referenced anywhere in the cells visible here -- confirm it is still needed
num_extra_features = 100
# width of the one-hot label vectors
num_classes = len(favorite_topics)

# Coding Labels

In [None]:
# Work on a copy: `topics` is usually a filtered slice, and adding columns to a
# slice raises SettingWithCopyWarning.
topics = topics.copy()

# combine all the topics into a list (only the raw label columns, so that
# re-running this cell does not fold the derived columns back in)
label_columns = [col for col in topics.columns if col not in ('topics_lst', 'one_hot')]
topics['topics_lst'] = topics[label_columns].apply(lambda row: list(row), axis=1)

# replace topics with their indexes; topics outside favorite_topics become the sentinel 99
topics['topics_lst'] = topics['topics_lst'].apply(replace_with_index)

# drop the sentinel entries, leaving only indexes of favorite topics
topics['topics_lst'] = topics['topics_lst'].apply(remove_zeros)

# convert labels into a multi-hot vector (sum of the one-hot rows of each label)
topics['one_hot'] = [list(np.sum(to_categorical(label, num_classes=num_classes), axis=0)) for label in topics['topics_lst']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  topics['topics_lst'] = topics.iloc[:, :].apply(lambda row: list(row), axis=1)


# Coding Labels (AG_NEWS)

In [None]:
# Work on a copy so the new columns are never written to a slice of another frame.
test_topics = test_topics.copy()

# combine all the topics into a list (only the raw label columns, so that
# re-running this cell does not fold the derived columns back in)
test_label_columns = [col for col in test_topics.columns if col not in ('topics_lst', 'one_hot')]
test_topics['topics_lst'] = test_topics[test_label_columns].apply(lambda row: list(row), axis=1)

# replace topics with their indexes; topics outside favorite_topics become the sentinel 99
test_topics['topics_lst'] = test_topics['topics_lst'].apply(replace_with_index)

# drop the sentinel entries, leaving only indexes of favorite topics
test_topics['topics_lst'] = test_topics['topics_lst'].apply(remove_zeros)

# convert labels into a multi-hot vector (sum of the one-hot rows of each label)
test_topics['one_hot'] = [list(np.sum(to_categorical(label, num_classes=num_classes), axis=0)) for label in test_topics['topics_lst']]

# **Split Data into Train and Validation, and Test set**

# AG_NEWS

In [None]:
# Fixed seed so the train/validation split is identical on every run
# (a fresh random.randint seed made each run produce a different, unrecorded split).
# Change the value deliberately if a different split is wanted.
rand = 42

trainDocs, valDocs, trainTopics, valTopics = train_test_split(documents, topics, test_size=0.2, random_state=rand)
# The Reuters split / save / load cells spell this name `valTopcis`; bind both
# so cells written against either spelling work after this one.
valTopcis = valTopics
testDocs = test_documents
testTopics = test_topics

for split_frame in (trainDocs, trainTopics, valDocs, valTopics, testDocs, testTopics):
    print(split_frame.shape)

NameError: name 'test_documents' is not defined

# Reuters

In [None]:
# Fixed seed so the train/validation/test split is identical on every run
# (a fresh random.randint seed made each run produce a different, unrecorded split).
# Change the value deliberately if a different split is wanted.
rand = 42

# 80/20 into (train+validation)/test, then 80/20 again into train/validation
trainValDocs, testDocs, trainValTopics, testTopics = train_test_split(documents, topics, test_size=0.2, random_state=rand)
trainDocs, valDocs, trainTopics, valTopcis = train_test_split(trainValDocs, trainValTopics, test_size=0.2, random_state=rand)

for split_frame in (trainDocs, trainTopics, valDocs, valTopcis, testDocs, testTopics):
    print(split_frame.shape)
print("\n_______________________\n")

# Print the count of documents in each category for train, validation, and test sets
topic_frames_by_split = (("train", trainTopics), ("validation", valTopcis), ("test", testTopics))
for topic in favorite_topics:
    for split_name, split_topics in topic_frames_by_split:
        # a document counts for `topic` when any of its cells equals it
        length = len(split_topics.index[split_topics.applymap(lambda x: x == topic).any(axis=1)])
        print("Category ", topic, " counts in the " + split_name + " set:", length)
    print("_______________________")

(1279, 3)
(1279, 18)
(320, 3)
(320, 18)
(400, 3)
(400, 18)

_______________________

Category  acq  counts in the train set: 406
Category  acq  counts in the validation set: 108
Category  acq  counts in the test set: 116
_______________________
Category  corn  counts in the train set: 41
Category  corn  counts in the validation set: 16
Category  corn  counts in the test set: 13
_______________________
Category  crude  counts in the train set: 124
Category  crude  counts in the validation set: 22
Category  crude  counts in the test set: 36
_______________________
Category  earn  counts in the train set: 725
Category  earn  counts in the validation set: 176
Category  earn  counts in the test set: 241
_______________________


# **Save Split Data**

In [None]:
# trainDocs.to_hdf('/content/drive/My Drive/4topics_experiment.h5', key='trainDocs', mode='a')
# trainTopics.to_hdf('/content/drive/My Drive/4topics_experiment.h5', key='trainTopics', mode='a')
# valDocs.to_hdf('/content/drive/My Drive/4topics_experiment.h5', key='valDocs', mode='a')
# valTopcis.to_hdf('/content/drive/My Drive/4topics_experiment.h5', key='valTopcis', mode='a')
# testDocs.to_hdf('/content/drive/My Drive/4topics_experiment.h5', key='testDocs', mode='a')
# testTopics.to_hdf('/content/drive/My Drive/4topics_experiment.h5', key='testTopics', mode='a')

# trainDocs.to_csv('/content/drive/My Drive/trainDocs.csv', index=True)
# trainTopics.to_csv('/content/drive/My Drive/trainTopics.csv', index=True)
# valDocs.to_csv('/content/drive/My Drive/valDocs.csv', index=True)
# valTopcis.to_csv('/content/drive/My Drive/valTopcis.csv', index=True)
# testDocs.to_csv('/content/drive/My Drive/testDocs.csv', index=True)
# testTopics.to_csv('/content/drive/My Drive/testTopics.csv', index=True)

# **Load Split Data**

In [None]:
# Reload the saved train / validation / test frames from the HDF5 store.
# This rebinds all six split variables, replacing whatever the split cells
# above produced in this session. The key 'valTopcis' (sic) matches the name
# used when the data was saved.
trainDocs = pd.read_hdf('/content/drive/My Drive/4topics_experiment.h5', key='trainDocs')
trainTopics = pd.read_hdf('/content/drive/My Drive/4topics_experiment.h5', key='trainTopics')
valDocs = pd.read_hdf('/content/drive/My Drive/4topics_experiment.h5', key='valDocs')
valTopcis = pd.read_hdf('/content/drive/My Drive/4topics_experiment.h5', key='valTopcis')
testDocs = pd.read_hdf('/content/drive/My Drive/4topics_experiment.h5', key='testDocs')
testTopics = pd.read_hdf('/content/drive/My Drive/4topics_experiment.h5', key='testTopics')




# trainDocs = pd.read_csv('/content/drive/My Drive/trainDocs.csv')
# valDocs = pd.read_csv('/content/drive/My Drive/valDocs.csv')
# testDocs = pd.read_csv('/content/drive/My Drive/testDocs.csv')
# trainTopics = pd.read_csv('/content/drive/My Drive/trainTopics.csv')
# valTopcis = pd.read_csv('/content/drive/My Drive/valTopcis.csv')
# testTopics = pd.read_csv('/content/drive/My Drive/testTopics.csv')

# trainDocs.set_index('index', inplace=True)
# valDocs.set_index('index', inplace=True)
# testDocs.set_index('index', inplace=True)
# trainTopics.set_index('index', inplace=True)
# valTopcis.set_index('index', inplace=True)
# testTopics.set_index('index', inplace=True)

# trainDocs.index = trainDocs.index.astype(str)
# valDocs.index = valDocs.index.astype(str)
# testDocs.index = testDocs.index.astype(str)
# trainTopics.index = trainTopics.index.astype(str)
# valTopcis.index = valTopcis.index.astype(str)
# testTopics.index = testTopics.index.astype(str)

# trainDocs['documents'] = trainDocs['documents'].apply(ast.literal_eval)
# valDocs['documents'] = valDocs['documents'].apply(ast.literal_eval)
# testDocs['documents'] = testDocs['documents'].apply(ast.literal_eval)
# trainTopics['documents'] = trainTopics['documents'].apply(ast.literal_eval)
# valTopcis['documents'] = valTopcis['documents'].apply(ast.literal_eval)
# testTopics['documents'] = testTopics['documents'].apply(ast.literal_eval)


# shapes of the six split frames, in the usual order
for split_frame in (trainDocs, trainTopics, valDocs, valTopcis, testDocs, testTopics):
    print(split_frame.shape)
print("\n_______________________\n")
# Print the count of documents in each category for train, validation, and test sets
named_topic_frames = [("train", trainTopics), ("validation", valTopcis), ("test", testTopics)]
for topic in favorite_topics:
    for set_name, topic_frame in named_topic_frames:
        rows_with_topic = topic_frame.applymap(lambda x: x == topic).any(axis=1)
        length = len(topic_frame.index[rows_with_topic])
        print("Category ", topic, " counts in the " + set_name + " set:", length)
    print("_______________________")

(1279, 3)
(1279, 18)
(320, 3)
(320, 18)
(400, 3)
(400, 18)

_______________________

Category  acq  counts in the train set: 402
Category  acq  counts in the validation set: 100
Category  acq  counts in the test set: 128
_______________________
Category  corn  counts in the train set: 46
Category  corn  counts in the validation set: 11
Category  corn  counts in the test set: 13
_______________________
Category  crude  counts in the train set: 113
Category  crude  counts in the validation set: 31
Category  crude  counts in the test set: 38
_______________________
Category  earn  counts in the train set: 730
Category  earn  counts in the validation set: 182
Category  earn  counts in the test set: 230
_______________________


# **Cosine Similarity Calculation And Vectorization**




**Train Data**

In [None]:
# Fit a TF-IDF and a raw-count vectorizer on the training documents. Both use
# the same whitespace tokenizer and the same settings, so they are expected to
# learn the same vocabulary.
# NOTE(review): `lowercase` is left at the vectorizer default while
# vectorize() below looks up the raw `preprocess` tokens -- confirm that
# ut.tokenize already lower-cases, otherwise such tokens are silently skipped.
tfidf_vectorizer = TfidfVectorizer(tokenizer=white_space_splitter, preprocessor=None, stop_words=None, max_df=1.0, min_df=1, max_features=None)
tfidf_matrix = tfidf_vectorizer.fit_transform(trainDocs['joined_tokens'])

count_vectorizer = CountVectorizer(tokenizer=white_space_splitter, preprocessor=None, stop_words=None, max_df=1.0, min_df=1, max_features=None)
tf_matrix = count_vectorizer.fit_transform(trainDocs['joined_tokens'])

# number of distinct tokens learned from the training set
vocab_size = len(count_vectorizer.vocabulary_)

# cosine_sim_score = cosine_similarity(tf_matrix.T)

# each document becomes the list of vocabulary indexes of its tokens
trainDocs['vectorized'] = trainDocs['preprocess'].apply(lambda lst: vectorize(tfidf_vectorizer.vocabulary_, lst))
# trainDocs['vectorized'] = trainDocs['vectorized'].apply(lambda lst: remove_high_correlated_tokens(cosine_sim_score, lst))



In [None]:
# vocabulary size as fitted vs. distinct indexes actually present in the vectorized documents
print(f"Number of tokens before cosine similarity: {vocab_size}")
print(f"Number of tokens after cosine similarity: {get_number_of_tokens(trainDocs['vectorized'])}")

Number of tokens before cosine similarity: 7503
Number of tokens after cosine similarity: 7503


**Vectorize Validation Data**

In [None]:
# map validation tokens to training-vocabulary indexes (unknown tokens are dropped by vectorize)
valDocs['vectorized'] = valDocs['preprocess'].map(lambda tokens: vectorize(tfidf_vectorizer.vocabulary_, tokens))

# **TF Feature**

**Train**

In [None]:
# every distinct vocabulary index that occurs in the vectorized training documents
unique_terms = list({token for tokenized_doc in trainDocs['vectorized'] for token in tokenized_doc})
# dense term-count matrix restricted to those columns, one row per training document
train_tf = pd.DataFrame(tf_matrix[:, unique_terms].toarray(), index=trainDocs.index, columns=unique_terms)
# store each row as a plain list in a single 'tf' column
train_tf_rows = pd.DataFrame(train_tf.apply(create_list, axis=1))
trainDocs['tf'] = train_tf_rows.iloc[:, 0].values

**Validation**

In [None]:
# count validation tokens with the vectorizer fitted on the training set
val_tf_matrix = count_vectorizer.transform(valDocs['joined_tokens'])
val_tf = pd.DataFrame(val_tf_matrix[:, unique_terms].toarray(), index=valDocs.index, columns=unique_terms)
# store each row as a plain list in a single 'tf' column
val_tf_rows = pd.DataFrame(val_tf.apply(create_list, axis=1))
valDocs['tf'] = val_tf_rows.iloc[:, 0].values


# **Terms Dictionary**

**Term Topic Dictionary**

In [None]:
# One row per training term, one column per favorite topic; cell (term, topic)
# counts how often the term occurs in training documents labelled with the topic.
vocab_size = get_number_of_tokens(trainDocs['vectorized'])
token_topic_dict = pd.DataFrame(np.zeros(shape=(vocab_size, len(favorite_topics))), columns=range(len(favorite_topics)), index=unique_terms)

for index in trainTopics.index:
    topics_lst = trainTopics.loc[index, 'topics_lst']
    term_vector = trainDocs.loc[index, 'vectorized']
    for topic in topics_lst:
        for term in term_vector:
            # single-step scalar access: the chained `.loc[term][topic] += 1`
            # updated a temporary row object, which is not guaranteed to write
            # back to the frame (and never does under pandas copy-on-write)
            token_topic_dict.at[term, topic] += 1

In [None]:
# Set to True to keep, for every topic, only its top-100 terms (terms exclusive
# to the topic first, then by weight) and zero the weight of all others.
# This used to be asked through input(), which blocks "Run All" and makes the
# results depend on what was typed; the default matches the recorded answer ("N").
FILTER_DICTIONARY = False
# kept for any later cell that still reads the old answer string
filter_dictionary = "Y" if FILTER_DICTIONARY else "N"

if FILTER_DICTIONARY:
    for topic in list(token_topic_dict.columns):
        has_topic_weight = token_topic_dict[topic] != 0
        absent_elsewhere = token_topic_dict.drop(columns=topic).eq(0).all(axis=1)

        # 1 for terms that occur with this topic only, 0 otherwise; held in a
        # temporary frame so no helper column is ever added to the dictionary
        mark = np.where(has_topic_weight & absent_elsewhere, 1, 0)
        ranked = token_topic_dict.assign(mark=mark).sort_values(by=['mark', topic], ascending=False)

        # everything after the first 100 ranked terms loses its weight for this topic
        token_topic_dict.loc[ranked.index[100:], topic] = 0

Filter Dictionary?N


**Train Term Topic Weights**

In [None]:
# For every training document and topic: the list of per-token weights taken
# from token_topic_dict, in token order.
trainTTW = pd.DataFrame(None, columns=range(len(favorite_topics)), index=trainDocs.index)

for topic in list(token_topic_dict.columns):
    # one plain dict per topic instead of a chained .loc lookup per token
    topic_weights = token_topic_dict[topic].to_dict()
    trainTTW[topic] = trainDocs['vectorized'].apply(lambda lst: [topic_weights[term] for term in lst])
trainTTW.shape

(1279, 4)

**Padding**

In [None]:
# bring every weight sequence to exactly maximum_length entries
for col in trainTTW.columns.tolist():
    trainTTW[col] = trainTTW[col].map(lambda weights: padding(weights, maximum_length))
trainTTW.shape

(1279, 4)

**Concatenate (Merge) Term Topic Weights**

In [None]:
# Join the padded per-topic weight vectors of each document into one flat array.
# The result column itself is excluded from the inputs so that re-running this
# cell does not concatenate the previous result onto the new one.
columns = [col for col in trainTTW.columns if col != 'concatinated']
trainTTW['concatinated'] = trainTTW.apply(lambda lst: concatenate_arrays(columns, lst),  axis=1)

**Validation Term Topic Weights**

In [None]:
# For every validation document and topic: the list of per-token weights taken
# from token_topic_dict; tokens unseen in training get weight 0.
valTTW = pd.DataFrame(None, columns=range(len(favorite_topics)), index=valDocs.index)

for topic in list(token_topic_dict.columns):
    # one plain dict per topic instead of an index membership test plus a
    # chained .loc lookup per token
    topic_weights = token_topic_dict[topic].to_dict()
    valTTW[topic] = valDocs['vectorized'].apply(lambda lst: [topic_weights.get(term, 0) for term in lst])
valTTW.shape

(320, 4)

**Padding**

In [None]:
# bring every weight sequence to exactly maximum_length entries
for col in valTTW.columns.tolist():
    valTTW[col] = valTTW[col].map(lambda weights: padding(weights, maximum_length))
valTTW.shape

(320, 4)

**Concatenate (Merge) Term Topic Weights**

In [None]:
# Join the padded per-topic weight vectors of each document into one flat array.
# The result column itself is excluded from the inputs so that re-running this
# cell does not concatenate the previous result onto the new one.
columns = [col for col in valTTW.columns if col != 'concatinated']
valTTW['concatinated'] = valTTW.apply(lambda lst: concatenate_arrays(columns, lst),  axis=1)

# **TF-iDF**

**Train**

In [None]:
# `unique_terms` comes from the TF section; restrict the TF-IDF matrix to those columns
train_tfidf = pd.DataFrame(tfidf_matrix[:, unique_terms].toarray(), index=trainDocs.index, columns=unique_terms)
# store each row as a plain list in a single 'tfidf' column
train_tfidf_rows = pd.DataFrame(train_tfidf.apply(create_list, axis=1))
trainDocs['tfidf'] = train_tfidf_rows.iloc[:, 0].values

**Validation**

In [None]:
# weight validation tokens with the TF-IDF vectorizer fitted on the training set
val_tfidf_matrix = tfidf_vectorizer.transform(valDocs['joined_tokens'])
val_tfidf = pd.DataFrame(val_tfidf_matrix[:, unique_terms].toarray(), index=valDocs.index, columns=unique_terms)
# store each row as a plain list in a single 'tfidf' column
val_tfidf_rows = pd.DataFrame(val_tfidf.apply(create_list, axis=1))
valDocs['tfidf'] = val_tfidf_rows.iloc[:, 0].values


# **PCA**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# PCA needs a 2-D numeric matrix. trainDocs['vectorized'] is a column of
# variable-length index lists, which made StandardScaler fail with
# "setting an array element with a sequence". The dense TF-IDF document-term
# matrix built in the TF-iDF section is used instead.
# NOTE(review): confirm TF-IDF is the feature set PCA was meant to run on.
X_train_features = train_tfidf.to_numpy()

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_features)
# X_test_scaled = scaler.transform(df_test)

pca = PCA(n_components=0.95)  # Retain 95% of the variance
X_train_pca = pca.fit_transform(X_train_scaled)
# X_test_pca = pca.transform(X_test_scaled)

# 4. Examine the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_
print(f'Explained variance ratio: {explained_variance_ratio}')
print(f'Sum of explained variance ratio: {sum(explained_variance_ratio)}')

# Optional: Convert PCA results back to DataFrame for easier handling
# (indexed like trainDocs so rows can be joined back to their documents)
df_train_pca = pd.DataFrame(X_train_pca, index=trainDocs.index, columns=[f'PC{i+1}' for i in range(X_train_pca.shape[1])])
# df_test_pca = pd.DataFrame(X_test_pca, columns=[f'PC{i+1}' for i in range(X_test_pca.shape[1])])

# Output the transformed data
print(df_train_pca.head())
# print(df_test_pca.head())


ValueError: setting an array element with a sequence.

In [None]:
components = pca.components_

# For each principal component, keep the column positions of the n loadings
# with the largest absolute value.
n = 10  # number of top features taken per component
selected_features = [np.argsort(np.abs(component))[-n:] for component in components]

# Merge the per-component selections into one sorted array of distinct positions
selected_features_flat = np.unique(np.concatenate(selected_features))
selected_features_flat.shape

NameError: name 'pca' is not defined

In [None]:
# Display the PCA-projected training frame built in the PCA cell above
# (requires that cell to have completed successfully).
df_train_pca

# **Token Tuples**

**Train Data**

In [None]:
# consecutive token pairs (tuples of size 2) for every training document
trainDocs['tuple_2'] = trainDocs['vectorized'].map(lambda token_ids: make_tuple(token_ids, 2))
# tuple of size 3
# trainDocs['tuple_3'] = trainDocs['vectorized'].apply(lambda lst: make_tuple(lst, 3))

# distinct pairs over the whole training set
unique_token_pairs = get_unique_token_pairs(trainDocs['tuple_2'])
print(f'Tuple_2 unique tokens: {len(unique_token_pairs)}')

# get list of unique tokens
# unique_token_pairs_3 = get_unique_token_pairs(trainDocs['tuple_3'])
# print('Tuple_3 unique tokens: ' + str(len(unique_token_pairs_3)))

Tuple_2 unique tokens: 47929


**Validation Data**

In [None]:
# consecutive token pairs (tuples of size 2) for every validation document
valDocs['tuple_2'] = valDocs['vectorized'].map(lambda token_ids: make_tuple(token_ids, 2))
# valDocs['tuple_3'] = valDocs['vectorized'].apply(lambda lst: make_tuple(lst, 3))

# **Count the number of Tuples**

**Train Data**

In [None]:
# Tuple Size 2
# Count, for every training document, how often each known pair occurs.
# A single pass over the documents fills the matrix; the previous version ran
# one full DataFrame.apply (with a list scan per document) for every one of
# the tens of thousands of unique pairs.
pair_column = {pair: position for position, pair in enumerate(unique_token_pairs)}
pair_counts = np.zeros((len(trainDocs), len(unique_token_pairs)), dtype=np.int64)
for row_position, doc_pairs in enumerate(trainDocs['tuple_2']):
    for pair in doc_pairs:
        column_position = pair_column.get(pair)
        if column_position is not None:
            pair_counts[row_position, column_position] += 1

# one row per document, one column per unique pair
pair_tokens_df = pd.DataFrame(pair_counts)
pair_tokens_df.index = trainDocs.index
pair_tokens_df.columns = unique_token_pairs

# Tuple Size 3
# pair_3_token_counts = []
# for pair in unique_token_pairs_3:
#     pair_3_token_counts.append(trainDocs['tuple_3'].apply(lambda lst: get_token_pairs_count(pair, lst)).to_numpy())

# create a DataFrame of zeros with the tokens as column names
# pair_3_tokens_df = pd.DataFrame(pair_3_token_counts).T
# pair_3_tokens_df.index = trainDocs.index
# pair_3_tokens_df.columns = unique_token_pairs_3

**Validation Data**

In [None]:
# Count, for every validation document, how often each pair known from the
# training set occurs (pairs unseen in training are ignored).
# A single pass over the documents fills the matrix; the previous version ran
# one full DataFrame.apply (with a list scan per document) for every unique pair.
val_pair_column = {pair: position for position, pair in enumerate(unique_token_pairs)}
val_pair_counts = np.zeros((len(valDocs), len(unique_token_pairs)), dtype=np.int64)
for row_position, doc_pairs in enumerate(valDocs['tuple_2']):
    for pair in doc_pairs:
        column_position = val_pair_column.get(pair)
        if column_position is not None:
            val_pair_counts[row_position, column_position] += 1

# one row per document, one column per unique pair
val_pair_tokens_df = pd.DataFrame(val_pair_counts)
val_pair_tokens_df.index = valDocs.index
val_pair_tokens_df.columns = unique_token_pairs


# Tuple Size 3
# pair_3_token_counts = []
# for pair in unique_token_pairs_3:
#     pair_3_token_counts.append(valDocs['tuple_3'].apply(lambda lst: get_token_pairs_count(pair, lst)).to_numpy())

# create a DataFrame of zeros with the tokens as column names
# val_pair_3_tokens_df = pd.DataFrame(pair_3_token_counts).T
# val_pair_3_tokens_df.index = valDocs.index
# val_pair_3_tokens_df.columns = unique_token_pairs_3

# **Feature Selection**

**Train Data**

In [None]:
# Tuple 2
# RFE needs a single target per document, so only the first topic index of
# each document is used.
topics_array=trainTopics['topics_lst'].apply(lambda lst: lst[0]).values

selected_pairs = feature_selection_rfe(pair_tokens_df, topics_array)
print("No. of unique tuples-2 (features):" + str(len(selected_pairs)))

# .copy(): the column selection is a slice of pair_tokens_df, and adding the
# 'joined' column to a slice raises SettingWithCopyWarning
train_selected_pairs_df = pair_tokens_df[selected_pairs].copy()
train_selected_pairs_df['joined'] = train_selected_pairs_df.apply(create_list, axis=1)



No. of unique tuples-2 (features):4792


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_selected_pairs_df['joined'] = train_selected_pairs_df.apply(create_list, axis=1)


In [None]:
# tuple 3
# selected_pairs_3 = feature_selection_rfe(pair_3_tokens_df, topics_array)
# print("No. of unique tuples-3 (features):" + str(len(selected_pairs_3)))

# train_selected_pairs_3_df = pair_3_tokens_df[selected_pairs_3]
# train_selected_pairs_3_df['joined'] = train_selected_pairs_3_df.apply(create_list, axis=1)

**Validation Data**

In [None]:
# Tuple 2
# Take the selected pair columns from the validation counts in one step;
# any selected pair missing from the validation frame becomes a column of 0.
# (Inserting the columns one by one fragmented the frame and triggered a
# pandas PerformanceWarning.)
val_selected_pairs_df = val_pair_tokens_df.reindex(columns=selected_pairs, fill_value=0)

val_selected_pairs_df['joined'] = val_selected_pairs_df.apply(create_list, axis=1)

  val_selected_pairs_df['joined'] = val_selected_pairs_df.apply(create_list, axis=1)


In [None]:
# Tuple 3
# val_selected_pairs_3_df = pd.DataFrame(0, index=valDocs.index, columns=selected_pairs_3, dtype=int)
# for col in selected_pairs_3:
#     if col in val_pair_3_tokens_df.columns:
#         val_selected_pairs_3_df[col] = val_pair_3_tokens_df[col]

# val_selected_pairs_3_df['joined'] = val_selected_pairs_3_df.apply(create_list, axis=1)

# **POS Tagging**

**Train**

In [None]:
# one part-of-speech code per token (see lemmatize) for every training document
trainDocs['pos_tag'] = trainDocs['preprocess'].map(lemmatize)

**Validation**

In [None]:
# one part-of-speech code per token (see lemmatize) for every validation document
valDocs['pos_tag'] = valDocs['preprocess'].map(lemmatize)

# **Padding**

**Train Data**

In [None]:
# Bring both per-document sequences to the common length `maximum_length`
# using the notebook's `padding` helper.
for source_col, padded_col in (('vectorized', 'token_padded'), ('pos_tag', 'pos_padded')):
    trainDocs[padded_col] = trainDocs[source_col].apply(padding, args=(maximum_length,))

**Validation Data**

In [None]:
# Pad the validation token and POS sequences to `maximum_length`, mirroring the training split.
for source_col, padded_col in (('vectorized', 'token_padded'), ('pos_tag', 'pos_padded')):
    valDocs[padded_col] = valDocs[source_col].apply(padding, args=(maximum_length,))

# **GloVe Embedding**

In [None]:
# Build the embedding weight matrix with the notebook's GloVe_embedding helper from the
# CountVectorizer vocabulary, the vocabulary size and the embedding dimension.
# NOTE(review): the test 'vectorized' column is built from tfidf_vectorizer.vocabulary_;
# confirm both vectorizers map terms to the same indices.
embedding_matrix = GloVe_embedding(count_vectorizer.vocabulary_, vocab_size, embedding_dim)

# **Test Data Preparation**

**Vectorize Test Data**

In [None]:
# Map each test document's preprocessed tokens through the fitted TF-IDF vocabulary
# with the notebook's `vectorize` helper; the result feeds the padding and pair features.
testDocs['vectorized'] = testDocs['preprocess'].apply(lambda lst: vectorize(tfidf_vectorizer.vocabulary_, lst))

**Test TF and TF-iDF**

In [None]:
# Term-frequency features for the test split, using the already-fitted CountVectorizer.
test_tf_matrix = count_vectorizer.transform(testDocs['joined_tokens'])
# Keep only the columns listed in `unique_terms` and densify them.
test_tf = pd.DataFrame(test_tf_matrix[:, unique_terms].toarray(), columns=unique_terms, index=testDocs.index)
# Pack each row via create_list and store the first resulting column on testDocs.
testDocs['tf'] = pd.DataFrame(test_tf.apply(create_list, axis=1)).iloc[:, 0].values

In [None]:
# TF-IDF features for the test split, using the already-fitted TfidfVectorizer.
test_tfidf_matrix = tfidf_vectorizer.transform(testDocs['joined_tokens'])
# Keep only the columns listed in `unique_terms` and densify them.
test_tfidf = pd.DataFrame(test_tfidf_matrix[:, unique_terms].toarray(), columns=unique_terms, index=testDocs.index)
# Pack each row via create_list and store the first resulting column on testDocs.
testDocs['tfidf'] = pd.DataFrame(test_tfidf.apply(create_list, axis=1)).iloc[:, 0].values

**Test Term Dictionary**

In [None]:
# One column per favourite topic, one row per test document.
testTTW = pd.DataFrame(None, columns=range(len(favorite_topics)), index=testDocs.index)

# For every topic column of token_topic_dict, look up each token of the document;
# tokens missing from token_topic_dict's index contribute 0.
for topic in list(token_topic_dict.columns):
    testTTW[topic] = testDocs['vectorized'].apply(lambda lst: [token_topic_dict.loc[term][topic] if term in token_topic_dict.index else 0 for term in lst])

# Pad every per-topic list to maximum_length.
for col in list(testTTW.columns):
    testTTW[col] = testTTW[col].apply(lambda lst: padding(lst, maximum_length))

# Capture the column labels BEFORE 'concatinated' is added, so only the
# per-topic columns are passed to concatenate_arrays for each row.
columns = testTTW.columns
testTTW['concatinated'] = testTTW.apply(lambda lst: concatenate_arrays(columns, lst),  axis=1)

**Test Term Pairs**

In [None]:
# Build the size-2 tuples of every test document with the notebook's make_tuple helper.
testDocs['tuple_2'] = testDocs['vectorized'].apply(make_tuple, args=(2,))

# One array per known token pair, holding that pair's count in each test document.
pair_token_counts = [
    testDocs['tuple_2'].apply(lambda doc_pairs: get_token_pairs_count(pair, doc_pairs)).to_numpy()
    for pair in unique_token_pairs
]

# Transpose so rows are documents and columns are token pairs, then label both axes.
test_pair_tokens_df = pd.DataFrame(pair_token_counts).T
test_pair_tokens_df.index = testDocs.index
test_pair_tokens_df.columns = unique_token_pairs

In [None]:
# testDocs['tuple_3'] = testDocs['vectorized'].apply(lambda lst: make_tuple(lst, 3))

# pair_3_token_counts = []
# for pair in unique_token_pairs:
#     pair_3_token_counts.append(testDocs['tuple_3'].apply(lambda lst: get_token_pairs_count(pair, lst)).to_numpy())

# # create a DataFrame of zeros with the tokens as column names
# test_pair_3_tokens_df = pd.DataFrame(pair_3_token_counts).T
# test_pair_3_tokens_df.index = testDocs.index
# test_pair_3_tokens_df.columns = unique_token_pairs

In [None]:
# Start from an all-zero frame with exactly the training-selected pair columns,
# so pairs that never occur in the test documents keep a count of 0.
test_selected_pairs_df = pd.DataFrame(0, index=testDocs.index, columns=selected_pairs, dtype=int)
for col in selected_pairs:
    if col in test_pair_tokens_df.columns:
        test_selected_pairs_df[col] = test_pair_tokens_df[col]

# Assigning thousands of columns one by one leaves the frame split into many
# internal blocks; inserting another column then emits a PerformanceWarning.
# copy() consolidates the frame before 'joined' is added.
test_selected_pairs_df = test_selected_pairs_df.copy()
test_selected_pairs_df['joined'] = test_selected_pairs_df.apply(create_list, axis=1)

  test_selected_pairs_df['joined'] = test_selected_pairs_df.apply(create_list, axis=1)


In [None]:
# test_selected_pairs_3_df = pd.DataFrame(0, index=testDocs.index, columns=selected_pairs_3, dtype=int)
# for col in selected_pairs_3:
#     if col in test_pair_3_tokens_df.columns:
#         test_selected_pairs_3_df[col] = test_pair_3_tokens_df[col]

# test_selected_pairs_3_df['joined'] = test_selected_pairs_3_df.apply(create_list, axis=1)

**POS Tagging Test Data**

In [None]:
# Same 'pos_tag' feature as for the training split, computed on test documents.
testDocs['pos_tag'] = testDocs['preprocess'].apply(lemmatize)

**Padding Test Data**

In [None]:
# Pad the test token and POS sequences to `maximum_length`, mirroring the training split.
for source_col, padded_col in (('vectorized', 'token_padded'), ('pos_tag', 'pos_padded')):
    testDocs[padded_col] = testDocs[source_col].apply(padding, args=(maximum_length,))

**Dataframe to Array**

# **Convert Features To Arrays**

**Train Data**

In [None]:
# Convert every per-document list column of the training split into a NumPy array
# (one row per document) so it can be fed to Keras.
trainDocs_X = np.array(trainDocs['token_padded'].tolist())   # padded token ids
trainPos_X = np.array(trainDocs['pos_padded'].tolist())      # padded 'pos_tag' values
trainTf_X = np.array(trainDocs['tf'].tolist())               # term-frequency features
trainTfidf_X = np.array(trainDocs['tfidf'].tolist())         # TF-IDF features
trainTermDict_X = np.array(trainTTW['concatinated'].tolist())  # concatenated per-topic term values
trainPairs_X =  np.array(train_selected_pairs_df['joined'].tolist())  # selected token-pair counts
# trainPairs_3_X =  np.array(train_selected_pairs_3_df['joined'].tolist())
train_Y = np.array(trainTopics['one_hot'].tolist())          # one-hot topic labels

# convert pairs into np.array
# trainDocs_tuple_2 = np.array([np.array(pair) for doc in trainDocs['tuple_2'] for pair in doc])
# trainDocs_tuple_3 = np.array([np.array(pair) for doc in trainDocs['tuple_3'] for pair in doc])

In [None]:
# Sanity check: all feature arrays must share the same number of rows (documents).
for feature_array in (trainDocs_X, trainPos_X, trainPairs_X):
    print(feature_array.shape)

(1279, 100)
(1279, 100)
(1279, 4792)


**Validation Data**

In [None]:
# Convert every per-document list column of the validation split into a NumPy array.
valDocs_X = np.array(valDocs['token_padded'].tolist())   # padded token ids
valPos_X = np.array(valDocs['pos_padded'].tolist())      # padded 'pos_tag' values
valTf_X = np.array(valDocs['tf'].tolist())               # term-frequency features
valTfidf_X = np.array(valDocs['tfidf'].tolist())         # TF-IDF features
valTermDict_X = np.array(valTTW['concatinated'].tolist())  # concatenated per-topic term values
valPairs_X =  np.array(val_selected_pairs_df['joined'].tolist())  # selected token-pair counts
# valPairs_3_X =  np.array(val_selected_pairs_3_df['joined'].tolist())
# NOTE(review): `valTopcis` looks like a typo of `valTopics`; it must match the name used where the frame is created.
val_Y = np.array(valTopcis['one_hot'].tolist())          # one-hot topic labels

# convert pairs into np.array
# valDocs_tuple_2 = np.array([np.array(pair) for doc in valDocs['tuple_2'] for pair in doc])
# valDocs_tuple_3 = np.array([np.array(pair) for doc in valDocs['tuple_3'] for pair in doc])

**Test Data**

In [None]:
# Convert every per-document list column of the test split into a NumPy array.
testDocs_X = np.array(testDocs['token_padded'].tolist())   # padded token ids
testPos_X = np.array(testDocs['pos_padded'].tolist())      # padded 'pos_tag' values
testTf_X = np.array(testDocs['tf'].tolist())               # term-frequency features
testTfidf_X = np.array(testDocs['tfidf'].tolist())         # TF-IDF features
testTermDict_X = np.array(testTTW['concatinated'].tolist())  # concatenated per-topic term values
testPairs_X =  np.array(test_selected_pairs_df['joined'].tolist())  # selected token-pair counts
# testPairs_3_X =  np.array(test_selected_pairs_3_df['joined'].tolist())
test_y = np.array(testTopics['one_hot'].tolist())          # label indicator rows
# Index of the first maximal entry of each label row.
label_y = np.argmax(test_y, axis=1)

# **Model**

## CNN Model

In [None]:
def cnn_model():
    """Build and compile the four-branch 1-D CNN classifier.

    Every branch is a stack of Conv1D(relu) + MaxPooling1D pairs followed by Flatten:

    * text  -- token ids of length ``maximum_length`` through a trainable
      ``Embedding(vocab_size, embedding_dim)``; filters 256/128/64, kernel 5, pool 2
    * POS   -- ``(num_extra_features, 1)`` input; filters 64/32, kernel 5, pool 2
    * TF    -- ``(len(unique_terms), 1)`` input; filters 128/64/32, kernel 5, pool 2
    * pairs -- ``(len(selected_pairs), 1)`` input; filters 256/128/64, kernel 3, pool 3

    The flattened branches are concatenated and passed through
    Dense(128) -> Dropout(0.1) -> Dense(64) -> Dropout(0.1) -> Dense(32) and a
    softmax layer with ``num_classes`` units.

    Reads the notebook globals ``maximum_length``, ``vocab_size``, ``embedding_dim``,
    ``num_extra_features``, ``unique_terms``, ``selected_pairs`` and ``num_classes``.

    Returns:
        A compiled Keras ``Model`` (Adam with learning rate 0.001, categorical
        cross-entropy loss, accuracy metric) whose inputs are ordered
        ``[text, POS, TF, pairs]``.
    """
    def conv_branch(tensor, filter_counts, kernel, pool):
        """Apply one Conv1D(relu) + MaxPooling1D pair per filter count, then flatten."""
        for n_filters in filter_counts:
            tensor = Conv1D(filters=n_filters, kernel_size=kernel, activation='relu')(tensor)
            tensor = MaxPooling1D(pool_size=pool)(tensor)
        return Flatten()(tensor)

    # Text branch: embedding is learned from scratch (no pre-trained weights).
    text_input = Input(shape=(maximum_length,))
    embedded_text = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(text_input)
    text_features = conv_branch(embedded_text, (256, 128, 64), kernel=5, pool=2)

    # POS branch: single-channel sequence of extra features.
    pos_layer_input = Input(shape=(num_extra_features, 1))
    pos_features = conv_branch(pos_layer_input, (64, 32), kernel=5, pool=2)

    # Term-frequency branch.
    tf_layer_input = Input(shape=(len(unique_terms), 1))
    tf_features = conv_branch(tf_layer_input, (128, 64, 32), kernel=5, pool=2)

    # Selected token-pair branch (smaller kernel, larger pooling window).
    pair_layer_input = Input(shape=(len(selected_pairs), 1))
    pair_features = conv_branch(pair_layer_input, (256, 128, 64), kernel=3, pool=3)

    merged = concatenate([text_features, pos_features, tf_features, pair_features])

    # Classification head.
    hidden = Dense(128, activation='relu')(merged)
    hidden = Dropout(0.1)(hidden)
    hidden = Dense(64, activation='relu')(hidden)
    hidden = Dropout(0.1)(hidden)
    hidden = Dense(32, activation='relu')(hidden)
    output_layer = Dense(num_classes, activation='softmax')(hidden)

    model = Model(
        inputs=[text_input, pos_layer_input, tf_layer_input, pair_layer_input],
        outputs=output_layer,
    )
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

## CNN With Repetition

In [None]:
# Build a fresh (untrained) network just to display its architecture: on a
# top-to-bottom run no `model` object exists yet at this point, so calling
# model.summary() here would fail or describe a stale model from an earlier session.
cnn_model().summary()

In [None]:
# Number of independent train/evaluate repetitions whose metrics are averaged afterwards.
N_REPETITIONS = 10
# A class counts as predicted on the test set when its softmax probability reaches
# this cut-off, so several classes can be predicted for one document.
PREDICTION_THRESHOLD = 0.05

# Per-repetition validation metrics (single-label, via argmax).
val_accuracies = []
val_precisions = []
val_recalls = []
val_f1_scores = []
# Per-repetition test metrics (thresholded indicator rows).
accuracies = []
precisions = []
recalls = []
f1_scores = []
f1_scores_micro = []
f1_scores_macro = []

# Input lists and true validation labels do not change between repetitions.
cnn_train_inputs = [trainDocs_X, trainPos_X, trainTf_X, trainPairs_X]
cnn_val_inputs = [valDocs_X, valPos_X, valTf_X, valPairs_X]
cnn_test_inputs = [testDocs_X, testPos_X, testTf_X, testPairs_X]
val_true_labels = np.argmax(val_Y, axis=1)

for i in range(N_REPETITIONS):
    print(f'Iteration {i+1}')
    # A new, freshly initialised network for every repetition.
    model = cnn_model()
    model.fit(cnn_train_inputs, train_Y, epochs=5, batch_size=64, validation_data=(cnn_val_inputs, val_Y))

    # Validation: compare the most probable class with the true class.
    val_predictions = model.predict(cnn_val_inputs)
    val_pred_labels = np.argmax(val_predictions, axis=1)
    val_precisions.append(precision_score(val_true_labels, val_pred_labels, average=None))
    val_recalls.append(recall_score(val_true_labels, val_pred_labels, average=None))
    val_f1_scores.append(f1_score(val_true_labels, val_pred_labels, average=None))
    val_accuracies.append(accuracy_score(val_true_labels, val_pred_labels))

    # Test: threshold the class probabilities into a 0/1 indicator row per document.
    predictions = pd.DataFrame(model.predict(cnn_test_inputs))
    predictions['classes_probs'] = predictions.apply(lambda row: list(row), axis=1)
    predictions['pred_label'] = predictions['classes_probs'].apply(
        lambda lst: (np.array(lst) >= PREDICTION_THRESHOLD).astype(int)
    )

    predicted_y = np.array(predictions['pred_label'].tolist())

    accuracies.append(accuracy_score(test_y, predicted_y))
    precisions.append(precision_score(test_y, predicted_y, average=None))
    recalls.append(recall_score(test_y, predicted_y, average=None))
    f1_scores.append(f1_score(test_y, predicted_y, average=None))
    f1_scores_micro.append(f1_score(test_y, predicted_y, average='micro'))
    f1_scores_macro.append(f1_score(test_y, predicted_y, average='macro'))

Iteration 1
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Iteration 2
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Iteration 3
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Iteration 4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Iteration 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Iteration 6
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

In [None]:
# Average every metric over the repetitions; per-class metrics are averaged
# column-wise (axis=0), scalar metrics over the whole list.
report_sections = (
    (
        ("accuracy:", np.mean(val_accuracies)),
        ("f1_score:", np.mean(val_f1_scores, axis=0)),
        ("precision:", np.mean(val_precisions, axis=0)),
        ("recall:", np.mean(val_recalls, axis=0)),
    ),
    (
        ("test accuracy:", np.mean(accuracies)),
    ),
    (
        ("test f1_score:", np.mean(f1_scores, axis=0)),
        ("test precision:", np.mean(precisions, axis=0)),
        ("test recall:", np.mean(recalls, axis=0)),
    ),
    (
        ("f1_score_micro:", np.mean(f1_scores_micro)),
        ("f1_score_macro:", np.mean(f1_scores_macro)),
    ),
)

for section_number, section in enumerate(report_sections):
    # Blank separator between sections, not before the first one.
    if section_number > 0:
        print("\n")
    for caption, value in section:
        print(caption, value)

accuracy: 0.913125
f1_score: [0.88520736 0.75663534 0.87119052 0.94142929]
precision: [0.88758673 0.99       0.88077728 0.9342286 ]
recall: [0.886      0.66363636 0.87241379 0.95      ]


test accuracy: 0.89375


test f1_score: [0.92027779 0.81113527 0.82220716 0.94592888]
test precision: [0.91642717 0.99166667 0.80196482 0.93298032]
test recall: [0.928125   0.73076923 0.86052632 0.96086957]


f1_score_micro: 0.922321564524894
f1_score_macro: 0.8748872760218858


In [None]:
# Persist the current `model` object (the last one assigned by a preceding cell) in Keras format.
model.save('cnn_exp21.keras')

In [None]:

# Download the file written by the preceding save cell (the old name 'cnn_exp17.keras'
# did not match what was saved).
# NOTE(review): `files` comes from `google.colab`, whose import is commented out at the
# top of the notebook; re-enable it when running on Colab.
files.download('cnn_exp21.keras')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## CNN Single Run

In [None]:
# Single-run variant: CNN over the embedded text, plain dense branches for the
# POS and token-pair features (no TF branch), trained once for 5 epochs.
text_input = Input(shape=(maximum_length,))
# Embedding learned from scratch; the GloVe-initialised alternative is kept below, disabled.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(text_input)

# embedding_layer = Embedding(input_dim=vocab_size,
#                             output_dim=embedding_dim,
#                             weights=[embedding_matrix],
#                             input_length=maximum_length,
#                             trainable=False)(text_input)

# Convolutional layers for text processing
conv_layer_1 = Conv1D(filters=256, kernel_size=5, activation='relu')(embedding_layer)
pooling_layer_1 = MaxPooling1D(pool_size=2)(conv_layer_1)
conv_layer_2 = Conv1D(filters=128, kernel_size=5, activation='relu')(pooling_layer_1)
pooling_layer_2 = MaxPooling1D(pool_size=2)(conv_layer_2)
conv_layer_3 = Conv1D(filters=64, kernel_size=5, activation='relu')(pooling_layer_2)
pooling_layer_3 = MaxPooling1D(pool_size=2)(conv_layer_3)
flatten_layer = Flatten()(pooling_layer_3)

# POS tags input for extra features
pos_layer_input = Input(shape=(num_extra_features,))
dense_pos_layer = Dense(512, activation='relu')(pos_layer_input)

# pairs input for extra features
pair_layer_input = Input(shape=(len(selected_pairs),))
dense_pair_layer = Dense(512, activation='relu')(pair_layer_input)

# Fuse the three branches into one feature vector.
merged = concatenate([flatten_layer, dense_pos_layer, dense_pair_layer])

# Additional layers for further processing
dense_layer_1 = Dense(128, activation='relu')(merged)
dropout_layer_1 = Dropout(0.1)(dense_layer_1)
dense_layer_2 = Dense(64, activation='relu')(dropout_layer_1)
dropout_layer_2 = Dropout(0.1)(dense_layer_2)
dense_layer_3 = Dense(32, activation='relu')(dropout_layer_2)
output_layer = Dense(num_classes, activation='softmax')(dense_layer_3)

# Define the model
# NOTE: this rebinds the notebook-level `model`, replacing any model left by the repetition loop.
model = Model(inputs=[text_input, pos_layer_input, pair_layer_input], outputs=output_layer)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'],)

# Train the model
# Inputs must be passed in the same order as in Model(inputs=...).
model.fit([trainDocs_X, trainPos_X, trainPairs_X], train_Y, epochs=5, batch_size=64, validation_data=([valDocs_X, valPos_X, valPairs_X], val_Y))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x792e4ef2f730>

## MLP

In [None]:
# main text input
text_input = Input(shape=(maximum_length,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(text_input)
lstm_layer = LSTM(units=512)(embedding_layer)

# text_input = Input(shape=(maximum_length,))
# embedding_layer = Embedding(input_dim=vocab_size,
#                             output_dim=embedding_dim,
#                             weights=[embedding_matrix],
#                             input_length=maximum_length,
#                             trainable=False)(text_input)
# lstm_layer = LSTM(units=1024)(embedding_layer)


# POS tags input for extra features
pos_layer_input = Input(shape=(num_extra_features,))
dense_pos_layer = Dense(512, activation='relu')(pos_layer_input)

# TF input for extra features
num_unique_terms = len(unique_terms)
tf_layer_input = Input(shape=(num_unique_terms,))
dense_tf_layer = Dense(512, activation='relu')(tf_layer_input)

# TFiDF input for extra features
num_unique_terms = len(unique_terms)
tfidf_layer_input = Input(shape=(num_unique_terms,))
dense_tfidf_layer = Dense(512, activation='relu')(tfidf_layer_input)

# Dictionary of terms layer
doc_term_dic_length = 4 * maximum_length
term_dict_layer_input = Input(shape=(doc_term_dic_length,))
dense_term_dict_layer = Dense(512, activation='relu')(term_dict_layer_input)

# # pairs input for extra features
pair_layer_input = Input(shape=(len(selected_pairs),))
dense_pair_layer = Dense(512, activation='relu')(pair_layer_input)

# pairs input for extra features
# pair_3_layer_input = Input(shape=(len(selected_pairs_3),))
# dense_pair_3_layer = Dense(512, activation='relu')(pair_3_layer_input)

# # pair terms (tuples size 2) text input
# text_pairs_input = Input(shape=(None,2))
# text_pairs_flatten_layer = Flatten()(text_pairs_input)
# dense_text_pairs_layer = Dense(256, activation='relu')(text_pairs_flatten_layer)

# Merge the outputs of the main text input and auxiliary input
merged = concatenate([lstm_layer, dense_pos_layer, dense_pair_layer])

# Additional layers for further processing
dense_layer_1 = Dense(128, activation='relu')(merged)
dropout_layer_1 = Dropout(0.2)(dense_layer_1)
dense_layer_2 = Dense(64, activation='relu')(dropout_layer_1)
dropout_layer_2 = Dropout(0.2)(dense_layer_2)
dense_layer_3 = Dense(32, activation='relu')(dropout_layer_2)
output_layer = Dense(num_classes, activation='softmax')(dense_layer_3)


# Define the model
model = Model(inputs=[text_input, pos_layer_input, pair_layer_input], outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
# model.fit({'text_input': trainDocs['token_padded'], 'extra_input': trainDocs['pos_padded']},
#           {'output': trainTopics['one_hot']},
#           epochs=10, batch_size=32, validation_split=0.2)
model.fit([trainDocs_X, trainPos_X, trainPairs_X], train_Y, epochs=5, batch_size=64, validation_data=([valDocs_X, valPos_X, valPairs_X], val_Y))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x792e44096140>

In [None]:
# Parallel lists describing all six feature branches in one fixed order
# (text, POS, TF, TF-IDF, term dictionary, pairs): Keras inputs, their branch
# outputs, and the matching train / validation / test arrays.
# NOTE(review): no visible cell consumes these lists -- presumably meant for
# selecting feature subsets per experiment; remove if unused.
inputs_list = [text_input, pos_layer_input, tf_layer_input, tfidf_layer_input, term_dict_layer_input, pair_layer_input]
dense_layers_list = [lstm_layer, dense_pos_layer, dense_tf_layer, dense_tfidf_layer, dense_term_dict_layer, dense_pair_layer]
train_features_list = [trainDocs_X, trainPos_X, trainTf_X, trainTfidf_X, trainTermDict_X, trainPairs_X]
validation_features_list = [valDocs_X, valPos_X, valTf_X, valTfidf_X, valTermDict_X, valPairs_X]
test_features_list = [testDocs_X, testPos_X, testTf_X, testTfidf_X, testTermDict_X, testPairs_X]

# **Evaluate Model**

**Validation**

In [None]:
# Class probabilities of the current three-input model on the validation split.
# (Alternative input set kept for reference: [valTfidf_X, valTermDict_X, valPairs_X].)
val_predictions = model.predict([valDocs_X, valPos_X, valPairs_X])

# Reduce one-hot targets and probability rows to class indices.
val_true_labels = val_Y.argmax(axis=1)
val_pred_labels = val_predictions.argmax(axis=1)

# Per-class precision, recall and F1 (average=None keeps one value per class).
precision, recall, f1 = (
    metric(val_true_labels, val_pred_labels, average=None)
    for metric in (precision_score, recall_score, f1_score)
)

for caption, per_class in (('F1 Score', f1), ('Precision', precision), ('Recall', recall)):
    print(f'{caption}: {per_class}')

F1 Score: [0.89719626 0.4        0.72131148 0.94285714]
Precision: [0.84210526 0.75       0.6875     0.97058824]
Recall: [0.96       0.27272727 0.75862069 0.91666667]


**Test**

In [None]:
# Evaluate the current three-input model on the test split.
eval_test_inputs = [testDocs_X, testPos_X, testPairs_X]
evaluation_results = model.evaluate(eval_test_inputs, test_y)

# Keep the raw class probabilities next to the thresholded labels; the
# 'classes_probs' column is read again by the perplexity cell.
predictions = pd.DataFrame(model.predict(eval_test_inputs))
predictions['classes_probs'] = predictions.apply(lambda prob_row: list(prob_row), axis=1)
# Every class whose probability reaches 0.05 is marked as predicted (0/1 indicator row).
predictions['pred_label'] = predictions['classes_probs'].apply(
    lambda probs: (np.array(probs) >= 0.05).astype(int)
)

predicted_y = np.array(predictions['pred_label'].tolist())

# Per-class scores first, then the two aggregate F-scores.
precision_per_class = precision_score(test_y, predicted_y, average=None)
recall_per_class = recall_score(test_y, predicted_y, average=None)
f1score_per_class = f1_score(test_y, predicted_y, average=None)
micro_f_score = f1_score(test_y, predicted_y, average='micro')
macro_f_score = f1_score(test_y, predicted_y, average='macro')

test_report = (
    ("Test Loss:", evaluation_results[0]),
    ("Test Accuracy:", evaluation_results[1]),
    ("F1-score per class:", f1score_per_class),
    ("Precision per class:", precision_per_class),
    ("Recall per class:", recall_per_class),
    ("Micro F-score:", micro_f_score),
    ("Macro F-score:", macro_f_score),
)
for caption, value in test_report:
    print(caption, value)

# confusion_mat = confusion_matrix(test_y, predicted_y)
# print("Confusion Matrix:")
# print(confusion_mat)

Test Loss: 0.7407768368721008
Test Accuracy: 0.887499988079071
F1-score per class: [0.8641115  0.41935484 0.53225806 0.95111111]
Precision per class: [0.77987421 0.26530612 0.38372093 0.97272727]
Recall per class: [0.96875    1.         0.86842105 0.93043478]
Micro F-score: 0.8320693391115928
Macro F-score: 0.6917088781486893


# **Save Model**

In [None]:
# Persist the current `model` object (the one evaluated above) in Keras format.
model.save('NN_Ver3_Exp3.keras')

# **Confusion Matrix**

In [None]:
# Treat every label column as its own binary problem: walk the columns of the
# true and predicted indicator matrices side by side.
for i, (true_column, predicted_column) in enumerate(zip(test_y.T, predicted_y.T)):
    # average=None returns one score per value present in the column.
    precision_per_class = precision_score(true_column, predicted_column, average=None)
    recall_per_class = recall_score(true_column, predicted_column, average=None)

    print("Precision per class:", precision_per_class)
    print("Recall per class:", recall_per_class)

    cm = confusion_matrix(true_column, predicted_column)
    print(f"Confusion Matrix for label {i}:")
    print(cm)

In [None]:
# Bar chart comparing the F-score of six experiments.
# matplotlib / numpy are already imported in the notebook's import cell, so they
# are not re-imported here.

# F-scores for 6 experiments (values entered by hand from earlier runs)
f_scores = [0.414, 0.601, 0.72, 0.194, 0.215, 0.3]
experiments = ['Exp1', 'Exp2', 'Exp3', 'Exp4', 'Exp5', 'Exp6']

# Create a bar plot, one colour per experiment
fig, ax = plt.subplots()
bars = ax.bar(experiments, f_scores, color=['blue', 'green', 'red', 'purple', 'orange', 'cyan'])

# Add labels and title
ax.set_xlabel('Experiment')
ax.set_ylabel('F-score')
ax.set_title('F-scores of Different Experiments')

# Create a legend with one entry per bar
legend_labels = [f'Experiment {i+1}' for i in range(len(experiments))]
ax.legend(bars, legend_labels)

# Display the plot
plt.show()


In [None]:
# Grouped bar chart: for every experiment a wide grey bar shows the macro
# F-score and four narrow coloured bars in front of it show the per-topic
# F-scores. matplotlib / numpy come from the notebook's import cell.

# Macro F-scores for 6 experiments (background columns; entered by hand)
f_scores = [0.39, 0.38, 0.68, 0.49, 0.62, 0.55]

experiments = ['Exp1', 'Exp2', 'Exp3', 'Exp4', 'Exp5', 'Exp6']

# Per-topic F-scores of each experiment, in the order of `topic_labels`
sub_scores = [
    [0.51, 0.1, 0.26, 0.70],
    [0.54, 0.0, 0.26, 0.72],
    [0.79, 0.55, 0.47, 0.92],
    [0.64, 0.0, 0.5, 0.80],
    [0.70, 0.47, 0.37, 0.93],
    [0.71, 0.47, 0.27, 0.77],
]
topic_labels = ['acq', 'corn', 'crude', 'earn']
topic_colors = ['green', 'red', 'purple', 'orange']

# Width of one narrow (per-topic) bar
bar_width = 0.1

# Left-most sub-bar of experiment i is centred on x = i; the others follow at
# multiples of bar_width, so the group's centre lies 1.5 bar widths to the right.
main_bar_positions = np.arange(len(experiments))
topic_offsets = bar_width * np.arange(len(topic_labels))
group_centres = main_bar_positions + bar_width * 1.5

# Create the plot
fig, ax = plt.subplots()

# Background columns: centred on the sub-bar group (previously centred on x = i,
# which left them shifted relative to the coloured bars and the tick labels).
ax.bar(group_centres, f_scores, color='grey', width=bar_width * 4, label='Macro F-score')

# Per-topic columns; only the first group carries legend labels so every topic
# appears exactly once in the legend.
for i, sub_score in enumerate(sub_scores):
    legend_kwargs = {'label': topic_labels} if i == 0 else {}
    ax.bar(main_bar_positions[i] + topic_offsets, sub_score, width=bar_width,
           color=topic_colors, **legend_kwargs)

# Add labels and title
ax.set_xlabel('Experiment')
ax.set_ylabel('F-score')
ax.set_title('F-scores of Different Experiments')
ax.set_xticks(group_centres)
ax.set_xticklabels(experiments)

# Legend outside the axes, top right
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')

plt.savefig('experiment_f_scores_NN4_3features.png', dpi=300, bbox_inches='tight')

# Display the plot
plt.show()



# **Coherence**

In [None]:
# Topic-coherence (u_mass) computation with gensim on the test documents.
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel

# Index of the first predicted label (first 1 in each indicator row) per document,
# wrapped as a one-element list.
# NOTE(review): CoherenceModel's `topics` is normally a list of topics, each a list
# of top WORDS; here it receives one class index per document -- confirm this is
# the intended input, otherwise the coherence values are not meaningful.
p_topics = np.argmax(predicted_y == 1, axis=1)
p_topics = [[item] for item in p_topics]

# Token <-> id mapping built from the preprocessed test documents.
c_dictionary = Dictionary(testDocs['preprocess'])
# c_dictionary

# Bag-of-words representation of every test document.
c_corpus = [c_dictionary.doc2bow(doc) for doc in testDocs['preprocess']]
# c_corpus

# Create CoherenceModel
# NOTE: rebinds `cm`, which the confusion-matrix cell used for a matrix.
cm = CoherenceModel(topics=p_topics, corpus=c_corpus, dictionary=c_dictionary, coherence='u_mass')
cm.get_coherence_per_topic()
# Get coherence value
# coherence = cm.get_coherence()

# **Perplexity**

In [None]:
# Perplexity of the classifier on the test set, computed as exp(cross-entropy).
from sklearn.metrics import log_loss

# Predicted class-probability rows stored by the test-evaluation cell.
pred_prob = predictions['classes_probs'].to_list()
# Only the FIRST topic of every document is used as its true label.
# NOTE(review): log_loss infers the label set from true_y; if a class is missing
# from the test labels the probability columns will not line up -- consider
# passing `labels=` explicitly.
true_y = [[sublist[0]] for sublist in testTopics['topics_lst']]
cross_entropy_loss = log_loss(np.array(true_y), np.array(pred_prob))
perplexity = np.exp(cross_entropy_loss)
perplexity

In [None]:
# Display the predicted probability rows as one array (prints the whole array).
np.array(pred_prob)

In [None]:
# word_topic = np.zeros(shape=(vocab_size, len(favorite_topics)))

# for doc_index in trainDocs.index:
#     topic_list = trainTopics.loc[doc_index]['topics_lst']
#     token_list = trainDocs.loc[doc_index]['vectorized']
#     for token in token_list:
#         for topic in topic_list:
#             word_topic[token, topic] += 1

# predicted_y

# testDocs['preprocess']

# import gensim
# from gensim.corpora.dictionary import Dictionary
# from gensim.models.coherencemodel import CoherenceModel
# c_dictionary = Dictionary(testDocs['preprocess'])

# c_corpus = [c_dictionary.doc2bow(doc) for doc in testDocs['preprocess']]
# c_corpus

# Create CoherenceModel
# cm = CoherenceModel(topics=list_of_lists, corpus=c_corpus, dictionary=c_dictionary, coherence='u_mass')

# Overall coherence of the CoherenceModel `cm` built in the Coherence section.
coherence = cm.get_coherence()

# a = np.argmax(predicted_y == 1, axis=1)
# list_of_lists = [[item] for item in a]
# list_of_lists
# Display the value as the cell output.
coherence

In [None]:
# `list_of_lists` only ever existed in commented-out scratch code, so evaluating it
# raises NameError on a fresh run. The same data (one single-element list with the
# first predicted label index per test document) lives in `p_topics`, built in the
# Coherence section; show a short preview instead of the full list.
p_topics[:10]

# **GloVe Test Section**

--2024-06-24 09:57:18--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-06-24 09:57:18--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-06-24 09:57:18--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# 2. Load GloVe embeddings


In [None]:
# `Tokenizer` is not imported anywhere at the top of the notebook (only the
# `text` module is), so bring it into scope here to keep this demo cell runnable
# on a fresh kernel.
from tensorflow.keras.preprocessing.text import Tokenizer

# Toy example: fit a word index on one sentence, then encode a shorter one.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(['My name is Ali Hossein new all you can do call me john PC security. I love Ali'])
tokenizer.texts_to_sequences(['My name is Ali'])


[[2, 3, 4, 1]]

In [None]:
# Show the word -> integer id mapping learned by the toy tokenizer above.
tokenizer.word_index

{'ali': 1,
 'my': 2,
 'name': 3,
 'is': 4,
 'hossein': 5,
 'new': 6,
 'all': 7,
 'you': 8,
 'can': 9,
 'do': 10,
 'call': 11,
 'me': 12,
 'john': 13,
 'pc': 14,
 'security': 15,
 'i': 16,
 'love': 17}

In [None]:
# Shape of the GloVe weight matrix; it is passed as the weights of
# Embedding(input_dim=vocab_size, output_dim=embedding_dim) in the next cell.
embedding_matrix.shape

(7503, 100)

In [None]:

# Six-branch model with a frozen GloVe embedding in front of the LSTM.

# These two sizes were previously defined only inside the earlier LSTM/MLP cell,
# so running this cell without it failed with
# "NameError: name 'num_unique_terms' is not defined". Define them here with
# the same expressions so the cell is self-contained.
num_unique_terms = len(unique_terms)
# NOTE(review): the factor 4 presumably equals len(favorite_topics) -- confirm.
doc_term_dic_length = 4 * maximum_length

# Text branch: pre-trained, non-trainable GloVe vectors followed by an LSTM.
text_input = Input(shape=(maximum_length,))
embedding_layer = Embedding(input_dim=vocab_size,
                            output_dim=embedding_dim,
                            weights=[embedding_matrix],
                            input_length=maximum_length,
                            trainable=False)(text_input)
lstm_layer = LSTM(units=512)(embedding_layer)

# POS tags input for extra features
pos_layer_input = Input(shape=(num_extra_features,))
dense_pos_layer = Dense(512, activation='relu')(pos_layer_input)

# TF input for extra features
tf_layer_input = Input(shape=(num_unique_terms,))
dense_tf_layer = Dense(512, activation='relu')(tf_layer_input)

# TFiDF input for extra features
tfidf_layer_input = Input(shape=(num_unique_terms,))
dense_tfidf_layer = Dense(512, activation='relu')(tfidf_layer_input)

# Dictionary of terms layer
term_dict_layer_input = Input(shape=(doc_term_dic_length,))
dense_term_dict_layer = Dense(512, activation='relu')(term_dict_layer_input)

# Pairs input for extra features
pair_layer_input = Input(shape=(len(selected_pairs),))
dense_pair_layer = Dense(512, activation='relu')(pair_layer_input)

# Merge the outputs of the main text input and auxiliary input
merged = concatenate([lstm_layer, dense_pos_layer, dense_tf_layer, dense_tfidf_layer, dense_term_dict_layer, dense_pair_layer])

# Additional layers for further processing (no dropout in this variant)
dense_layer_1 = Dense(128, activation='relu')(merged)
dense_layer_2 = Dense(64, activation='relu')(dense_layer_1)
dense_layer_3 = Dense(32, activation='relu')(dense_layer_2)
output_layer = Dense(num_classes, activation='softmax')(dense_layer_3)

# Define the model
model = Model(inputs=[text_input, pos_layer_input, tf_layer_input, tfidf_layer_input, term_dict_layer_input, pair_layer_input], outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
# Arrays are passed in the same order as the inputs of Model(...).
model.fit([trainDocs_X, trainPos_X, trainTf_X, trainTfidf_X, trainTermDict_X, trainPairs_X], train_Y, epochs=10, batch_size=32, validation_data=([valDocs_X, valPos_X, valTf_X, valTfidf_X, valTermDict_X, valPairs_X], val_Y))


NameError: name 'num_unique_terms' is not defined