In [1]:
import pandas as pd
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim import corpora, models
from torch_geometric.nn import GCNConv
import networkx as nx
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, SAGEConv, GATConv
from torch_geometric.data import Data
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from torch_geometric.utils import from_networkx
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter
import string
import random
import torch_geometric
import os
import time
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

  from .autonotebook import tqdm as notebook_tqdm


# Tokenization, stopword removal, and stemming or lemmatization

In [2]:
# Load your dataset
data = pd.read_csv('input_data.csv')
original_data_with_topics = data.copy()

# Download required NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)

    # Convert to lowercase
    tokens = [token.lower() for token in tokens]

    # Remove non-alphabetic characters
    tokens = [token for token in tokens if token.isalpha()]

    # Remove stopwords
    #stop_words = set(stopwords.words('english'))
    #tokens = [token for token in tokens if token not in stop_words]

    # Define a list of nonsense words
    nonsense_words = ["oh", "umm", "oops", "hi", "lol", "rofl", "lmao", "wtf", "omg", 
                     "ok", "right", "uh", "huh", "yep", "ohh", "hmm", "ah", "god", "shit", 
                      "like", "say", "oop", "yeah", "yes", "xxxx", "ca", "na", "ohhh", "yo", "wow", "whoa", 
                      "shit", "sucking", "uhh", "inaudible"]

    # Remove nonsense words
    tokens = [token for token in tokens if token not in nonsense_words]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens

# Apply the preprocessing function to the "All_Content" column
data['Processed_Content'] = data['All_Content'].apply(preprocess_text)

# filter the rows that have an empty list in col1
rows_to_drop = data['Processed_Content'].apply(lambda x: len(x) == 0)
dropped_indices = data[rows_to_drop].index

# drop the rows that meet the condition
data = data[~rows_to_drop]

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/percyjardine/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/percyjardine/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/percyjardine/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Find word frequency**

In [3]:
# Step 1: Concatenate all the strings in the "Processed_Content" column into one long string
all_content = ' '.join(data['Processed_Content'].explode())

# Step 2: Convert the long string to lowercase
all_content = all_content.lower()

# Step 3: Remove all punctuation and non-alphanumeric characters
translator = str.maketrans('', '', string.punctuation + '’')
all_content = all_content.translate(translator)

# Step 4: Split the long string into individual words
words = all_content.split()

# Step 5: Count the frequency of each word
word_counts = Counter(words)

# Print the 100 most common words and their frequencies
print(word_counts.most_common(100))

[('the', 2848), ('i', 1287), ('so', 1282), ('it', 1024), ('is', 966), ('that', 928), ('and', 905), ('to', 854), ('wa', 743), ('we', 712), ('a', 622), ('have', 619), ('this', 614), ('you', 554), ('kelley', 516), ('he', 497), ('do', 476), ('at', 471), ('no', 452), ('what', 451), ('one', 446), ('in', 432), ('there', 408), ('but', 406), ('smith', 382), ('scott', 376), ('think', 345), ('of', 341), ('on', 333), ('can', 307), ('apartment', 306), ('jones', 278), ('then', 277), ('just', 271), ('here', 266), ('key', 256), ('who', 251), ('because', 236), ('not', 228), ('from', 213), ('found', 213), ('miss', 211), ('knife', 203), ('ellington', 197), ('time', 195), ('with', 189), ('for', 187), ('had', 180), ('be', 178), ('they', 176), ('margaret', 169), ('know', 167), ('all', 166), ('elwood', 166), ('she', 161), ('about', 160), ('maybe', 157), ('did', 153), ('bank', 152), ('are', 151), ('when', 146), ('or', 145), ('my', 142), ('ha', 141), ('person', 139), ('president', 138), ('well', 137), ('howard

In [4]:
data

Unnamed: 0,PlayerID,GroupID,Condition,All_Content,information,grounds,claim,organization,query,social,strategy,Processed_Content
0,3,1,Desktop,Alright. Is that OK that we can hear you guys ...,0,0,0,0,1,0,0,"[alright, is, that, that, we, can, hear, you, ..."
2,7,1,Desktop,This is overwhelming.,0,0,0,0,0,0,0,"[this, is, overwhelming]"
3,9,1,Desktop,Oh wow. This is. So.,0,0,0,0,0,0,0,"[this, is, so]"
4,3,1,Desktop,So do we.,0,0,0,0,0,0,0,"[so, do, we]"
5,9,1,Desktop,I think all you could do 'cause I mean this is...,0,0,0,1,1,0,1,"[i, think, all, you, could, do, i, mean, this,..."
...,...,...,...,...,...,...,...,...,...,...,...,...
6175,2,7,Desktop,Janitor.,0,0,0,0,0,0,0,[janitor]
6176,5,7,Desktop,Who supplied the weapon?,0,0,1,0,1,0,0,"[who, supplied, the, weapon]"
6177,2,7,Desktop,The first note. You can see the first...,0,0,0,0,0,0,0,"[the, first, note, you, can, see, the, first]"
6179,4,7,Desktop,Jaguar.,0,0,0,0,0,0,0,[jaguar]


# Construct a document-term matrix

**To calculate the TF-IDF weight for a term in a document (cell (i, j) in the matrix), we need to compute the Term Frequency (TF) and Inverse Document Frequency (IDF) values and then multiply them together. Here's a step-by-step explanation of how to do this:**

TF(i, j) = (Number of times term j appears in document i) / (Total number of terms in document i)

IDF(j) = log( (Total number of documents) / (Number of documents containing term j) )

**Calculate TF-IDF weight for term j in document i, the TF-IDF weight is obtained by multiplying the TF and IDF values:**

TF-IDF(i, j) = TF(i, j) * IDF(j)3

**The resulting TF-IDF weight captures the term's importance within the individual document and across the entire collection of documents.**

**1. n grams tokenization**

In [5]:
def custom_tokenizer(text):
    return text.split()

def custom_preprocessor(text):
    return text

In [6]:
# Convert the 'Processed_Content' column back to string format
data['Processed_Content_String'] = data['Processed_Content'].apply(lambda x: ' '.join(x))

# Initialize TfidfVectorizer
#By setting the ngram_range parameter to (1, 2), 
#the vectorizer will consider both single words (uni-grams) and bi-grams. 
#If you want to include tri-grams as well, you can set the ngram_range to (1, 3)
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, preprocessor=custom_preprocessor, ngram_range=(2, 3))
document_term_matrix = vectorizer.fit_transform(data['Processed_Content_String'])
document_term_matrix_df = pd.DataFrame(document_term_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Save the document-term matrix to a CSV file
document_term_matrix_df.to_csv('document_term_matrix.csv', index=False)



**2. uni-gram tokenization**

In [7]:
# Convert the 'Processed_Content' column back to string format
#data['Processed_Content_String'] = data['Processed_Content'].apply(lambda x: ' '.join(x))

# Initialize TfidfVectorizer
#vectorizer = TfidfVectorizer()
#document_term_matrix = vectorizer.fit_transform(data['Processed_Content_String'])

# Save the document-term matrix and feature names to a dataframe
#document_term_matrix_df = pd.DataFrame(document_term_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Save the document-term matrix to a CSV file
#document_term_matrix_df.to_csv('document_term_matrix.csv', index=False)

# Desktop or VR condition

In [8]:
scaling_factor = 10
data['Condition_Scaled'] = data['Condition'].apply(lambda x: scaling_factor if x == 'VR' else 0)
document_term_matrix_df['Condition_Scaled'] = data['Condition_Scaled'].values

In [9]:
np.unique(document_term_matrix_df['Condition_Scaled'])

array([ 0, 10])

In [10]:
document_term_matrix_df

Unnamed: 0,a a,a a clear,a a concept,a a for,a a grudge,a a hippie,a a kind,a a loose,a agnes,a agnes can,...,yours is,yup he,yup he ha,yuwei albert,yuwei albert greenbag,yuwei shui,yuwei shui anything,zoom please,zoom please no,Condition_Scaled
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5092,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


# Compute the document similarity and create a document graph.

It is reasonable to calculate the cosine similarity for the TF-IDF matrix. In fact, it is a common technique in text mining and information retrieval to measure the similarity between documents based on their term frequency-inverse document frequency (TF-IDF) vectors.

TF-IDF is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. By calculating the cosine similarity between the TF-IDF vectors of documents, we can measure how similar the documents are in terms of their word usage and importance.

Cosine similarity is particularly well-suited for this task because it normalizes the vectors before calculating the similarity, making it less sensitive to the length of the documents. This means that even if two documents have different lengths but use similar words with similar importance, their cosine similarity will still be high.

By computing the cosine similarity of the TF-IDF matrix and creating a graph based on this similarity, we can capture the relationships between the documents in the dataset and use these relationships for tasks such as clustering, classification, or generating embeddings with graph neural networks.

In [11]:
# Create a new DataFrame that only contains the text features
text_features_df = document_term_matrix_df.drop(columns=["Condition_Scaled"])

# Compute document similarity using cosine similarity
similarity_matrix = cosine_similarity(text_features_df)

# Threshold the similarity matrix to create adjacency matrix
threshold = 0.1
adjacency_matrix = (similarity_matrix > threshold).astype(int)

#document_graph = nx.from_numpy_matrix(adjacency_matrix) for networkx below 3.0
document_graph = nx.DiGraph(adjacency_matrix)

# Train a Graph Convolutional Network (GCN) on the document graph.

In [12]:
def graph_contrastive_loss(out, edge_index, neg_sampling_ratio=5, margin=1.0):
    pos_loss = F.mse_loss(out[edge_index[0]], out[edge_index[1]])

    num_nodes = out.shape[0]
    neg_indices = torch.randint(0, num_nodes, (2, neg_sampling_ratio * edge_index.shape[1]))
    neg_loss = F.relu(margin - F.mse_loss(out[neg_indices[0]], out[neg_indices[1]], reduction='none')).mean()

    return pos_loss + neg_loss


In [13]:
start_time = time.time()
# Add the set_seed function
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

#The purpose of using both GCN and GAT is to leverage the strengths of both methods: 
#GCN for local graph structure and 
#GAT for capturing more global structure through attention mechanisms.
class ComplexTopicGNN(torch.nn.Module):
    def __init__(self, num_features, hidden_channels):
        super(ComplexTopicGNN, self).__init__()
        self.gcn1 = GCNConv(num_features, hidden_channels)
        self.gcn2 = GCNConv(hidden_channels, hidden_channels)
        self.gat1 = GATConv(hidden_channels, hidden_channels, heads=2)
        self.gat2 = GATConv(hidden_channels * 2, hidden_channels, heads=1)
        self.lin1 = torch.nn.Linear(1, 32)
        self.lin2 = torch.nn.Linear(hidden_channels + 32, 32)

    def forward(self, data):
        x, edge_index = data.x[:, :-1], data.edge_index
        condition = data.x[:, -1].view(-1, 1)
        
        x = self.gcn1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)

        x = self.gcn2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)

        x = self.gat1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.5, training=self.training)

        x = self.gat2(x, edge_index)
        x = F.elu(x)
        
        condition = self.lin1(condition)
        condition = F.relu(condition)

        x = torch.cat((x, condition), dim=1)
        x = self.lin2(x)
        return x


# Set the seed value
seed = 42
set_seed(seed)
torch_geometric.seed_everything(seed)

# Convert the document graph to PyTorch Geometric format
pyg_data = from_networkx(document_graph)
pyg_data.x = torch.tensor(document_term_matrix_df.to_numpy(), dtype=torch.float)

# Initialize and train the GCN model
num_features = document_term_matrix_df.shape[1] - 1
hidden_channels = 32
model = ComplexTopicGNN(num_features, hidden_channels)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
num_epochs = 1000

model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    out = model(pyg_data)
    loss = graph_contrastive_loss(out, pyg_data.edge_index)
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch: {epoch + 1}, Loss: {loss.item()}')

document_embeddings = out.detach().numpy()
end_time = time.time()

total_time = end_time - start_time
print(f"Execution time: {total_time} seconds")

Epoch: 10, Loss: 0.9384158849716187
Epoch: 20, Loss: 0.5793344974517822
Epoch: 30, Loss: 0.4834437966346741
Epoch: 40, Loss: 0.4169653654098511
Epoch: 50, Loss: 0.39129966497421265
Epoch: 60, Loss: 0.3799734115600586
Epoch: 70, Loss: 0.3714906871318817
Epoch: 80, Loss: 0.3686862885951996
Epoch: 90, Loss: 0.366213858127594
Epoch: 100, Loss: 0.3636719882488251
Epoch: 110, Loss: 0.36402755975723267
Epoch: 120, Loss: 0.3605881333351135
Epoch: 130, Loss: 0.3610188364982605
Epoch: 140, Loss: 0.36048227548599243
Epoch: 150, Loss: 0.3612399399280548
Epoch: 160, Loss: 0.3616284132003784
Epoch: 170, Loss: 0.3650686740875244
Epoch: 180, Loss: 0.3596373498439789
Epoch: 190, Loss: 0.35949674248695374
Epoch: 200, Loss: 0.35949498414993286
Epoch: 210, Loss: 0.35819873213768005
Epoch: 220, Loss: 0.3595665693283081
Epoch: 230, Loss: 0.35817408561706543
Epoch: 240, Loss: 0.3576695919036865
Epoch: 250, Loss: 0.35983380675315857
Epoch: 260, Loss: 0.35836243629455566
Epoch: 270, Loss: 0.3577459454536438
Ep

In [14]:
pd.DataFrame(document_embeddings)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.405627,-0.742578,-0.557294,0.452803,-0.403072,0.444927,-0.460850,0.656498,0.883746,-0.286310,...,-0.559714,-0.741436,-0.546874,0.365763,-0.668594,0.632117,-0.454748,0.553281,0.182276,0.523110
1,-1.551400,1.180257,1.398181,-1.498776,1.541236,-1.498615,1.490072,-1.300453,-1.044214,1.659183,...,1.401595,1.197007,1.374483,-1.605782,1.254378,-1.310816,1.499511,-1.371670,-1.765511,-1.434016
2,-1.556977,1.185761,1.401792,-1.504025,1.546108,-1.503773,1.498381,-1.307976,-1.052049,1.670102,...,1.411865,1.201715,1.385018,-1.613614,1.262675,-1.316286,1.505426,-1.378521,-1.773341,-1.440802
3,-0.926496,0.550716,0.763938,-0.873119,0.914447,-0.873556,0.872850,-0.680091,-0.426285,1.036576,...,0.792013,0.571362,0.765484,-0.985919,0.638078,-0.690404,0.874788,-0.748800,-1.137492,-0.811212
4,-0.467464,0.094065,0.301118,-0.407293,0.455923,-0.415230,0.413753,-0.211734,0.034085,0.581195,...,0.319913,0.111130,0.304105,-0.524146,0.180577,-0.225671,0.412049,-0.296182,-0.676247,-0.349554
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5091,160.744278,-158.828491,-158.155151,164.662231,-156.939972,163.852829,-160.626160,158.123398,166.157516,-154.670151,...,-160.017731,-160.766174,-160.700165,163.499863,-159.851578,163.155014,-159.389664,165.553452,158.373245,163.565216
5092,-1.587602,1.216832,1.429350,-1.533701,1.586113,-1.535380,1.538576,-1.338516,-1.083363,1.693159,...,1.445041,1.229995,1.420104,-1.648054,1.294230,-1.350342,1.537922,-1.415547,-1.804613,-1.467561
5093,-1.137762,0.769868,0.975001,-1.078286,1.135165,-1.090626,1.081932,-0.888241,-0.632687,1.249537,...,0.990389,0.783504,0.960751,-1.192420,0.853093,-0.900235,1.083566,-0.962804,-1.348858,-1.019791
5094,63.806870,-63.039883,-63.083248,65.041130,-63.840302,65.188713,-63.135902,64.691444,64.989906,-60.648033,...,-64.074181,-65.794075,-62.474041,64.010307,-63.755432,66.183952,-63.627411,64.466545,63.253677,65.532646


In [15]:
#torch.save(model.state_dict(), 'model_bi_trigram_with_stopword.pth')

# Clustering

**Using the ComplexTopicGNN class to learn the document embeddings and then applying a clustering algorithm to get the topics has several advantages:**

Exploitation of graph structure: By converting the document-term matrix into a graph and using a graph neural network, you can take advantage of the graph structure present in the data. This can help reveal hidden relationships between documents that are not captured by traditional topic modeling methods like LDA or NMF.

Combining GCN and GAT: The ComplexTopicGNN class combines Graph Convolutional Networks (GCN) and Graph Attention Networks (GAT). GCN excels at capturing local graph structure, while GAT can capture more global structure through attention mechanisms. By combining both methods, the model can capture various aspects of the graph structure and learn more informative document embeddings.

Flexibility in clustering algorithms: After learning the document embeddings, you can choose from various clustering algorithms (e.g., KMeans, DBSCAN, hierarchical clustering) to suit the specific characteristics of your dataset. This flexibility allows you to experiment with different clustering methods and select the one that best captures the topics in your data.

Interpretability: By using a clustering algorithm to group the document embeddings, you can identify the top terms associated with each cluster (topic). These top terms provide an interpretable summary of each topic and facilitate understanding of the themes present in the dataset.

Adaptability: The ComplexTopicGNN model can be easily adapted to incorporate additional information or features in the document graph (e.g., metadata, document similarity measures, or node attributes). This adaptability allows you to tailor the model to the specific needs of your dataset and problem domain.

Scalability: Graph neural networks can be efficiently parallelized on GPUs, enabling scalability to large datasets. This makes the ComplexTopicGNN model suitable for processing large-scale document collections.

**Graph neural networks (GNNs) and Latent Dirichlet Allocation (LDA) are different approaches to extracting information from document collections. GNNs can capture relationships that LDA might miss due to their unique way of processing and representing data.**

Document relationships: GNNs operate on graph structures, making them well-suited for capturing relationships between documents. These relationships can be based on various factors, such as document similarity. LDA, on the other hand, does not explicitly model relationships between documents, focusing only on the distribution of topics in each document.

Heterogeneous information: GNNs can easily incorporate additional information in the form of node and edge attributes, which allows for richer representations and the ability to capture relationships based on diverse types of information. In contrast, LDA only considers the words in a document and their associated probabilities.

Higher-order dependencies: GNNs can capture higher-order dependencies between documents, as they iteratively aggregate information from neighboring nodes in the graph. This ability allows GNNs to capture global patterns in the data. LDA is a generative model that assumes independence between documents, limiting its ability to capture higher-order dependencies.

Non-linear relationships: GNNs use non-linear activation functions, which enable them to capture complex, non-linear relationships between documents. LDA is a linear generative model, which might miss certain non-linear relationships in the data.

End-to-end learning: GNNs learn document embeddings in an end-to-end fashion, optimizing the embeddings based on the overall objective function (e.g., graph contrastive loss). This approach can lead to more meaningful embeddings that capture the desired relationships. LDA, on the other hand, estimates topic distributions using a generative process, which may not result in embeddings that are as tailored to the specific relationships of interest.

While GNNs can capture relationships that LDA might miss, it's essential to note that GNNs require a suitable graph representation of the document collection to work effectively. Constructing a meaningful graph representation of the documents is a critical step that influences the performance of the GNN model.

**Load GNN model**

In [16]:
class ComplexTopicGNN(torch.nn.Module):
    def __init__(self, num_features, hidden_channels):
        super(ComplexTopicGNN, self).__init__()
        self.gcn1 = GCNConv(num_features, hidden_channels)
        self.gcn2 = GCNConv(hidden_channels, hidden_channels)
        self.gat1 = GATConv(hidden_channels, hidden_channels, heads=2)
        self.gat2 = GATConv(hidden_channels * 2, hidden_channels, heads=1)
        self.lin1 = torch.nn.Linear(1, 32)
        self.lin2 = torch.nn.Linear(hidden_channels + 32, 32)

    def forward(self, data):
        x, edge_index = data.x[:, :-1], data.edge_index
        condition = data.x[:, -1].view(-1, 1)
        
        x = self.gcn1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)

        x = self.gcn2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)

        x = self.gat1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.5, training=self.training)

        x = self.gat2(x, edge_index)
        x = F.elu(x)
        
        condition = self.lin1(condition)
        condition = F.relu(condition)

        x = torch.cat((x, condition), dim=1)
        x = self.lin2(x)
        return x




# Convert the document graph to PyTorch Geometric format
pyg_data = from_networkx(document_graph)
pyg_data.x = torch.tensor(document_term_matrix_df.to_numpy(), dtype=torch.float)

# Initialize and train the GCN model
num_features = document_term_matrix_df.shape[1] - 1
hidden_channels = 32
model_loaded = ComplexTopicGNN(num_features, hidden_channels)
# load the model back into memory
model_loaded.load_state_dict(torch.load('model_bi_trigram_with_stopword.pth'))
# Set the model to evaluation mode
model_loaded.eval()
# Compute the document embeddings
out = model_loaded(pyg_data)
document_embeddings = out.detach().numpy()

In [41]:
document_embeddings

array([[ 1.3232720e-01, -4.8410141e-01, -3.0077836e-01, ...,
         2.9833424e-01, -7.0594363e-02,  2.5511205e-01],
       [-1.6437614e+00,  1.2717584e+00,  1.4814705e+00, ...,
        -1.4708718e+00, -1.8608699e+00, -1.5267580e+00],
       [-1.6212354e+00,  1.2492424e+00,  1.4586468e+00, ...,
        -1.4478384e+00, -1.8377434e+00, -1.5046667e+00],
       ...,
       [-1.1730410e+00,  8.0557883e-01,  1.0111153e+00, ...,
        -9.9792367e-01, -1.3840590e+00, -1.0571390e+00],
       [ 9.7656242e+01, -9.6075500e+01, -9.6183914e+01, ...,
         1.0018219e+02,  9.8030045e+01,  1.0021991e+02],
       [ 9.7656242e+01, -9.6075500e+01, -9.6183914e+01, ...,
         1.0018219e+02,  9.8030045e+01,  1.0021991e+02]], dtype=float32)

# 1. Clusted by KMeans

In [40]:
num_clusters = 7
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(document_embeddings)

num_top_terms = 10
top_terms = {}

for cluster in range(num_clusters):
    # Get the indices of the documents in the current cluster
    doc_indices = np.where(cluster_labels == cluster)[0]
    
    # Compute the centroid of the document-term vectors in the current cluster
    centroid = np.mean(document_term_matrix_df.iloc[doc_indices, :-1], axis=0)
    
    # Get the indices of the top terms for the current cluster
    top_term_indices = centroid.argsort()[-num_top_terms:][::-1]
    
    # Get the top terms for the current cluster
    top_terms[cluster] = [document_term_matrix_df.columns[idx] for idx in top_term_indices]

for cluster, terms in top_terms.items():
    print(f"Topic {cluster}: {', '.join(terms)}")



Topic 0: i think, miss smith, i have, elwood smith, the key, this one, to the, i do, is the, this is
Topic 1: crime analyst, it am, no hello, the unit, uhm so, i gone, so apartment, appeared once, sorry teresa, wordsmith and
Topic 2: he inaudbile, look there, eleven pm, when sorry, i bullet, and paper, there wa nothing, wa nothing, just keep together, keep together
Topic 3: scott scott, wait i, shui yuwei, bird watch, seeing examiner, smith wait, margarita please, another extension, who jone, jones outside
Topic 4: it went, asana please, without making, remove program, something made, another connection, kelley hmmm, or should, look fine, hippie definitely
Topic 5: spritz is, change size up, up your panel, up your, spritz is to, inform same, your panel, same size, this spritz is, this spritz
Topic 6: cut nigel, this card, make this card, you take margaret, you take, cheating business there, name silent, silent anything, greenberg green greenberg, with blood


# 2. Clusted by hierarchical clustering

In [54]:
# Perform hierarchical clustering on the document embeddings
linkage_matrix = linkage(document_embeddings, method='ward')

# Set the number of clusters (topics)
num_clusters = 7

# Assign the cluster labels to the documents
cluster_labels = fcluster(linkage_matrix, num_clusters, criterion='maxclust')

# Extract the top terms for each cluster (topic)
num_top_terms = 10
top_terms = {}

for cluster in range(1, num_clusters + 1):  # Cluster labels start from 1
    # Get the indices of the documents in the current cluster
    doc_indices = np.where(cluster_labels == cluster)[0]

    # Compute the centroid of the document-term vectors in the current cluster
    centroid = np.mean(document_term_matrix_df.iloc[doc_indices, :-1], axis=0)

    # Get the indices of the top terms for the current cluster
    top_term_indices = centroid.argsort()[-num_top_terms:][::-1]

    # Get the top terms for the current cluster
    top_terms[cluster] = [document_term_matrix_df.columns[idx] for idx in top_term_indices]

# Print the top terms for each cluster (topic)
for cluster, terms in top_terms.items():
    print(f"Topic {cluster}: {', '.join(terms)}")

Topic 1: connect sentence, may may, sound good, charge charge, think six, think drag, one question, got nothing, grab panel, get away
Topic 2: miss smith, elwood smith, scott apartment, front door, howard ellington, margaret ellington, kelley wife, body found, albert greenbags, key front
Topic 3: sorry hear, nice knife, one guy, wait look, keep going, lying lying, lying business, okay okay, explain rationale, thinking maybe
Topic 4: teresa green, albert greenback, asana please, bank highlighted, easy delete, feel dizzy, think someone, something happened, guy president, sure sure
Topic 5: wife wife, sorry president, lot thing, finger finger, note find, think thursday, closer ohio, duplicated one, role police, aldric disappeared
Topic 6: four five, making sure, focus four, focus four five, time scott, left apartment, sure wait, making sure wait, wait overwhelming, sure wait overwhelming
Topic 7: crime analyst, name already, give second, created one, kelley minute, sachini created one, sa

# 3. Baseline model by Latent Dirichlet Allocation (LDA)

In [65]:
start_time = time.time()
num_topics = 7
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_model.fit(document_term_matrix_df.iloc[:, :-1])

num_top_terms = 10
top_terms = {}

for topic_idx, topic in enumerate(lda_model.components_):
    top_term_indices = topic.argsort()[-num_top_terms:][::-1]
    top_terms[topic_idx] = [document_term_matrix_df.columns[idx] for idx in top_term_indices]

for topic, terms in top_terms.items():
    print(f"Topic {topic}: {', '.join(terms)}")
    
end_time = time.time()

total_time = end_time - start_time
print(f"Execution time: {total_time} seconds")

Topic 0: wednesday afternoon, murder take, construction company, take place, murder take place, knife wound, scott kelley, five pm, hung around, construction company wednesday
Topic 1: front door, key front, key front door, howard ellington, body found, jones apartment, key vault, construction company, anything else, opened key
Topic 2: miss smith, often followed, albert greenbags president, greenbags president, pm thursday, lobby apartment, albert greenbags, miss miss, let check, smith lobby
Topic 3: elwood smith, margaret ellington, kelley wife, one hour, discovered robbery, elwood smith janitor, smith janitor, mexico city, dead one, dead one hour
Topic 4: albert greenbags, elwood smith, found park, make sense, arrival time, dog walk, jones shot, body found park, shot intruder, body found
Topic 5: scott apartment, went scott, kelley went, went scott apartment, kelley went scott, kelley blood, smith yard, wife went, miss smith, found miss
Topic 6: thursday night, dirsey flower, left b

In [67]:
# Assign topic labels to each document based on highest topic probability
doc_topic_probs = lda_model.transform(document_term_matrix_df.iloc[:, :-1])
doc_topic_labels = doc_topic_probs.argmax(axis=1)

In [70]:
len(doc_topic_labels)

4521

# Coding dataset

In [62]:
data['Topic'] = kmeans.labels_
for topic_num in range(7):
    original_data_with_topics[f'Topic_{topic_num}'] = 0

for index, row in data.iterrows():
    topic = row['Topic']
    original_data_with_topics.at[index, f'Topic_{topic}'] = 1

original_data_with_topics.drop(['information', 'grounds', 'claim',
                                'organization', 'query', 'social',
                                'strategy'], axis=1).to_csv('coded_dataset_kmeans.csv', index=False)

In [64]:
data['Topic'] = cluster_labels
for topic_num in range(7):
    original_data_with_topics[f'Topic_{topic_num}'] = 0

for index, row in data.iterrows():
    topic = row['Topic']
    original_data_with_topics.at[index, f'Topic_{topic}'] = 1

original_data_with_topics.drop(['information', 'grounds', 'claim',
                                'organization', 'query', 'social',
                                'strategy'], axis=1).to_csv('coded_dataset_hierarchy.csv', index=False)

In [72]:
data['Topic'] = doc_topic_labels
for topic_num in range(7):
    original_data_with_topics[f'Topic_{topic_num}'] = 0

for index, row in data.iterrows():
    topic = row['Topic']
    original_data_with_topics.at[index, f'Topic_{topic}'] = 1

original_data_with_topics.drop(['information', 'grounds', 'claim',
                                'organization', 'query', 'social',
                                'strategy'], axis=1).to_csv('coded_dataset_hierarchy.csv', index=False)