In [1]:
!pip3 install neo4j sentence_transformers pandas



# Build our dataset from Neo4j

In [2]:
import os
import warnings
import textwrap

# Suppress every Python warning for the remainder of the kernel session.
warnings.filterwarnings(action="ignore")

# Neo4j connection settings.
# The password is looked up in the NEO4J_PASSWORD environment variable and is
# None when that variable is not set.
NEO4J_DATABASE = "neo4j"
NEO4J_URI = "bolt://neo4j.neo4j.svc.cluster.local"
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")

In [4]:
from neo4j import GraphDatabase
# Take the password from the environment (NEO4J_PASSWORD, read in the config cell)
# instead of hardcoding it in the notebook.
# NOTE(review): a literal password used to be committed on this line — rotate it.
if not NEO4J_PASSWORD:
    raise RuntimeError("NEO4J_PASSWORD environment variable is not set")
auth = ("neo4j", NEO4J_PASSWORD)

# Keep the driver open: later cells reuse `driver`, so it must not be closed by a
# `with` block in this cell. Call `driver.close()` once the notebook is finished
# talking to Neo4j.
driver = GraphDatabase.driver(NEO4J_URI, auth=auth)
try:
    driver.verify_connectivity()
except Exception:
    # Do not leak the connection pool when the server is unreachable or auth fails.
    driver.close()
    raise

In [5]:
import pandas as pd

# Function to execute the query and return results as a pandas DataFrame
def get_chat_logs_as_dataframe(driver, database="neo4j", csv_path="chat_logs.csv"):
    """Fetch chat messages from Neo4j and return them as a pandas DataFrame.

    Each row is one (user, message, channel) match, ordered by message
    timestamp, newest first.

    Args:
        driver: An open Neo4j driver; only ``driver.session(database=...)`` is used.
        database: Name of the Neo4j database to query.
        csv_path: Where to write a CSV copy of the result. Pass ``None`` to
            skip writing the file.

    Returns:
        A DataFrame with the columns ``user``, ``channel``, ``timestamp`` and
        ``message``. The columns are present even when the query returns no rows.
    """
    columns = ["user", "channel", "timestamp", "message"]

    # The previous version also did an OPTIONAL MATCH on (m)-[:MENTIONED]->(:User)
    # without returning anything from it; that only multiplied the rows of any
    # message mentioning more than one user, so it has been dropped.
    query = """
    MATCH (m:Message)-[:POSTED_IN]->(c:Channel), (u:User)-[:SENT]->(m)
    RETURN u.name AS user, c.name AS channel, m.timestamp AS timestamp, m.content AS message
    ORDER BY m.timestamp DESC
    """

    # The result must be fully consumed while the session is still open.
    with driver.session(database=database) as session:
        records = [record.data() for record in session.run(query)]

    # Passing `columns` keeps the schema stable when `records` is empty.
    chat_logs_df = pd.DataFrame(records, columns=columns)

    # Optionally keep a CSV copy on disk for easy reuse.
    if csv_path is not None:
        chat_logs_df.to_csv(csv_path, index=False)
        print(f"Chat logs saved to {csv_path}")

    return chat_logs_df

# `driver` is the Neo4j driver object created in an earlier cell.
# (Alternative: build it with a helper of your own, e.g. `driver = create_driver()`.)

# Load the chat logs into a pandas DataFrame; the function also writes chat_logs.csv
chat_logs_df = get_chat_logs_as_dataframe(driver)

Chat logs saved to chat_logs.csv


# Break the chat logs into conversational contexts

## Possible Approaches:

1. **Clustering**: Use clustering algorithms such as K-Means or DBSCAN to group the chat logs into conversational contexts,
using the `message` column as the feature to cluster on.
2. **Time-based**: Group chat logs based on a time window, like every 5 minutes.
3. **User-based**: Group chat logs based on the user who sent the message.
4. **Channel-based**: Group chat logs based on the channel where the message was posted.
5. **Sequential**: Group chat logs based on the order they were posted.
6. **Sequence Labeling**: Use sequence labeling models — such as Conditional Random Fields (CRFs), Hidden Markov Models (HMMs), or Long Short-Term Memory (LSTM) networks, as used for tasks like Named Entity Recognition (NER) — to label each message with a conversation ID and thereby predict the start and end of each conversation. To do this, we need to train the model on labeled data so that it learns the conversational patterns that distinguish different conversational threads. We can then use the trained model to label new messages automatically.
7. **Transformer Based Methods**: Use transformer-based models like BERT, GPT-2, or RoBERTa to generate embeddings for each message and cluster them based on the embeddings.

In [6]:
# Approach 1: Clustering

In [7]:
# Basic text cleaning and preprocessing: lowercase every message and strip
# everything that is neither a word character nor whitespace.
# You might want to expand this with more sophisticated cleaning.
chat_logs_df['message_clean'] = (
    chat_logs_df['message']
    .fillna('')  # a message without content becomes '' instead of NaN
    .str.lower()
    # Raw string: '\w' and '\s' are not valid escapes in a normal string literal.
    .str.replace(r'[^\w\s]', '', regex=True)
)

# Display the first few rows of the DataFrame
print(chat_logs_df.head())

       user       channel                         timestamp  \
0      leku      #!chases  2024-04-01T02:21:46.712646+00:00   
1  larsinio      #!chases  2024-04-01T02:16:12.510311+00:00   
2      leku  #singularity  2024-04-01T02:01:10.481358+00:00   
3     bysin  #singularity  2024-04-01T02:01:01.059385+00:00   
4      leku  #singularity  2024-04-01T02:00:56.648500+00:00   

                     message              message_clean  
0                        re?                         re  
1                        sup                        sup  
2  i dont understand pokemon  i dont understand pokemon  
3                   last one                   last one  
4                  which one                  which one  


## Generate embeddings

In [8]:
!pip install sentence_transformers



In [9]:
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence encoder.
model = SentenceTransformer("all-MiniLM-L6-v2")

# A few sample sentences to try the encoder on
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
]

# model.encode() returns one embedding per input sentence
embeddings = model.encode(sentences)

# Show each sentence next to its embedding, separated by a blank line
for position, sentence in enumerate(sentences):
    embedding = embeddings[position]
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.37173254e-02 -4.28515784e-02 -1.56286024e-02  1.40537536e-02
  3.95538062e-02  1.21796325e-01  2.94333436e-02 -3.17524485e-02
  3.54959629e-02 -7.93140307e-02  1.75878238e-02 -4.04369608e-02
  4.97259349e-02  2.54913010e-02 -7.18699843e-02  8.14968795e-02
  1.47071958e-03  4.79627587e-02 -4.50336002e-02 -9.92174968e-02
 -2.81769242e-02  6.45045564e-02  4.44670394e-02 -4.76217270e-02
 -3.52952555e-02  4.38671224e-02 -5.28565869e-02  4.33025125e-04
  1.01921491e-01  1.64072793e-02  3.26996781e-02 -3.45986858e-02
  1.21339187e-02  7.94870779e-02  4.58346261e-03  1.57778636e-02
 -9.68207140e-03  2.87626237e-02 -5.05806766e-02 -1.55794332e-02
 -2.87907198e-02 -9.62278899e-03  3.15556452e-02  2.27349605e-02
  8.71449709e-02 -3.85027714e-02 -8.84718373e-02 -8.75491835e-03
 -2.12343764e-02  2.08924264e-02 -9.02078152e-02 -5.25732227e-02
 -1.05638616e-02  2.88310945e-02 -1.61454994e-02  6.17835484e-03
 -1.23234

In [10]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate one embedding per cleaned message
message_embeddings = model.encode(chat_logs_df['message_clean'].tolist(), show_progress_bar=True)
print("Embeddings generated successfully!")

# Verify the shape of the embeddings: there must be exactly one row per chat message,
# otherwise the cluster labels computed from them cannot be aligned with the DataFrame.
assert message_embeddings.shape[0] == len(chat_logs_df), "expected one embedding per message"
print("Embedding matrix shape:", message_embeddings.shape)

# Preview the values (numpy abbreviates large arrays when printing)
print(message_embeddings)


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Embeddings generated successfully!
[[-0.01951109  0.03457575 -0.01666949 ...  0.05298835  0.08593491
   0.01744498]
 [-0.13658412  0.01173701 -0.00966252 ...  0.02726141 -0.01584398
   0.02466037]
 [-0.02697346  0.02436424 -0.00103802 ...  0.08083023  0.03924307
  -0.03280762]
 ...
 [-0.03119153  0.01449908 -0.04481903 ... -0.01519591  0.09675826
   0.0919901 ]
 [-0.08145887  0.01799925  0.06027577 ... -0.04238012 -0.00299865
   0.02132331]
 [-0.04105745 -0.06912649 -0.04070404 ... -0.00821245  0.0505823
   0.04224037]]


## Cluster the embeddings

In [11]:
!pip3 install hdbscan

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting hdbscan
  Using cached hdbscan-0.8.33.tar.gz (5.2 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting cython<3,>=0.27 (from hdbscan)
  Using cached Cython-0.29.37-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (3.1 kB)
Using cached Cython-0.29.37-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.9 MB)
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (pyproject.toml) ... [?25ldone
[?25h  Created wheel for hdbscan: filename=hdbscan-0.8.33-cp311-cp311-linux_x86_64.whl size=732941 sha256=671144d4493e0efd4b967f4d1fdf99aacf2136204547823a2e0117a5f497be9b
  Stored in directory: /home/jovyan/.cache/pip/wheels/4e/8c/6f/d0495e4e40cbd27a8c7330d4e963837e099d6e16014dbdcdb5
Successfully built hdbscan
Installing collected packages: cython, h

In [12]:
import hdbscan

# Scale every embedding row to unit length (L2 norm) before clustering
message_embeddings_normalized = normalize(message_embeddings, norm="l2", axis=1)

# Fit HDBSCAN on the normalized embeddings, then read the per-message labels
# off the fitted estimator (HDBSCAN marks noise points with the label -1)
clusterer = hdbscan.HDBSCAN(gen_min_span_tree=True, min_cluster_size=5)
clusterer.fit(message_embeddings_normalized)
cluster_labels = clusterer.labels_

# Store each message's cluster label alongside it in the DataFrame
chat_logs_df['cluster'] = cluster_labels


## Analyze the clusters

In [13]:
# How many messages ended up in each cluster
cluster_sizes = chat_logs_df['cluster'].value_counts()
print(cluster_sizes)

# Look at the members of one specific cluster (label 0)
in_cluster_zero = chat_logs_df['cluster'] == 0
print(chat_logs_df[in_cluster_zero])


cluster
 1    191
-1    153
 0      6
 2      6
Name: count, dtype: int64
         user   channel                         timestamp      message  \
35       leku  #!chases  2024-04-01T01:34:30.653665+00:00  `coins dusk   
52       leku  #!chases  2024-04-01T00:36:35.372251+00:00  `coins dusk   
172      leku  #!chases  2024-03-31T18:34:07.417986+00:00  `coins dusk   
186  larsinio  #!chases  2024-03-31T18:02:21.390198+00:00  `coins doge   
187  larsinio  #!chases  2024-03-31T18:02:17.574258+00:00   `coins dot   
197      leku  #!chases  2024-03-31T17:39:24.729678+00:00  `coins dusk   

    message_clean  cluster  
35     coins dusk        0  
52     coins dusk        0  
172    coins dusk        0  
186    coins doge        0  
187     coins dot        0  
197    coins dusk        0  
