In [16]:
!pip3 install neo4j sentence_transformers pandas nltk hdbscan spacy

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




# Build our dataset from Neo4j

In [17]:
import os
import warnings
import textwrap

warnings.filterwarnings("ignore")

NEO4J_URI = "bolt://neo4j.neo4j.svc.cluster.local"
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_DATABASE = 'neo4j'

In [18]:
from neo4j import GraphDatabase
auth = ("neo4j", "keZSjc1CaHTakP")
with GraphDatabase.driver(NEO4J_URI, auth=auth) as driver:
    driver.verify_connectivity()

In [19]:
import pandas as pd

# Function to execute the query and return results as a pandas DataFrame
def get_chat_logs_as_dataframe(driver):
    query = """
    MATCH (m:Message)-[:POSTED_IN]->(c:Channel), (u:User)-[:SENT]->(m)
    OPTIONAL MATCH (m)-[:MENTIONED]->(mentioned:User)
    RETURN u.name AS user, c.name AS channel, m.timestamp AS timestamp, m.content AS message
    ORDER BY m.timestamp DESC
    """
    
    # Execute the query
    with driver.session(database="neo4j") as session:
        results = session.run(query)
        
        # Convert results to a DataFrame
        chat_logs_df = pd.DataFrame([record.data() for record in results])
        
        # Optionally, you can save the DataFrame to a CSV file for easy use
        chat_logs_df.to_csv("chat_logs.csv", index=False)
        
        print("Chat logs saved to chat_logs.csv")
        
        return chat_logs_df

# Call the function to get chat logs as a pandas DataFrame
chat_logs_df = get_chat_logs_as_dataframe(driver)

Chat logs saved to chat_logs.csv


# Break the chat logs into conversational contexts

## Possible Approaches:

1. **Clustering**: Use Clustering algorithms to group the chat logs into conversational contexts,
like K-Means or DBSCAN. We'll use the `message` column as the feature to cluster on.
2. **Time-based**: Group chat logs based on a time window, like every 5 minutes.
3. **User-based**: Group chat logs based on the user who sent the message.
4. **Channel-based**: Group chat logs based on the channel where the message was posted.
5. **Sequential**: Group chat logs based on the order they were posted.
6. **Sequence Labeling**: Use Sequence Labeling models to predict the start and end of each conversation, like Named Entity Recognition (NER) models, Conditional Random Fields (CRFs), Hidden Markov Model (HMM) or Long Short-Term Memory (LSTM) networks to label each message with a conversation ID. To do this, we need to train our model on labeled data to learn the conversational patterns that distinguish between different conversational threads. We can use this model to label new messages automatically.
7. **Transformer Based Methods**: Use transformer-based models like BERT, GPT-2, or RoBERTa to generate embeddings for each message and cluster them based on the embeddings.

# Data Preprocessing and Cleaning

In [20]:
# Basic text cleaning and preprocessing
# You might want to expand this with more sophisticated cleaning
chat_logs_df['message_clean'] = chat_logs_df['message'].str.lower().str.replace('[^\w\s]', '', regex=True)

## Remove stop words

In [21]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
from nltk.corpus import stopwords

stop = stopwords.words('english')
chat_logs_df['message_clean'] = chat_logs_df['message_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

## Lemmatization

In [23]:
import spacy.cli
spacy.cli.download("en_core_web_sm")
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatize_text(text):
    return ' '.join([token.lemma_ for token in nlp(text)])

chat_logs_df['message_clean'] = chat_logs_df['message_clean'].apply(lemmatize_text)


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## Removing frequent but unimportant words

In [24]:
# build frequent_words
from collections import Counter

# Tokenize the cleaned messages into lists of words
chat_logs_df['tokens'] = chat_logs_df['message_clean'].str.split()

# Flatten the list of token lists into a single list
all_words = [word for tokens in chat_logs_df['tokens'] for word in tokens]

# Count the words
word_counts = Counter(all_words)

# Set a frequency threshold
frequency_threshold = 100  # This is just an example value

# Filter words that meet or exceed the threshold
frequent_words = {word for word, count in word_counts.items() if count >= frequency_threshold}

chat_logs_df['message_clean'] = chat_logs_df['message_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in frequent_words]))

## Removing rare words

In [25]:
# Set a frequency threshold for rare words
#rare_threshold = 2  # Example: words appearing 2 times or less

# Filter words that are equal to or below the threshold
#rare_words = {word for word, count in word_counts.items() if count <= rare_threshold}

#chat_logs_df['message_clean'] = chat_logs_df['message_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in rare_words]))


## Generate embeddings

In [26]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings
message_embeddings = model.encode(chat_logs_df['message_clean'].tolist(), show_progress_bar=True)
print("Embeddings generated successfully!")

# verify the shape of the embeddings
print(message_embeddings)


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Embeddings generated successfully!
[[-0.11883839  0.0482987  -0.00254814 ...  0.12640943  0.04654907
  -0.01571724]
 [-0.13658412  0.01173701 -0.00966252 ...  0.02726141 -0.01584398
   0.02466037]
 [-0.0427901   0.04457811  0.01399177 ...  0.06319794  0.03106361
  -0.03714633]
 ...
 [-0.03119157  0.01449915 -0.04481908 ... -0.01519592  0.0967582
   0.09199005]
 [-0.10109371  0.046329    0.04294018 ... -0.03402774 -0.0196047
   0.02392813]
 [-0.07301831 -0.05540538 -0.07172503 ...  0.00799474  0.06458855
   0.03423898]]


## Cluster the embeddings

In [27]:
import hdbscan

# Normalize embeddings to improve clustering
message_embeddings_normalized = normalize(message_embeddings)

# Clustering with HDBSCAN
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)
cluster_labels = clusterer.fit_predict(message_embeddings_normalized)

# Add cluster labels to your DataFrame
chat_logs_df['cluster'] = cluster_labels


## Analyze the clusters

In [28]:
# Explore the number of messages per cluster
print(chat_logs_df['cluster'].value_counts())

# Inspect a specific cluster
#print(chat_logs_df[chat_logs_df['cluster'] == 0])

# Group the DataFrame by the 'cluster' column
grouped_df = chat_logs_df.groupby('cluster')

# Iterate through each group
for cluster_label, group in grouped_df:
    print(f"Cluster: {cluster_label}")
    print(group)  # 'group' is a DataFrame containing only the rows from this cluster
    # You can perform further analysis or processing on each group here


cluster
-1    223
 2     49
 5     45
 4     17
 0      8
 1      8
 3      6
Name: count, dtype: int64
Cluster: -1
      user       channel                         timestamp  \
2     leku  #singularity  2024-04-01T02:01:10.481358+00:00   
3    bysin  #singularity  2024-04-01T02:01:01.059385+00:00   
6    bysin  #singularity  2024-04-01T02:00:51.586430+00:00   
9      FE2      #!chases  2024-04-01T01:36:40.417294+00:00   
10     FE2      #!chases  2024-04-01T01:36:35.046911+00:00   
..     ...           ...                               ...   
349    dio      #!chases  2024-03-31T15:49:55.416471+00:00   
350  viral      #!chases  2024-03-31T15:49:33.444184+00:00   
351   leku      #!chases  2024-03-31T15:49:27.437310+00:00   
354   leku      #!chases  2024-03-31T15:49:20.825651+00:00   
355   leku      #𝓉𝓌𝑒𝓇𝓀𝒾𝓃  2024-03-31T15:47:43.883680+00:00   

                              message                  message_clean  \
2           i dont understand pokemon      do not understand pokemo