# Build our dataset from Neo4j

In [None]:
import os
import warnings
import textwrap

# NOTE: this silences every warning category for the rest of the session,
# including deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

# Connection settings. Each one can be overridden through an environment
# variable of the same name; the URI and database fall back to the
# cluster-local service address and the "neo4j" database.
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://neo4j.neo4j.svc.cluster.local")
# No fallback on purpose: the password must be supplied by the environment.
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE", "neo4j")

In [None]:
from neo4j import GraphDatabase
# Authenticate with the password read from the NEO4J_PASSWORD environment
# variable so that no credential is stored in the notebook itself.
if not NEO4J_PASSWORD:
    raise RuntimeError("NEO4J_PASSWORD is not set; export it before running this notebook")
auth = ("neo4j", NEO4J_PASSWORD)

# Keep the driver open: a later cell runs its query through `driver`.
# Creating it inside a `with` block would close it as soon as the
# connectivity check finished. Call driver.close() once the data is fetched.
driver = GraphDatabase.driver(NEO4J_URI, auth=auth)
try:
    driver.verify_connectivity()
except Exception:
    # Do not leave a half-usable driver behind when the server is unreachable.
    driver.close()
    raise

In [None]:
import pandas as pd

# Function to execute the query and return results as a pandas DataFrame
def get_chat_logs_as_dataframe(driver, database="neo4j", csv_path="chat_logs.csv"):
    """Fetch the chat messages stored in Neo4j and return them as a DataFrame.

    Args:
        driver: An open Neo4j driver; only ``driver.session(database=...)``
            is called on it.
        database: Name of the database to query.
        csv_path: Where to write a CSV copy of the result. Pass ``None`` to
            skip writing the file.

    Returns:
        A DataFrame with the columns ``user``, ``channel``, ``timestamp`` and
        ``message``, ordered by timestamp descending. The columns are present
        even when the query returns no rows.
    """
    # The query deliberately does not match (m)-[:MENTIONED]->(:User): nothing
    # from that pattern is returned, and matching it would repeat a message
    # once for every user it mentions.
    query = """
    MATCH (m:Message)-[:POSTED_IN]->(c:Channel), (u:User)-[:SENT]->(m)
    RETURN u.name AS user, c.name AS channel, m.timestamp AS timestamp, m.content AS message
    ORDER BY m.timestamp DESC
    """
    columns = ["user", "channel", "timestamp", "message"]

    # Consume the result while the session is still open.
    with driver.session(database=database) as session:
        rows = [record.data() for record in session.run(query)]

    # Naming the columns explicitly keeps the schema stable for empty results.
    chat_logs_df = pd.DataFrame(rows, columns=columns)

    # Optionally keep a CSV copy for easy reuse outside the notebook.
    if csv_path is not None:
        chat_logs_df.to_csv(csv_path, index=False)
        print(f"Chat logs saved to {csv_path}")

    return chat_logs_df

# Fetch the chat logs through the `driver` created in the connection cell.
# As a side effect the call also writes chat_logs.csv to the working directory.
chat_logs_df = get_chat_logs_as_dataframe(driver)

# Break the chat logs into conversational contexts

## Possible Approaches:

1. **Clustering**: Use clustering algorithms, such as K-Means or DBSCAN, to group the chat logs into conversational contexts.
We'll use the `message` column as the feature to cluster on.
2. **Time-based**: Group chat logs based on a time window, like every 5 minutes.
3. **User-based**: Group chat logs based on the user who sent the message.
4. **Channel-based**: Group chat logs based on the channel where the message was posted.
5. **Sequential**: Group chat logs based on the order they were posted.
6. **Sequence Labeling**: Use sequence labeling models to predict the start and end of each conversation by labeling each message with a conversation ID. Suitable models include those used for tasks such as Named Entity Recognition (NER): Conditional Random Fields (CRFs), Hidden Markov Models (HMMs), or Long Short-Term Memory (LSTM) networks. To do this, we need to train the model on labeled data so that it learns the conversational patterns that distinguish one conversational thread from another. We can then use the trained model to label new messages automatically.
7. **Transformer-Based Methods**: Use transformer-based models such as BERT, GPT-2, or RoBERTa to generate an embedding for each message, then cluster the messages based on those embeddings.

# Data Preprocessing and Cleaning

In [None]:
# Basic text cleaning: lower-case every message and strip punctuation.
# You might want to expand this with more sophisticated cleaning.
# - fillna(''): a message without content would otherwise stay NaN and break
#   the later steps, which call str.split() on every row.
# - raw string: '\w' and '\s' are regex escapes, not valid Python string escapes.
chat_logs_df['message_clean'] = (
    chat_logs_df['message']
    .fillna('')
    .astype(str)
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

## Remove stop words

In [None]:
import nltk
# Fetch the NLTK stop-word corpus only when it cannot be found locally, so
# re-running the cell does not need to contact the download server again.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

stop = stopwords.words('english')
# Testing membership against a list scans it for every single word; a
# frozenset makes each lookup constant-time. `stop` itself stays a list.
stop_set = frozenset(stop)
chat_logs_df['message_clean'] = chat_logs_df['message_clean'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_set)
)

## Lemmatization

In [None]:
import spacy.cli
# Download the small English model only when it is not installed yet.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")
# The parser and NER components are switched off; only lemmas are read below.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatize_text(text):
    """Return `text` with every token replaced by its lemma, joined by spaces."""
    return ' '.join(token.lemma_ for token in nlp(text))

# nlp.pipe processes the messages in batches instead of running the pipeline
# once per row; each message is lemmatized exactly as lemmatize_text would,
# and the results are written back in row order.
chat_logs_df['message_clean'] = [
    ' '.join(token.lemma_ for token in doc)
    for doc in nlp.pipe(chat_logs_df['message_clean'])
]


## Removing frequent but unimportant words

In [None]:
# build frequent_words
from collections import Counter

# Whitespace-tokenize every cleaned message.
chat_logs_df['tokens'] = chat_logs_df['message_clean'].str.split()

# One flat list holding every token of every message, in row order.
all_words = [token for row_tokens in chat_logs_df['tokens'] for token in row_tokens]

# Corpus-wide number of occurrences of each token.
word_counts = Counter(all_words)

# Minimum number of occurrences for a word to be treated as "frequent"
# (example value only).
frequency_threshold = 100

# Every word seen at least `frequency_threshold` times.
frequent_words = {
    word for word in word_counts if word_counts[word] >= frequency_threshold
}

# Rebuild each message without the frequent words.
chat_logs_df['message_clean'] = chat_logs_df['message_clean'].map(
    lambda text: ' '.join(w for w in text.split() if w not in frequent_words)
)

## Removing rare words

In [None]:
# Set a frequency threshold for rare words
#rare_threshold = 2  # Example: words appearing 2 times or less

# Filter words that are equal to or below the threshold
#rare_words = {word for word, count in word_counts.items() if count <= rare_threshold}

#chat_logs_df['message_clean'] = chat_logs_df['message_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in rare_words]))


## Generate embeddings

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate one embedding per cleaned message.
message_embeddings = model.encode(chat_logs_df['message_clean'].tolist(), show_progress_bar=True)
print("Embeddings generated successfully!")

# Verify the shape of the embeddings. Printing the array itself would dump
# the raw values instead of its dimensions.
print(message_embeddings.shape)


## Cluster the embeddings

In [None]:
import hdbscan

# Rescale the embeddings with sklearn's `normalize` before clustering.
message_embeddings_normalized = normalize(message_embeddings)

# Fit HDBSCAN, then read the per-message labels off the fitted estimator.
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)
clusterer.fit(message_embeddings_normalized)
cluster_labels = clusterer.labels_

# Store each message's cluster label next to the message.
chat_logs_df['cluster'] = cluster_labels


## Analyze the clusters

In [None]:
# How many messages ended up under each cluster label.
print(chat_logs_df['cluster'].value_counts())

# To look at a single cluster in isolation:
#print(chat_logs_df[chat_logs_df['cluster'] == 0])

# Split the DataFrame into one sub-frame per cluster label.
grouped_df = chat_logs_df.groupby('cluster')

# Dump every cluster: its label, followed by all of its rows.
for cluster_label, group in grouped_df:
    # `group` holds only the rows of this cluster; further per-cluster
    # analysis or processing can be added here.
    print(f"Cluster: {cluster_label}", group, sep="\n")


## Sample messages from each cluster

In [None]:
for cluster in sorted(chat_logs_df['cluster'].unique()):
    print(f"Cluster {cluster}:")
    cluster_messages = chat_logs_df.loc[chat_logs_df['cluster'] == cluster, 'message']
    # sample() raises ValueError when asked for more rows than exist, so cap
    # the sample size for groups that hold fewer than five messages.
    sample_texts = cluster_messages.sample(n=min(5, len(cluster_messages)))
    for text in sample_texts:
        print(f"- {text}")
    print("\n")


## Keyword and Phrase analysis

In [None]:
from collections import Counter

# The per-cluster counter gets its own name so that the corpus-wide
# `word_counts` built in the frequent-words cell is not overwritten.
# groupby visits the cluster labels in ascending order.
for cluster, cluster_messages in chat_logs_df.groupby('cluster')['message_clean']:
    cluster_word_counts = Counter(
        word for message in cluster_messages for word in message.split()
    )
    print(f"Cluster {cluster} common words: {cluster_word_counts.most_common(10)}")


In [None]:
## Cluster size distribution

In [None]:
import matplotlib.pyplot as plt

# Count the messages under each cluster label and draw the counts as bars,
# labelling the chart through the Axes object that pandas returns.
cluster_sizes = chat_logs_df.groupby('cluster').size()
ax = cluster_sizes.plot(kind='bar')
ax.set_xlabel('Cluster')
ax.set_ylabel('Size')
ax.set_title('Cluster Size Distribution')
plt.show()


## Word Clouds

In [None]:
from wordcloud import WordCloud

for cluster in sorted(chat_logs_df['cluster'].unique()):
    # All cleaned messages of this cluster as one space-separated string.
    text = " ".join(chat_logs_df.loc[chat_logs_df['cluster'] == cluster, 'message_clean'])
    try:
        wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(text)
    except ValueError:
        # generate() raises ValueError when the text contains no word it can
        # draw, e.g. when every message was cleaned down to an empty string.
        print(f"Cluster {cluster}: no words to plot, skipping word cloud")
        continue
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Cluster {cluster}")
    plt.show()


## Topic Modeling

Apply topic modeling techniques like Latent Dirichlet Allocation (LDA) within each cluster to discover subtopics.

In [31]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): n_topics is not referenced anywhere in the visible notebook
# code; the LDA loop in this cell uses a hard-coded n_components=5 instead.
n_topics = 200  # Adjust based on your dataset
# Alternative configuration that also drops terms by document frequency:
#vectorizer = CountVectorizer(max_df=0.95, min_df=1, stop_words='english')
# Vectorizer shared by the per-cluster loop in this cell, which re-fits it
# for every cluster.
vectorizer = CountVectorizer(stop_words='english')

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted topic model.

    Args:
        model: Object exposing ``components_``, iterated one row per topic.
        feature_names: Indexable collection mapping a column index to its term.
        no_top_words: How many top terms to print for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Column indices ordered from the largest weight to the smallest.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[idx] for idx in ranked]
        print(f"Topic {topic_idx}:")
        print(" ".join(top_terms))

# Fit a separate LDA model for each cluster label and print its topics.
# `lda` and `dtm` are rebound on every modelled cluster, so after the loop
# they hold the values from the last cluster that was modelled.
for cluster in sorted(chat_logs_df['cluster'].unique()):
    texts = chat_logs_df[chat_logs_df['cluster'] == cluster]['message_clean']
    if texts.empty:
        continue  # Skip clusters with no texts
    try:
        dtm = vectorizer.fit_transform(texts)
    except ValueError:
        # CountVectorizer raises ValueError (empty vocabulary) when none of
        # the cluster's messages contains a usable term, e.g. when they were
        # all cleaned down to empty strings or stop words.
        print(f"\nCluster {cluster}: skipped (no vocabulary left after cleaning)")
        continue
    lda = LatentDirichletAllocation(n_components=5, random_state=0)  # Adjust n_components as needed
    lda.fit(dtm)

    print(f"\nCluster {cluster}:")
    tf_feature_names = vectorizer.get_feature_names_out()
    display_topics(lda, tf_feature_names, no_top_words=10)




Cluster -1:
Topic 0:
movie mayan sig labor lime need year look arnold human
Topic 1:
sig slave time risk historical easter gallon work reception procedure
Topic 2:
watch let calorie genre pho place thing feel usdt holland
Topic 3:
long town people egg think apocalypto wake tie hate baseball
Topic 4:
fuel uranium rod like cost sig let spend construct involve

Cluster 0:
Topic 0:
dusk fe2 place buy coin trans ruin try day believe
Topic 1:
dusk coin ruin trans try awareness day believe fe2 doge
Topic 2:
coin dot dusk fe2 doge day ruin trans try awareness
Topic 3:
coin dusk fe2 try trans awareness ruin believe day doge
Topic 4:
coin doge dusk fe2 dot day ruin trans try awareness

Cluster 1:
Topic 0:
sig 258 figure netherland usa comparable ye small holland region
Topic 1:
sig netherland region holland small ye term usa comparable size
Topic 2:
netherlands maryland size mainland comparison area islands land km² state
Topic 3:
country netherlands kingdom constituent sint caribbean territory

# Processing output from Latent Dirichlet Allocation (LDA)

In [32]:
# Example of getting the topic distribution for the first 5 documents.
# `lda` and `dtm` are the objects left behind by the topic-modeling loop.
# The range is capped at the number of rows in `dtm` so that an empty slice
# is never passed to transform() when there are fewer than 5 documents.
for i in range(min(5, dtm.shape[0])):
    topic_distribution = lda.transform(dtm[i:i+1])
    print(f"Document {i} topic distribution:", topic_distribution)


Document 0 topic distribution: [[0.01262468 0.94918605 0.01280977 0.01269663 0.01268287]]
Document 1 topic distribution: [[0.00671778 0.00675315 0.00673162 0.00671924 0.9730782 ]]
Document 2 topic distribution: [[0.01055997 0.01056068 0.01067255 0.9575465  0.0106603 ]]
Document 3 topic distribution: [[0.00742245 0.00744567 0.97018283 0.00741793 0.00753112]]
Document 4 topic distribution: [[0.00693061 0.00696375 0.97222228 0.00694487 0.0069385 ]]


In [35]:
import pyLDAvis.lda_model

# Switch pyLDAvis to notebook mode.
pyLDAvis.enable_notebook()
# `lda`, `dtm` and `vectorizer` are the objects left behind by the
# topic-modeling cell, so the panel describes only the model fitted last.
# Alternative call using mds='pcoa' instead of mds='tsne':
# panel = pyLDAvis.lda_model.prepare(lda, dtm, vectorizer, mds='pcoa')
panel = pyLDAvis.lda_model.prepare(lda, dtm, vectorizer, mds='tsne')
# Must stay the last expression of the cell so the notebook shows its value.
pyLDAvis.display(panel)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av