In [14]:
import os 
base_path = r"c:\Users\kau75421\AI startup\Datacollection\split_conversations"
file_name = "conversation_2031.txt"
file_path = os.path.join(base_path, file_name)

# Read the file
with open(file_path, 'r', encoding='utf-8') as f:
    conversation = f.read()



# Display the raw conversation
print("Raw conversation:")
print(conversation)
print("-" * 50)

Raw conversation:
Buyer: Hi, I'm interested in the apartment you listed.
Seller: Of course, do you have a budget in mind?
Buyer: Does it have parking?
Seller: Yes, it comes with one covered parking.
Buyer: What’s the price range?
Seller: It’s a 10-minute walk to the metro station.
Buyer: That sounds interesting, I’ll discuss with my family.
--------------------------------------------------


In [15]:
# Split conversation into messages
messages = conversation.strip().split('\n')

# Calculate engagement metrics
engagement_metrics = {
    'total_messages': len(messages),
    'buyer_messages': len([msg for msg in messages if msg.startswith('Buyer:')]),
    'seller_messages': len([msg for msg in messages if msg.startswith('Seller:')]),
    'response_rate': 0,
    'avg_message_length': sum(len(msg) for msg in messages) / len(messages),
    'questions_asked': len([msg for msg in messages if '?' in msg]),
}

# Calculate seller's response rate to buyer messages
buyer_messages_count = engagement_metrics['buyer_messages']
seller_responses = 0
previous_was_buyer = False

for message in messages:
    if message.startswith('Buyer:'):
        previous_was_buyer = True
    elif message.startswith('Seller:') and previous_was_buyer:
        seller_responses += 1
        previous_was_buyer = False
    else:
        previous_was_buyer = False

# Calculate response rate (seller responses / buyer messages that could have received responses)
if buyer_messages_count > 0:
    engagement_metrics['response_rate'] = seller_responses / buyer_messages_count

# Print analytics
print("Conversation Engagement Analytics:")
print("-" * 30)
print(f"Total Messages: {engagement_metrics['total_messages']}")
print(f"Buyer Messages: {engagement_metrics['buyer_messages']}")
print(f"Seller Messages: {engagement_metrics['seller_messages']}")
print(f"Seller Response Rate: {engagement_metrics['response_rate']:.2%}")
print(f"Direct Seller Responses: {seller_responses}")
print(f"Average Message Length: {engagement_metrics['avg_message_length']:.1f} characters")
print(f"Questions Asked: {engagement_metrics['questions_asked']}")

Conversation Engagement Analytics:
------------------------------
Total Messages: 7
Buyer Messages: 4
Seller Messages: 3
Seller Response Rate: 75.00%
Direct Seller Responses: 3
Average Message Length: 45.4 characters
Questions Asked: 3


In [16]:
# Split conversation into messages
messages = conversation.strip().split('\n')

# Separate buyer and seller messages
buyer_messages = [msg.replace('Buyer: ', '') for msg in messages if msg.startswith('Buyer:')]
seller_messages = [msg.replace('Seller: ', '') for msg in messages if msg.startswith('Seller:')]

# Calculate word lengths
message_word_metrics = {
    'buyer_avg_words': sum(len(msg.split()) for msg in buyer_messages) / len(buyer_messages) if buyer_messages else 0,
    'seller_avg_words': sum(len(msg.split()) for msg in seller_messages) / len(seller_messages) if seller_messages else 0,
    'overall_avg_words': sum(len(msg.split(': ')[1].split()) for msg in messages) / len(messages)
}

# Print analytics
print("Message Word Count Analytics:")
print("-" * 30)
print(f"Buyer Average Words per Message: {message_word_metrics['buyer_avg_words']:.1f} words")
print(f"Seller Average Words per Message: {message_word_metrics['seller_avg_words']:.1f} words")
print(f"Overall Average Words per Message: {message_word_metrics['overall_avg_words']:.1f} words")

# Detailed word counts
print("\nDetailed Word Counts:")
print("-" * 30)
print("\nBuyer Messages:")
for msg in buyer_messages:
    words = msg.split()
    print(f"Words: {len(words)} - Message: {msg}")

print("\nSeller Messages:")
for msg in seller_messages:
    words = msg.split()
    print(f"Words: {len(words)} - Message: {msg}")

Message Word Count Analytics:
------------------------------
Buyer Average Words per Message: 6.0 words
Seller Average Words per Message: 8.0 words
Overall Average Words per Message: 6.9 words

Detailed Word Counts:
------------------------------

Buyer Messages:
Words: 8 - Message: Hi, I'm interested in the apartment you listed.
Words: 4 - Message: Does it have parking?
Words: 4 - Message: What’s the price range?
Words: 8 - Message: That sounds interesting, I’ll discuss with my family.

Seller Messages:
Words: 9 - Message: Of course, do you have a budget in mind?
Words: 7 - Message: Yes, it comes with one covered parking.
Words: 8 - Message: It’s a 10-minute walk to the metro station.


In [17]:
# DATA preprocesing
# Lower casing 
# removing stopwords 
# lemmatization
# tokenization
# removing special characters and punctuation
# seprate buyer and seller messages into different lists
# save the preprocessed data into a csv file with columns 'role' and 'message'

In [18]:
print(conversation)

Buyer: Hi, I'm interested in the apartment you listed.
Seller: Of course, do you have a budget in mind?
Buyer: Does it have parking?
Seller: Yes, it comes with one covered parking.
Buyer: What’s the price range?
Seller: It’s a 10-minute walk to the metro station.
Buyer: That sounds interesting, I’ll discuss with my family.


In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re

# Stopwords
stop_words = set(ENGLISH_STOP_WORDS)

def separate_buyer_seller_from_string(conversation: str, deduplicate: bool = False):
    """
    Process conversation string into Buyer and Seller tokens (stopwords removed).
    
    Args:
        conversation: str with lines like 'Buyer: ... Seller: ...'
        deduplicate: if True, removes duplicate words
    
    Returns:
        buyer_tokens, seller_tokens
    """
    
    def clean_text(text: str):
        tokens = re.findall(r"\b\w+\b", text.lower())
        return [w for w in tokens if w not in stop_words and len(w) > 1]
    
    buyer_list, seller_list = [], []
    
    # Split conversation by role markers
    parts = re.split(r"(Buyer:|Seller:)", conversation)
    print(parts)
    
    # parts will look like ["", "Buyer:", "Hi I'm ...", "Seller:", "Of course...", ...]
    for i in range(1, len(parts), 2):
        role = parts[i].replace(":", "").strip().lower()
        text = parts[i+1].strip()
        cleaned = clean_text(text)
        
        if role == "buyer":
            buyer_list.extend(cleaned)
        elif role == "seller":
            seller_list.extend(cleaned)
    
    if deduplicate:
        buyer_list = list(set(buyer_list))
        seller_list = list(set(seller_list))
    
    return buyer_list, seller_list


# ---------------- Example Usage ----------------
buyer_tokens, seller_tokens = separate_buyer_seller_from_string(conversation, deduplicate=True)

print("Buyer tokens:", buyer_tokens)
print("Seller tokens:", seller_tokens)


['', 'Buyer:', " Hi, I'm interested in the apartment you listed.\n", 'Seller:', ' Of course, do you have a budget in mind?\n', 'Buyer:', ' Does it have parking?\n', 'Seller:', ' Yes, it comes with one covered parking.\n', 'Buyer:', ' What’s the price range?\n', 'Seller:', ' It’s a 10-minute walk to the metro station.\n', 'Buyer:', ' That sounds interesting, I’ll discuss with my family.']
Buyer tokens: ['apartment', 'parking', 'sounds', 'price', 'family', 'range', 'discuss', 'hi', 'listed', 'll', 'does', 'interesting', 'interested']
Seller tokens: ['parking', 'metro', 'walk', 'mind', 'course', 'comes', 'yes', 'station', '10', 'minute', 'covered', 'budget']
