# Import libraries

In [2]:
import os
import re
import string
from collections import defaultdict
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer



# READ all text file from current folder and print

In [3]:

folder_path = 'chat_log'  # Change this to your folder
text_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.txt')]

all_messages = []
for file_path in text_files:
    with open(file_path, 'r', encoding='utf-8') as file:
        chat_txt = file.read()

        pattern = r'(User|AI):\s*(.*?)(?=\n*User:|\n*AI:|$)'
        matches = re.findall(pattern, chat_txt, re.DOTALL)

        for speaker, message in matches:
            all_messages.append({
                'speaker': speaker.strip(),
                'message': message.strip()
            })

print("Extracted messages:")
for msg in all_messages:
    print(msg)      # Print few messages for preview


Extracted messages:
{'speaker': 'User', 'message': 'Hi, can you tell me about Python?'}
{'speaker': 'AI', 'message': 'Sure! Python is a popular programming language known for its readability.'}
{'speaker': 'User', 'message': 'What can I use it for?'}
{'speaker': 'AI', 'message': 'You can use Python for web development, data analysis, AI, and more.'}
{'speaker': 'User', 'message': 'Hello!'}
{'speaker': 'AI', 'message': 'Hi! How can I assist you today?'}
{'speaker': 'User', 'message': 'Can you explain what machine learning is?'}
{'speaker': 'AI', 'message': 'Certainly! Machine learning is a field of AI that allows systems to learn from data.'}


# Separate Speaker User and AI

In [4]:
speaker_messages = defaultdict(list)
for msg in all_messages:
    speaker_messages[msg['speaker']].append(msg['message'])

print("Separated messages by speaker:")
for speaker, msgs in speaker_messages.items():
    print(f"{speaker}: {len(msgs)} messages")


Separated messages by speaker:
User: 4 messages
AI: 4 messages


# Total Line of Message exchange

In [5]:
# Step 3: Count messages and total exchanges (number of individual messages)
total_messages = len(all_messages)
print(f"Total line of messege: {total_messages}")

Total line of messege: 8


# combine USER + AI message

In [6]:


user_text = " ".join(speaker_messages.get("User", []))
ai_text = " ".join(speaker_messages.get("AI", []))
combined_text = user_text + " " + ai_text
print("Combined user + AI text preview:\n", combined_text)  


Combined user + AI text preview:
 Hi, can you tell me about Python? What can I use it for? Hello! Can you explain what machine learning is? Sure! Python is a popular programming language known for its readability. You can use Python for web development, data analysis, AI, and more. Hi! How can I assist you today? Certainly! Machine learning is a field of AI that allows systems to learn from data.


# Remove punctuation and Tokenized

In [7]:

without_punc = str.maketrans('', '', string.punctuation)
clean_text = combined_text.translate(without_punc)


tokens = word_tokenize(clean_text.lower())

print("Tokenized text (after punctuation removal):")
print(tokens)


Tokenized text (after punctuation removal):
['hi', 'can', 'you', 'tell', 'me', 'about', 'python', 'what', 'can', 'i', 'use', 'it', 'for', 'hello', 'can', 'you', 'explain', 'what', 'machine', 'learning', 'is', 'sure', 'python', 'is', 'a', 'popular', 'programming', 'language', 'known', 'for', 'its', 'readability', 'you', 'can', 'use', 'python', 'for', 'web', 'development', 'data', 'analysis', 'ai', 'and', 'more', 'hi', 'how', 'can', 'i', 'assist', 'you', 'today', 'certainly', 'machine', 'learning', 'is', 'a', 'field', 'of', 'ai', 'that', 'allows', 'systems', 'to', 'learn', 'from', 'data']


# remove Stopword (used NLTK built in function)

In [8]:

stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words and word.isalpha()]

print("After stopword removal:")
print(filtered_tokens)


After stopword removal:
['hi', 'tell', 'python', 'use', 'hello', 'explain', 'machine', 'learning', 'sure', 'python', 'popular', 'programming', 'language', 'known', 'readability', 'use', 'python', 'web', 'development', 'data', 'analysis', 'ai', 'hi', 'assist', 'today', 'certainly', 'machine', 'learning', 'field', 'ai', 'allows', 'systems', 'learn', 'data']


# lemmatization for better result

In [9]:

def map_pos_to_wordnet(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN


tagged = pos_tag(filtered_tokens)

lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [
    lemmatizer.lemmatize(word, map_pos_to_wordnet(pos_tag))
    for word, pos_tag in tagged
]

print("After POS-aware lemmatization:")
print(lemmatized_tokens)


After POS-aware lemmatization:
['hi', 'tell', 'python', 'use', 'hello', 'explain', 'machine', 'learning', 'sure', 'python', 'popular', 'programming', 'language', 'know', 'readability', 'use', 'python', 'web', 'development', 'data', 'analysis', 'ai', 'hi', 'assist', 'today', 'certainly', 'machine', 'learn', 'field', 'ai', 'allow', 'system', 'learn', 'data']


# Applying TF-IDF

In [10]:

final_text = " ".join(lemmatized_tokens)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([final_text])
tfidf_scores = zip(vectorizer.get_feature_names_out(), X.toarray()[0])
sorted_keywords = sorted(tfidf_scores, key=lambda x: -x[1])

all_keywords = [word for word, score in sorted_keywords]

print("ALL keywords (TF-IDF):")
print(all_keywords)


ALL keywords (TF-IDF):
['python', 'ai', 'data', 'hi', 'learn', 'machine', 'use', 'allow', 'analysis', 'assist', 'certainly', 'development', 'explain', 'field', 'hello', 'know', 'language', 'learning', 'popular', 'programming', 'readability', 'sure', 'system', 'tell', 'today', 'web']


# Final Outcome

In [11]:

# Final Summary
print("Final Summary:")
print(f"Total number of exchanges: {len(all_keywords)}")
print(f"The user asked mainly about {all_keywords[0]} and {all_keywords[1]}")
print(f"Most common keywords: {', '.join(all_keywords[:5])}")


Final Summary:
Total number of exchanges: 26
The user asked mainly about python and ai
Most common keywords: python, ai, data, hi, learn
