In [1]:
#Data Preprocessing And Cleaning
import re
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

#Step 1: Environment Setup
!pip install pandas numpy openpyxl keybert transformers torch scikit-learn


# Loading the Merged Comment data
comments_df = pd.read_csv("G:/Other computers/My Laptop (1)/Pelajaran UiTM/Classes and Lectures (Semester 5)/FINAL YEAR PROJECT/Developments/Simplified Data/Merged Files/Merged Comments V3.csv")

# Loading the Video with Transcription data
videos_df = pd.read_excel("G:/Other computers/My Laptop (1)/Pelajaran UiTM/Classes and Lectures (Semester 5)/FINAL YEAR PROJECT/Developments/Simplified Data/Transcriptions/Video with Transcription.xlsx")

#Inspecting Data
comments_df.head()
videos_df.head()

# Step 2: Handling Missing Values Section
# Check missing values in comments
print(comments_df.isnull().sum())

# Remove missing values or fill with placeholder text
comments_df.dropna(subset=['comments'], inplace=True)

# Alternatively, to fill:
# comments_df['comments'].fillna("No comment provided", inplace=True)

# Check missing values in videos
print(videos_df.isnull().sum())

videos_df.dropna(subset=['transcription'], inplace=True)

# Step 3: Removing duplicates Section
# Remove duplicates from comments
comments_df.drop_duplicates(subset=['comments'], inplace=True)

# Remove duplicates from videos
videos_df.drop_duplicates(subset=['transcription'], inplace=True)

# Step4: Text Normalization & Cleaning
def clean_text(text):
    text = str(text).lower()  # convert text to lowercase
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # normalize whitespace
    return text

# Apply cleaning to comments and transcriptions
comments_df['clean_comments'] = comments_df['comments'].apply(clean_text)
videos_df['clean_transcriptions'] = videos_df['transcription'].apply(clean_text)

#Step 5: Tokenization
# Tokenize the cleaned comments and transcriptions
comments_df['tokens'] = comments_df['clean_comments'].apply(word_tokenize)
videos_df['tokens'] = videos_df['clean_transcriptions'].apply(word_tokenize)

# Step 6: Stopword Removal
stop_words = set(stopwords.words('english'))

# Define function for removing stopwords
def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

comments_df['tokens_no_stopwords'] = comments_df['tokens'].apply(remove_stopwords)
videos_df['tokens_no_stopwords'] = videos_df['tokens'].apply(remove_stopwords)

#Step 7: Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

comments_df['lemmatized'] = comments_df['tokens_no_stopwords'].apply(lemmatize_tokens)
videos_df['lemmatized'] = videos_df['tokens_no_stopwords'].apply(lemmatize_tokens)

#Step 8: Preparing for Keyword Extraction (KeyBERT)
comments_df['final_text'] = comments_df['lemmatized'].apply(lambda x: ' '.join(x))
videos_df['final_text'] = videos_df['lemmatized'].apply(lambda x: ' '.join(x))

#Step 9: Save Preprocessed Data
comments_df.to_csv('clean_comments.csv', index=False)
videos_df.to_csv('clean_transcriptions.csv', index=False)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...


Vid             135
Cid               0
RepliesToId    5982
Comments          0
uniqueId          4
videoWebUrl       0
dtype: int64


KeyError: ['comments']

In [31]:
#STEP 9
#Installing NLTK
#Pass this point is successful one

import nltk

# List of essential NLTK resources
resources = [
    ('punkt', 'tokenizers/punkt'),
    ('stopwords', 'corpora/stopwords'),
    ('wordnet', 'corpora/wordnet'),
    ('omw-1.4', 'corpora/omw-1.4')  # Optional, for extended lemmatization support
]

for resource_name, resource_path in resources:
    try:
        nltk.data.find(resource_path)
        print(f"✅ '{resource_name}' already exists.")
    except LookupError:
        print(f"⏳ Downloading '{resource_name}'...")
        nltk.download(resource_name)

# Verification
print("\n🔍 Verifying downloads...")
for resource_name, resource_path in resources:
    try:
        nltk.data.find(resource_path)
        print(f"✅ '{resource_name}' successfully downloaded and verified.")
    except LookupError:
        print(f"❌ '{resource_name}' download failed. Please try manually.")


✅ 'punkt' already exists.
✅ 'stopwords' already exists.
⏳ Downloading 'wordnet'...
⏳ Downloading 'omw-1.4'...

🔍 Verifying downloads...
✅ 'punkt' successfully downloaded and verified.
✅ 'stopwords' successfully downloaded and verified.
❌ 'wordnet' download failed. Please try manually.
❌ 'omw-1.4' download failed. Please try manually.


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [1]:
#STEPS 10
#KeyBERT Implementation 

import tkinter as tk
from tkinter import filedialog, ttk, messagebox, scrolledtext
import threading
import os
import re
import sys
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from keybert import KeyBERT
import time
import traceback

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet', download_dir='./nltk_data', quiet=False)
nltk.download('omw-1.4', download_dir='./nltk_data', quiet=False)

class KeywordExtractionApp:
    def __init__(self, root):
        self.root = root
        self.root.title("Keyword Extraction Tool")
        self.root.geometry("800x600")
        self.root.resizable(True, True)
        
        # Initialize variables
        self.comments_file_path = tk.StringVar()
        self.videos_file_path = tk.StringVar()
        self.output_dir = tk.StringVar()
        self.output_dir.set(os.getcwd())
        self.progress_var = tk.DoubleVar()
        self.status_var = tk.StringVar()
        self.status_var.set("Ready")
        self.processing_thread = None
        self.stop_requested = False
        
        # Configure the main frame
        main_frame = ttk.Frame(root, padding="10")
        main_frame.pack(fill=tk.BOTH, expand=True)
        
        # Create the input file selection section
        self.create_file_selection_section(main_frame)
        
        # Create the settings section
        self.create_settings_section(main_frame)
        
        # Create the output section
        self.create_output_section(main_frame)
        
        # Create the log section
        self.create_log_section(main_frame)
        
        # Create the control buttons section
        self.create_control_buttons(main_frame)
        
        # Configure progress bar
        self.progress_bar = ttk.Progressbar(main_frame, variable=self.progress_var, length=780)
        self.progress_bar.pack(fill=tk.X, padx=10, pady=5)
        
        # Configure status bar
        status_bar = ttk.Label(main_frame, textvariable=self.status_var, relief=tk.SUNKEN, anchor=tk.W)
        status_bar.pack(fill=tk.X, padx=10, pady=5)
        
    def create_file_selection_section(self, parent):
        file_frame = ttk.LabelFrame(parent, text="Input Files", padding="10")
        file_frame.pack(fill=tk.X, padx=10, pady=5)
        
        # Comments file selector
        ttk.Label(file_frame, text="Comments File:").grid(row=0, column=0, sticky=tk.W, padx=5, pady=5)
        ttk.Entry(file_frame, textvariable=self.comments_file_path, width=60).grid(row=0, column=1, padx=5, pady=5)
        ttk.Button(file_frame, text="Browse...", command=self.browse_comments_file).grid(row=0, column=2, padx=5, pady=5)
        
        # Videos file selector
        ttk.Label(file_frame, text="Videos File:").grid(row=1, column=0, sticky=tk.W, padx=5, pady=5)
        ttk.Entry(file_frame, textvariable=self.videos_file_path, width=60).grid(row=1, column=1, padx=5, pady=5)
        ttk.Button(file_frame, text="Browse...", command=self.browse_videos_file).grid(row=1, column=2, padx=5, pady=5)
        
    def create_settings_section(self, parent):
        settings_frame = ttk.LabelFrame(parent, text="Processing Settings", padding="10")
        settings_frame.pack(fill=tk.X, padx=10, pady=5)
        
        # Comment column name
        ttk.Label(settings_frame, text="Comment Column:").grid(row=0, column=0, sticky=tk.W, padx=5, pady=5)
        self.comment_column_var = tk.StringVar(value="Comments")
        ttk.Entry(settings_frame, textvariable=self.comment_column_var, width=20).grid(row=0, column=1, padx=5, pady=5)
        
        # Transcription column name
        ttk.Label(settings_frame, text="Transcription Column:").grid(row=0, column=2, sticky=tk.W, padx=5, pady=5)
        self.transcription_column_var = tk.StringVar(value="transcription")
        ttk.Entry(settings_frame, textvariable=self.transcription_column_var, width=20).grid(row=0, column=3, padx=5, pady=5)
        
        # Top N keywords
        ttk.Label(settings_frame, text="Number of Keywords:").grid(row=1, column=0, sticky=tk.W, padx=5, pady=5)
        self.top_n_var = tk.IntVar(value=5)
        ttk.Spinbox(settings_frame, from_=1, to=20, textvariable=self.top_n_var, width=5).grid(row=1, column=1, padx=5, pady=5)
        
        # Batch size
        ttk.Label(settings_frame, text="Batch Size:").grid(row=1, column=2, sticky=tk.W, padx=5, pady=5)
        self.batch_size_var = tk.IntVar(value=100)
        ttk.Spinbox(settings_frame, from_=10, to=500, textvariable=self.batch_size_var, width=5).grid(row=1, column=3, padx=5, pady=5)
        
    def create_output_section(self, parent):
        output_frame = ttk.LabelFrame(parent, text="Output Settings", padding="10")
        output_frame.pack(fill=tk.X, padx=10, pady=5)
        
        # Output directory selector
        ttk.Label(output_frame, text="Output Directory:").grid(row=0, column=0, sticky=tk.W, padx=5, pady=5)
        ttk.Entry(output_frame, textvariable=self.output_dir, width=60).grid(row=0, column=1, padx=5, pady=5)
        ttk.Button(output_frame, text="Browse...", command=self.browse_output_dir).grid(row=0, column=2, padx=5, pady=5)
        
    def create_log_section(self, parent):
        log_frame = ttk.LabelFrame(parent, text="Processing Log", padding="10")
        log_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
        
        # Log text area
        self.log_text = scrolledtext.ScrolledText(log_frame, wrap=tk.WORD, width=80, height=10)
        self.log_text.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
        self.log_text.config(state=tk.DISABLED)
        
    def create_control_buttons(self, parent):
        button_frame = ttk.Frame(parent)
        button_frame.pack(fill=tk.X, padx=10, pady=5)
        
        # Control buttons
        self.start_button = ttk.Button(button_frame, text="Start Processing", command=self.start_processing)
        self.start_button.pack(side=tk.LEFT, padx=5)
        
        self.stop_button = ttk.Button(button_frame, text="Stop", command=self.stop_processing, state=tk.DISABLED)
        self.stop_button.pack(side=tk.LEFT, padx=5)
        
        ttk.Button(button_frame, text="Exit", command=self.root.destroy).pack(side=tk.RIGHT, padx=5)
        
    def browse_comments_file(self):
        filepath = filedialog.askopenfilename(
            filetypes=[("CSV files", "*.csv"), ("Excel files", "*.xlsx;*.xls"), ("All files", "*.*")]
        )
        if filepath:
            self.comments_file_path.set(filepath)
            
    def browse_videos_file(self):
        filepath = filedialog.askopenfilename(
            filetypes=[("Excel files", "*.xlsx;*.xls"), ("CSV files", "*.csv"), ("All files", "*.*")]
        )
        if filepath:
            self.videos_file_path.set(filepath)
            
    def browse_output_dir(self):
        dirpath = filedialog.askdirectory()
        if dirpath:
            self.output_dir.set(dirpath)
            
    def log(self, message):
        self.root.after(0, self._log, message)
        
    def _log(self, message):
        self.log_text.config(state=tk.NORMAL)
        self.log_text.insert(tk.END, f"{message}\n")
        self.log_text.see(tk.END)
        self.log_text.config(state=tk.DISABLED)
        
    def update_status(self, message):
        self.status_var.set(message)
        
    def update_progress(self, value):
        self.progress_var.set(value)
        
    def start_processing(self):
        # Check if input files are provided
        if not self.comments_file_path.get() and not self.videos_file_path.get():
            messagebox.showerror("Error", "Please select at least one input file.")
            return
            
        # Disable start button and enable stop button
        self.start_button.config(state=tk.DISABLED)
        self.stop_button.config(state=tk.NORMAL)
        
        # Reset stop flag
        self.stop_requested = False
        
        # Clear log
        self.log_text.config(state=tk.NORMAL)
        self.log_text.delete(1.0, tk.END)
        self.log_text.config(state=tk.DISABLED)
        
        # Start processing in a separate thread
        self.processing_thread = threading.Thread(target=self.process_files)
        self.processing_thread.daemon = True
        self.processing_thread.start()
        
    def stop_processing(self):
        self.stop_requested = True
        self.log("Stop requested. Waiting for current batch to complete...")
        self.update_status("Stopping...")
        
    def process_files(self):
        try:
            self.log("Starting keyword extraction process...")
            self.update_status("Initializing...")
            
            # Set parameters
            comment_column = self.comment_column_var.get()
            transcription_column = self.transcription_column_var.get()
            top_n = self.top_n_var.get()
            batch_size = self.batch_size_var.get()
            output_dir = self.output_dir.get()
            
            # Download NLTK resources with better error handling
            self.log("Downloading NLTK resources...")
            self.update_status("Downloading NLTK resources...")
            
            # Ensure NLTK data directory exists
            import os
            nltk_data_dir = os.path.expanduser("~/nltk_data")
            if not os.path.exists(nltk_data_dir):
                os.makedirs(nltk_data_dir, exist_ok=True)
                
            # Download resources with download_dir specified and explicit downloading
            try:
                self.log("Downloading punkt tokenizer...")
                nltk.download('punkt', download_dir=nltk_data_dir, quiet=False)
                
                self.log("Downloading stopwords...")
                nltk.download('stopwords', download_dir=nltk_data_dir, quiet=False)
                
                self.log("Downloading wordnet...")
                nltk.download('wordnet', download_dir=nltk_data_dir, quiet=False)
                
                # Verify the downloads
                from nltk.data import find
                try:
                    find('tokenizers/punkt')
                    find('corpora/stopwords')
                    find('corpora/wordnet')
                    self.log("All NLTK resources successfully downloaded and verified.")
                except LookupError as e:
                    self.log(f"Error verifying NLTK resources: {e}")
                    self.log("Attempting alternative download method...")
                    # Try alternative download method
                    import subprocess
                    python_executable = sys.executable
                    subprocess.call([python_executable, '-m', 'nltk.downloader', 'punkt', 'stopwords', 'wordnet'])
                    self.log("Alternative download completed. Continuing with processing...")
                    
            except Exception as e:
                self.log(f"Error downloading NLTK resources: {e}")
                messagebox.showwarning("NLTK Download Warning", 
                                      "There was an issue downloading NLTK resources. You may need to download them manually.\n\n"
                                      "Try running these commands in your Python environment:\n"
                                      "import nltk\n"
                                      "nltk.download('punkt')\n"
                                      "nltk.download('stopwords')\n"
                                      "nltk.download('wordnet')")
            
            # Load data
            comments_df = None
            videos_df = None
            
            if self.comments_file_path.get():
                self.log(f"Loading comments data from {self.comments_file_path.get()}...")
                self.update_status("Loading comments data...")
                if self.comments_file_path.get().lower().endswith('.csv'):
                    comments_df = pd.read_csv(self.comments_file_path.get())
                else:
                    comments_df = pd.read_excel(self.comments_file_path.get())
                self.log(f"Successfully loaded comments data with {len(comments_df)} rows")
                self.log(f"Columns in comments_df: {comments_df.columns.tolist()}")
                
            if self.videos_file_path.get():
                self.log(f"Loading videos data from {self.videos_file_path.get()}...")
                self.update_status("Loading videos data...")
                if self.videos_file_path.get().lower().endswith('.csv'):
                    videos_df = pd.read_csv(self.videos_file_path.get())
                else:
                    videos_df = pd.read_excel(self.videos_file_path.get())
                self.log(f"Successfully loaded videos data with {len(videos_df)} rows")
                self.log(f"Columns in videos_df: {videos_df.columns.tolist()}")
                
            # Check if the specified columns exist
            if comments_df is not None and comment_column not in comments_df.columns:
                self.log(f"Warning: '{comment_column}' column not found in comments DataFrame")
                self.log(f"Available columns: {comments_df.columns.tolist()}")
                potential_columns = [col for col in comments_df.columns if 'comment' in col.lower()]
                if potential_columns:
                    comment_column = potential_columns[0]
                    self.log(f"Using '{comment_column}' as comment column instead")
                else:
                    comments_df = None
                    self.log("No suitable comment column found. Skipping comments processing.")
                
            if videos_df is not None and transcription_column not in videos_df.columns:
                self.log(f"Warning: '{transcription_column}' column not found in videos DataFrame")
                self.log(f"Available columns: {videos_df.columns.tolist()}")
                potential_columns = [col for col in videos_df.columns if 'transcript' in col.lower() or 'text' in col.lower()]
                if potential_columns:
                    transcription_column = potential_columns[0]
                    self.log(f"Using '{transcription_column}' as transcription column instead")
                else:
                    videos_df = None
                    self.log("No suitable transcription column found. Skipping videos processing.")
            
            # Text cleaning and processing functions
            self.log("Setting up text processing functions...")
            
            def clean_text(text):
                if not isinstance(text, str):
                    return ""
                text = text.lower()
                text = re.sub(r'http\S+', '', text)
                text = re.sub(r'[^\w\s]', '', text)
                text = re.sub(r'\d+', '', text)
                text = re.sub(r'\s+', ' ', text).strip()
                return text
            
            stop_words = set(stopwords.words('english'))
            lemmatizer = WordNetLemmatizer()
            
            def process_text(text):
                tokens = word_tokenize(text)
                tokens_no_stopwords = [word for word in tokens if word not in stop_words]
                lemmatized = [lemmatizer.lemmatize(token) for token in tokens_no_stopwords]
                return ' '.join(lemmatized)
            
            # Initialize KeyBERT
            self.log("Initializing KeyBERT model...")
            self.update_status("Initializing KeyBERT model...")
            kw_model = KeyBERT()
            
            # Function to extract keywords
            def extract_keywords(text, top_n=5):
                if not text or len(text.split()) < 3:
                    return []
                try:
                    keywords = kw_model.extract_keywords(
                        text,
                        keyphrase_ngram_range=(1, 2),
                        stop_words='english',
                        use_mmr=True,
                        diversity=0.7,
                        top_n=top_n
                    )
                    return keywords
                except Exception as e:
                    self.log(f"Error extracting keywords: {e}")
                    return []
            
            # Process comments if available
            if comments_df is not None:
                self.log("\nProcessing comments data...")
                self.update_status("Processing comments data...")
                
                # Clean and preprocess text
                self.log("Cleaning and preprocessing comments...")
                comments_df['clean_text'] = comments_df[comment_column].apply(clean_text)
                comments_df['processed_text'] = comments_df['clean_text'].apply(process_text)
                
                # Extract keywords in batches
                self.log("Extracting keywords from comments...")
                keywords_list = []
                total_batches = (len(comments_df) + batch_size - 1) // batch_size
                
                for i in range(0, len(comments_df), batch_size):
                    if self.stop_requested:
                        self.log("Processing stopped by user.")
                        break
                    
                    batch_end = min(i + batch_size, len(comments_df))
                    self.log(f"Processing comment batch {i+1} to {batch_end} of {len(comments_df)}")
                    self.update_status(f"Processing comments: {batch_end}/{len(comments_df)}")
                    
                    batch = comments_df['processed_text'].iloc[i:batch_end]
                    batch_keywords = [extract_keywords(text, top_n) for text in batch]
                    keywords_list.extend(batch_keywords)
                    
                    # Update progress
                    progress = (batch_end / len(comments_df)) * 50  # First 50% of progress
                    self.update_progress(progress)
                    
                # Add keywords to dataframe
                if not self.stop_requested:
                    comments_df['keywords'] = keywords_list
                    comments_df['keywords_only'] = comments_df['keywords'].apply(
                        lambda kw_list: [k for k, _ in kw_list] if isinstance(kw_list, list) else []
                    )
                    
                    # Save processed comments
                    output_path = os.path.join(output_dir, 'comments_with_keywords.csv')
                    self.log(f"Saving processed comments to {output_path}")
                    comments_df.to_csv(output_path, index=False)
                    self.log(f"Saved comments with keywords to {output_path}")
                    
                    # Display sample results
                    self.log("\nSample keywords from comments:")
                    for i in range(min(3, len(comments_df))):
                        self.log(f"Comment: {comments_df[comment_column].iloc[i][:100]}...")
                        self.log(f"Keywords: {comments_df['keywords'].iloc[i]}")
                        self.log("")
            
            # Process videos if available
            if videos_df is not None and not self.stop_requested:
                self.log("\nProcessing video transcription data...")
                self.update_status("Processing video transcriptions...")
                
                # Clean and preprocess text
                self.log("Cleaning and preprocessing transcriptions...")
                videos_df['clean_text'] = videos_df[transcription_column].apply(clean_text)
                videos_df['processed_text'] = videos_df['clean_text'].apply(process_text)
                
                # Extract keywords in batches
                self.log("Extracting keywords from transcriptions...")
                keywords_list = []
                total_batches = (len(videos_df) + batch_size - 1) // batch_size
                
                for i in range(0, len(videos_df), batch_size):
                    if self.stop_requested:
                        self.log("Processing stopped by user.")
                        break
                    
                    batch_end = min(i + batch_size, len(videos_df))
                    self.log(f"Processing transcription batch {i+1} to {batch_end} of {len(videos_df)}")
                    self.update_status(f"Processing transcriptions: {batch_end}/{len(videos_df)}")
                    
                    batch = videos_df['processed_text'].iloc[i:batch_end]
                    batch_keywords = [extract_keywords(text, top_n) for text in batch]
                    keywords_list.extend(batch_keywords)
                    
                    # Update progress
                    base_progress = 50 if comments_df is not None else 0
                    progress = base_progress + (batch_end / len(videos_df)) * (100 - base_progress)
                    self.update_progress(progress)
                    
                # Add keywords to dataframe
                if not self.stop_requested:
                    videos_df['keywords'] = keywords_list
                    videos_df['keywords_only'] = videos_df['keywords'].apply(
                        lambda kw_list: [k for k, _ in kw_list] if isinstance(kw_list, list) else []
                    )
                    
                    # Save processed videos
                    output_path = os.path.join(output_dir, 'videos_with_keywords.csv')
                    self.log(f"Saving processed videos to {output_path}")
                    videos_df.to_csv(output_path, index=False)
                    self.log(f"Saved videos with keywords to {output_path}")
                    
                    # Display sample results
                    self.log("\nSample keywords from videos:")
                    for i in range(min(3, len(videos_df))):
                        self.log(f"Transcription: {videos_df[transcription_column].iloc[i][:100]}...")
                        self.log(f"Keywords: {videos_df['keywords'].iloc[i]}")
                        self.log("")
            
            # Completion message
            if self.stop_requested:
                self.update_status("Processing stopped by user")
                self.log("Processing stopped by user before completion.")
            else:
                self.update_status("Processing completed")
                self.log("Keyword extraction completed successfully!")
                self.update_progress(100)
                messagebox.showinfo("Success", "Keyword extraction completed successfully!")
        
        except Exception as e:
            self.log(f"Error during processing: {e}")
            import traceback
            self.log(traceback.format_exc())
            self.update_status("Error occurred")
            messagebox.showerror("Error", f"An error occurred during processing: {str(e)}")
        
        finally:
            # Re-enable start button and disable stop button
            self.root.after(0, lambda: self.start_button.config(state=tk.NORMAL))
            self.root.after(0, lambda: self.stop_button.config(state=tk.DISABLED))

# Main application launcher
if __name__ == "__main__":
    import sys
    
    # Check if required packages are installed
    try:
        import nltk
        import pandas as pd
        import numpy as np
        from keybert import KeyBERT
        
        # Pre-download NLTK resources before GUI starts
        print("Pre-downloading NLTK resources...")
        nltk_data_dir = os.path.expanduser("~/nltk_data")
        if not os.path.exists(nltk_data_dir):
            os.makedirs(nltk_data_dir, exist_ok=True)
            
        # Function to check if resource exists
        def resource_exists(resource_path):
            try:
                from nltk.data import find
                find(resource_path)
                return True
            except LookupError:
                return False
        
        # Download resources if they don't exist
        if not resource_exists('tokenizers/punkt'):
            print("Downloading punkt tokenizer...")
            nltk.download('punkt', download_dir=nltk_data_dir, quiet=False)
            
        if not resource_exists('corpora/stopwords'):
            print("Downloading stopwords...")
            nltk.download('stopwords', download_dir=nltk_data_dir, quiet=False)
            
        if not resource_exists('corpora/wordnet'):
            print("Downloading wordnet...")
            nltk.download('wordnet', download_dir=nltk_data_dir, quiet=False)
            
    except ImportError as e:
        print(f"Missing required package: {e}")
        print("Please install the required packages using:")
        print("pip install pandas numpy nltk keybert transformers torch scikit-learn openpyxl")
        messagebox.showerror("Missing Packages", 
                            "Some required packages are missing. Please install them using:\n\n"
                            "pip install pandas numpy nltk keybert transformers torch scikit-learn openpyxl")
        exit(1)
    
    # Create and run the application
    root = tk.Tk()
    app = KeywordExtractionApp(root)
    root.mainloop()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User/nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to ./nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to ./nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Pre-downloading NLTK resources...


In [8]:
#STEPS 11 - Implementing BERT model
#Try Accuracy Testing MentalBERT

# Install dependencies (uncomment in Colab or terminal)
# !pip install transformers torch pandas scikit-learn

import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

hf_token = "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" # Keep your token secure!

# Load MentalBERT model and tokenizer using the token
try:
    # Try loading with the token argument
    tokenizer = AutoTokenizer.from_pretrained("mental/mental-bert-base-uncased", token=hf_token)
    model = AutoModel.from_pretrained("mental/mental-bert-base-uncased", token=hf_token)
except TypeError:
    # Fallback for older transformers versions that might not accept 'token' directly
    # In this case, ensure you are logged in via CLI or notebook_login()
    print("Token argument not accepted or login required. Ensure you are logged in via CLI or notebook_login().")
    tokenizer = AutoTokenizer.from_pretrained("mental/mental-bert-base-uncased")
    model = AutoModel.from_pretrained("mental/mental-bert-base-uncased")


# Function to generate embedding using MentalBERT
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze()

# Function to rank keywords against a user input
def rank_keywords(df, keyword_column, user_input):
    user_embedding = get_embedding(user_input)
    rankings = []

    for keyword in df[keyword_column].dropna().unique():
        try:
            keyword_embedding = get_embedding(keyword)
            score = cosine_similarity([user_embedding.numpy()], [keyword_embedding.numpy()])[0][0]
            rankings.append((keyword, score))
        except Exception as e:
            print(f"Error processing keyword: {keyword} — {str(e)}")
            continue

    return sorted(rankings, key=lambda x: x[1], reverse=True)

# Load your CSV files
videos_df = pd.read_csv("G:/Other computers/My Laptop (1)/Pelajaran UiTM/Classes and Lectures (Semester 6)/CSP650/Developments/Simplified Data/Testings/comments_with_keywords.csv")
comments_df = pd.read_csv("G:/Other computers/My Laptop (1)/Pelajaran UiTM/Classes and Lectures (Semester 6)/CSP650/Developments/Simplified Data/Testings/videos_with_keywords.csv")

# Input from user
user_input = "I'm feeling down"

# Rank keywords separately for videos and comments
video_ranks = rank_keywords(videos_df, "keywords", user_input)
comment_ranks = rank_keywords(comments_df, "keywords", user_input)

# Convert to DataFrame and display
video_results = pd.DataFrame(video_ranks, columns=["Keyword", "Similarity Score"])
comment_results = pd.DataFrame(comment_ranks, columns=["Keyword", "Similarity Score"])

print("\nTop Video Keywords Relevant to Input:")
print(video_results.head(10))

print("\nTop Comment Keywords Relevant to Input:")
print(comment_results.head(10))


Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [5]:
!pip install transformers
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sklearn.preprocessing import normalize

# Your Hugging Face token
hf_token = "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"  # Keep your token secure!

# Load MentalBERT model and tokenizer using the token
try:
    tokenizer = AutoTokenizer.from_pretrained("mental/mental-bert-base-uncased", token=hf_token)
    model = AutoModel.from_pretrained("mental/mental-bert-base-uncased", token=hf_token)
except TypeError:
    print("Token argument not accepted or login required. Ensure you are logged in via CLI or notebook_login().")
    tokenizer = AutoTokenizer.from_pretrained("mental/mental-bert-base-uncased")
    model = AutoModel.from_pretrained("mental/mental-bert-base-uncased")

# Function to generate embedding using MentalBERT
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze()

# Function to generate embeddings for a batch of texts
def get_batch_embeddings(texts):
    # Ensure all texts are strings before tokenizing
    texts = [str(text) for text in texts]  # Convert all to strings
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

# Function to rank keywords against a user input with batch processing
def rank_keywords_batch(df, keyword_column, user_input, batch_size=10):
    user_embedding = get_embedding(user_input)
    rankings = []
    
    # Process keywords in batches with tqdm progress bar
    for i in tqdm(range(0, len(df[keyword_column]), batch_size), desc="Processing keywords"):
        keywords_batch = df[keyword_column].dropna().unique()[i:i+batch_size]
        
        # Convert all keywords to string before embedding
        keywords_batch = [str(keyword) for keyword in keywords_batch]
        
        try:
            keyword_embeddings = get_batch_embeddings(keywords_batch)
            similarities = cosine_similarity([user_embedding.numpy()] * len(keywords_batch), keyword_embeddings.numpy())
            
            for keyword, score in zip(keywords_batch, similarities):
                rankings.append((keyword, score[0]))
        except Exception as e:
            print(f"Error processing batch of keywords: {str(e)}")
            continue

    return sorted(rankings, key=lambda x: x[1], reverse=True)

# Load your CSV files (make sure paths are correct)
videos_df = pd.read_csv("G:/Other computers/My Laptop (1)/Pelajaran UiTM/Classes and Lectures (Semester 6)/CSP650/Developments/Simplified Data/Testings/comments_with_keywords.csv")
comments_df = pd.read_csv("G:/Other computers/My Laptop (1)/Pelajaran UiTM/Classes and Lectures (Semester 6)/CSP650/Developments/Simplified Data/Testings/videos_with_keywords.csv")

# Clean up the data (remove any rows with missing 'keywords')
videos_df = videos_df.dropna(subset=["keywords"])
comments_df = comments_df.dropna(subset=["keywords"])

# Input from user
user_input = "I'm feeling down"

# Normalize embeddings before calculating cosine similarity
normalized_user_embedding = normalize([user_embedding.numpy()])
normalized_keyword_embeddings = normalize(keyword_embeddings)

similarities = cosine_similarity(normalized_user_embedding, normalized_keyword_embeddings)

# Rank keywords separately for videos and comments using batch processing
# Function to rank keywords against a user input with batch processing
def rank_keywords_batch(df, keyword_column, user_input, batch_size=10):
    user_embedding = get_embedding(user_input)

    rankings = []
    
    # Process keywords in batches with tqdm progress bar
    for i in tqdm(range(0, len(df[keyword_column]), batch_size), desc="Processing keywords"):
        # Get the current batch of keywords
        keywords_batch = df[keyword_column].dropna().unique()[i:i+batch_size]
        
        # Ensure the batch is not empty
        if not keywords_batch:
            continue
        
        # Convert all keywords to string before embedding
        keywords_batch = [str(keyword) for keyword in keywords_batch]
        
        try:
            keyword_embeddings = get_batch_embeddings(keywords_batch)
            
            # Move embeddings to CPU if necessary before calculating similarity
            keyword_embeddings = keyword_embeddings.cpu().numpy()
            
            similarities = cosine_similarity([user_embedding.numpy()] * len(keywords_batch), keyword_embeddings)
            
            # Rank keywords based on similarity
            for keyword, score in zip(keywords_batch, similarities):
                rankings.append((keyword, score[0]))

        except Exception as e:
            print(f"Error processing batch of keywords: {str(e)}")
            continue

    return sorted(rankings, key=lambda x: x[1], reverse=True)

# Convert results to DataFrame and display
video_results = pd.DataFrame(video_ranks, columns=["Keyword", "Similarity Score"])
comment_results = pd.DataFrame(comment_ranks, columns=["Keyword", "Similarity Score"])

print("\nTop Video Keywords Relevant to Input:")
print(video_results.head(10))

print("\nTop Comment Keywords Relevant to Input:")
print(comment_results.head(10))




None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


ModuleNotFoundError: No module named 'torch'