In [2]:
import pandas as pd
import spacy
from rake_nltk import Rake
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from collections import Counter
from tqdm import tqdm
import nltk

nltk.download('punkt')

# Load SpaCy model
nlp = spacy.load("en_core_web_lg")

# Load preprocessed data
df = pd.read_csv("cleaned_text.csv")
reviews = df['cleaned_text'].dropna().tolist()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Step 1: Dependency Parsing to Extract Nouns & Noun Phrases

In [5]:
from tqdm import tqdm

texts = df['cleaned_text'].astype(str).fillna("").tolist()
noun_phrases_list = []

for doc in tqdm(nlp.pipe(texts, batch_size=64), total=len(texts)):
    noun_phrases = [
        chunk.lemma_.lower()
        for chunk in doc.noun_chunks
        if 1 <= len(chunk.text.split()) <= 3
    ]
    noun_phrases_list.append(noun_phrases)

df['noun_phrases'] = noun_phrases_list

100%|██████████| 519886/519886 [07:45<00:00, 1116.34it/s]


# Step 2: RAKE Keyword Extraction

In [9]:
# Handle missing values by replacing NaNs with empty strings
df['cleaned_text'] = df['cleaned_text'].fillna("")

# Now run the extraction using ThreadPoolExecutor
from concurrent.futures import ThreadPoolExecutor

# Define the function to extract RAKE keywords
def extract_rake_keywords(text):
    rake = Rake()
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases()

# Process the text in parallel
with ThreadPoolExecutor() as executor:
    df['rake_keywords'] = list(executor.map(extract_rake_keywords, df['cleaned_text']))

# Combine noun phrases and RAKE keywords
df['aspect_candidates'] = df.apply(
    lambda row: list(set(row['noun_phrases'] + row['rake_keywords'])), axis=1
)


# Step 3: Train Word2Vec Model for Similarity Filtering

In [10]:
# Tokenize sentences
tokenized_reviews = [word_tokenize(text) for text in reviews]

# Train Word2Vec model
w2v_model = Word2Vec(sentences=tokenized_reviews, vector_size=100, window=5, min_count=2, workers=4)

# Define seed aspects
seed_aspects = ["content", "video", "quiz", "assignment", "teacher", "platform", "course", "usability"]

# Expand seed aspects using similarity
similar_terms = {}
for word in seed_aspects:
    if word in w2v_model.wv:
        similar_terms[word] = w2v_model.wv.most_similar(word, topn=5)

# Display expanded aspect clusters
for k, v in similar_terms.items():
    print(f"\nAspect Cluster: {k}")
    for term, score in v:
        print(f" - {term} ({round(score, 2)})")


Aspect Cluster: content
 - material (0.79)
 - information (0.61)
 - syllabus (0.59)
 - lecture (0.55)
 - topic (0.53)

Aspect Cluster: video
 - lecture (0.73)
 - slide (0.7)
 - clip (0.63)
 - transcript (0.62)
 - podcast (0.61)

Aspect Cluster: quiz
 - test (0.76)
 - quizz (0.74)
 - exam (0.73)
 - homework (0.69)
 - assessment (0.68)

Aspect Cluster: assignment
 - exercise (0.85)
 - assigment (0.83)
 - homework (0.82)
 - assessment (0.76)
 - assignement (0.75)

Aspect Cluster: teacher
 - instructor (0.83)
 - professor (0.82)
 - lecturer (0.8)
 - tutor (0.77)
 - prof (0.66)

Aspect Cluster: platform
 - site (0.58)
 - opportunity (0.58)
 - app (0.55)
 - service (0.53)
 - environment (0.53)

Aspect Cluster: course
 - class (0.65)
 - one (0.54)
 - program (0.46)
 - believe (0.44)
 - necessary (0.43)

Aspect Cluster: usability
 - modernday (0.82)
 - uso (0.82)
 - microcourse (0.82)
 - weekin (0.81)
 - dictatorship (0.81)


# Step 4: Aspect Term Frequency & Filtering

In [11]:
# Flatten all extracted aspect terms
all_aspects = [aspect for sublist in df['aspect_candidates'] for aspect in sublist]
aspect_counter = Counter(all_aspects)

# Filter frequently mentioned aspects (threshold can be adjusted)
common_aspects = {aspect: count for aspect, count in aspect_counter.items() if count >= 10}
sorted_aspects = sorted(common_aspects.items(), key=lambda x: x[1], reverse=True)

print("\nTop 20 Frequent Aspect Terms:")
for aspect, freq in sorted_aspects[:20]:
    print(f"{aspect}: {freq}")


Top 20 Frequent Aspect Terms:
course: 56931
nan: 30697
great course: 19505
good course: 17644
excellent course: 10834
good: 10541
lot: 9087
thank: 7388
people: 4711
amazing course: 4244
excellent: 4171
concept: 3772
nice course: 3597
great: 3329
knowledge: 2911
love course: 2874
love: 2822
awesome course: 2626
thing: 2544
wonderful course: 2348


# Step 5: Combine and Save File 

In [12]:
# After extracting and processing the aspects

# Combine noun phrases and RAKE keywords
df['aspect_candidates'] = df.apply(
    lambda row: list(set(row['noun_phrases'] + row['rake_keywords'])), axis=1
)

# Optionally filter frequently mentioned aspects (e.g., threshold based on count)
all_aspects = [aspect for sublist in df['aspect_candidates'] for aspect in sublist]
aspect_counter = Counter(all_aspects)
common_aspects = {aspect: count for aspect, count in aspect_counter.items() if count >= 10}
sorted_aspects = sorted(common_aspects.items(), key=lambda x: x[1], reverse=True)

# Optionally print the top aspects
print("\nTop Frequent Aspect Terms:")
for aspect, freq in sorted_aspects[:20]:
    print(f"{aspect}: {freq}")

# Save the final DataFrame containing aspect candidates to a CSV file
df.to_csv("final_extracted_aspects.csv", index=False)
print(df.head())  # Check the first few rows of the dataframe


Top Frequent Aspect Terms:
course: 56931
nan: 30697
great course: 19505
good course: 17644
excellent course: 10834
good: 10541
lot: 9087
thank: 7388
people: 4711
amazing course: 4244
excellent: 4171
concept: 3772
nice course: 3597
great: 3329
knowledge: 2911
love course: 2874
love: 2822
awesome course: 2626
thing: 2544
wonderful course: 2348
                                             reviews       reviewers  \
0  Pretty dry, but I was able to pass with just t...     By Robert S   
1  would be a better experience if the video and ...  By Gabriel E R   
2  Information was perfect! The program itself wa...      By Jacob D   
3  A few grammatical mistakes on test made me do ...       By Dale B   
4  Excellent course and the training provided was...       By Sean G   

   date_reviews  rating                 course_id  \
0  Feb 12, 2020       4  google-cbrs-cpi-training   
1  Sep 28, 2020       4  google-cbrs-cpi-training   
2  Apr 08, 2020       4  google-cbrs-cpi-training   
3  Feb 24,