# Step 1: Setup and Load Dataset

In [3]:
import numpy as np
import pandas as pd
import spacy
import re
import string
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import os


# Download stopwords
# The English stop-word list is kept as a set so the per-token membership
# test in the lemmatization step is a hash lookup rather than a list scan.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load SpaCy model
# NOTE(review): the full pipeline is loaded. If nothing in the notebook needs
# named entities, disabling "ner" here would likely shorten the preprocessing
# run — confirm against the remaining cells before changing.
nlp = spacy.load("en_core_web_lg")

# Load the Coursera dataset
# The path is relative, so the CSV must sit in the kernel's working directory.
df = pd.read_csv("Coursera_reviews.csv")

# Show sample
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,reviews,reviewers,date_reviews,rating,course_id
0,"Pretty dry, but I was able to pass with just t...",By Robert S,"Feb 12, 2020",4,google-cbrs-cpi-training
1,would be a better experience if the video and ...,By Gabriel E R,"Sep 28, 2020",4,google-cbrs-cpi-training
2,Information was perfect! The program itself wa...,By Jacob D,"Apr 08, 2020",4,google-cbrs-cpi-training
3,A few grammatical mistakes on test made me do ...,By Dale B,"Feb 24, 2020",4,google-cbrs-cpi-training
4,Excellent course and the training provided was...,By Sean G,"Jun 18, 2020",4,google-cbrs-cpi-training


# Step 2: Text Cleaning Function

In [4]:
def clean_text(text):
    """Normalize a raw review into lowercase, punctuation-free text.

    Steps, in order: lowercase; remove ``[...]`` bracketed spans; remove
    URLs (anything starting with ``http`` or ``www``); remove every word
    that contains a digit; remove ASCII punctuation; collapse each run of
    whitespace (spaces, tabs, newlines) into a single space and trim.

    Args:
        text: The raw value. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    # A missing value would otherwise be stringified into the literal word
    # "none" / "nan" and end up in the cleaned corpus.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\w*\d\w*", "", text)
    # Punctuation is deleted (not spaced), so "don't" becomes "dont".
    text = re.sub(r"[%s]" % re.escape(string.punctuation), "", text)
    # Newlines are whitespace: collapsing them to a space here keeps the words
    # on either side of a line break separate instead of gluing them together.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Step 3: Tokenization + Lemmatization

In [5]:
def preprocess_text(text):
    """Clean a review, then reduce it to its content-word lemmas.

    The text is passed through ``clean_text`` and the SpaCy pipeline ``nlp``.
    A token's lemma is kept only when the lemma is not in ``stop_words``, the
    token's part of speech is NOUN, ADJ or VERB, and the token is purely
    alphabetic.

    Args:
        text: The raw review value.

    Returns:
        The kept lemmas joined by single spaces, in document order.
    """
    kept_lemmas = []
    for tok in nlp(clean_text(text)):
        lemma = tok.lemma_
        if lemma in stop_words:
            continue
        if tok.pos_ not in ["NOUN", "ADJ", "VERB"]:
            continue
        if not tok.is_alpha:
            continue
        kept_lemmas.append(lemma)
    return " ".join(kept_lemmas)

# Step 4: Apply Preprocessing to Dataset

In [7]:
# Locate the review-text column. Known names are tried in order; 'reviews' is
# the column name in the Coursera dataset, so it is matched explicitly rather
# than relying on it happening to be the first column of the frame.
candidate_columns = ('review', 'text', 'reviews')
text_column = next(
    (col for col in candidate_columns if col in df.columns),
    df.columns[0],  # Fallback to first column
)

# Drop NA and duplicates
# Rows without review text are removed first; duplicates are judged on all
# columns, so the same text from different reviewers is kept.
df = df.dropna(subset=[text_column]).drop_duplicates()

# Apply preprocessing
# tqdm.pandas() registers progress_apply so the long-running pass over every
# review shows a progress bar.
tqdm.pandas()
df['cleaned_text'] = df[text_column].progress_apply(preprocess_text)

# Save cleaned data
# NOTE(review): this is a machine-specific absolute path, unlike the relative
# path used to read the input; it must exist on the machine running the cell.
df.to_csv(r"C:\Users\ASUS\Desktop\ML\cleaned_text.csv", index=False)
df[['cleaned_text']].head()

100%|██████████| 519886/519886 [46:34<00:00, 186.05it/s]  


Unnamed: 0,cleaned_text
0,dry able pass complete watch happy usual quest...
1,well experience video screen shot sho side tex...
2,information perfect program little annoying wa...
3,grammatical mistake test make double take bad
4,excellent course training provide detailed eas...
