## Imports

In [42]:
import re
import os
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
import spacy
import string
import nltk
from langdetect import detect

from tqdm import tqdm

## Set up environment

In [43]:
current_path = os.getcwd()
print(f"Current path: {current_path}")

Current path: /home/breezy-s-pc/Study/senti with bert/notebook


## Configure NLTK

In [44]:
custom_directory = "../artifacts"
nltk.download('punkt', download_dir=custom_directory, quiet=True)
nltk.download('stopwords', download_dir=custom_directory, quiet=True)
nltk.data.path.append(custom_directory)

## Load data

In [45]:
df = pd.read_csv("../artifacts/allReviews.csv", usecols=["Comment", "Rating"])
print(f"Initial shape: {df.shape}")

Initial shape: (903304, 2)


In [46]:
df.head()

Unnamed: 0,Comment,Rating
0,I already have a background in naturopathic me...,4.0
1,Very good course and it suits for everyone who...,5.0
2,A good introduction to herbal medicine. Not ve...,5.0
3,I'm glad to be a part of this course. As it ch...,5.0
4,Although I have already learned a lot regardin...,5.0


## remove Null values

In [47]:
print(f"Initial shape: {df.shape}")
# remove rows with Null values
df = df.dropna(subset=["Comment", "Rating"])
# remove empty strings
df = df[df["Comment"].str.strip().astype(bool)]
print(f"Shape after null removal: {df.shape}")


Initial shape: (903304, 2)
Shape after null removal: (903144, 2)


## English detection

In [48]:
def detect_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

## Parallel processing with chunking

In [49]:
def parallel_process(func, series, n_jobs=-1, chunk_size=1000):
    chunks = [series[i:i + chunk_size] for i in range(0, len(series), chunk_size)]
    results = Parallel(n_jobs=n_jobs)(
        delayed(func)(chunk) for chunk in chunks
    )
    return np.concatenate(results)

## Detect English in chunks

In [50]:
english_mask = parallel_process(
    lambda chunk: chunk.apply(detect_english).values,
    df['Comment'],
    n_jobs=-1
)
df = df[english_mask].reset_index(drop=True)
print(f"English reviews count: {len(df)}")

English reviews count: 746876


## Text cleaning

In [52]:
def clean_text(text):
    text = text.lower().strip()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)
    text = re.sub(r'\d+', '', text)
    return re.sub(r'\s+', ' ', text)


df['cleaned_text'] = parallel_process(
    lambda chunk: chunk.apply(clean_text).values,
    df['Comment'],
    n_jobs=-1
)

## SpaCy setup with efficient pipeline

In [53]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [54]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
stop_words = nltk.corpus.stopwords.words('english')

## Enhanced sentiment whitelist

In [55]:
SENTIMENT_WHITELIST = {
    # Intensifiers
    "absolutely", "barely", "completely", "entirely", "exceptionally", "extremely",
    "fully", "highly", "incredibly", "insanely", "marginally", "moderately",
    "particularly", "partially", "quite", "really", "remarkably", "slightly",
    "somewhat", "terribly", "thoroughly", "totally", "too", "utterly", "very",

    # Negations (standard + variants without apostrophes)
    "ain't", "aint", "aren't", "arent", "can't", "cant", "cannot", "couldn't", "couldnt",
    "didn't", "didnt", "doesn't", "doesnt", "don't", "dont", "hadn't", "hadnt",
    "hasn't", "hasnt", "haven't", "havent", "isn't", "isnt", "mightn't", "mightnt",
    "mustn't", "mustnt", "neither", "never", "no", "nobody", "none", "nor", "not",
    "nothing", "nowhere", "shouldn't", "shouldnt", "wasn't", "wasnt", "weren't",
    "werent", "wouldn't", "wouldnt", "wont", "won't",

    # Contrast/Concession
    "although", "but", "despite", "except", "however", "though", "yet"
}


## lemmatization with batch processing

In [56]:


lemmatized_texts = []
docs = nlp.pipe(df['cleaned_text'], n_process=-1, batch_size=50)
for doc in tqdm(docs, total=len(df), desc="Lemmatizing Texts"):
    tokens = [
        token.lemma_ for token in doc
        if (token.is_alpha and not token.is_stop) or token.lemma_.lower() in SENTIMENT_WHITELIST
    ]
    lemmatized_texts.append(" ".join(tokens))
df['lemmatized_text'] = lemmatized_texts

Lemmatizing Texts: 100%|██████████| 746876/746876 [05:12<00:00, 2390.11it/s]


In [57]:
df.head()

Unnamed: 0,Comment,Rating,cleaned_text,lemmatized_text
0,I already have a background in naturopathic me...,4.0,i already have a background in naturopathic me...,background naturopathic medicine take course r...
1,Very good course and it suits for everyone who...,5.0,very good course and it suits for everyone who...,very good course suit like herbal medicinethe ...
2,A good introduction to herbal medicine. Not ve...,5.0,a good introduction to herbal medicine not ver...,good introduction herbal medicine not very use...
3,I'm glad to be a part of this course. As it ch...,5.0,im glad to be a part of this course as it chan...,m glad course change entire perspective utiliz...
4,Although I have already learned a lot regardin...,5.0,although i have already learned a lot regardin...,although learn lot botanical treatment get ver...


In [58]:
print(df["Rating"].value_counts())

Rating
5.0    580056
4.0    108386
3.0     30727
1.0     14680
2.0     13027
Name: count, dtype: int64


## save the csv

In [59]:
df.to_csv("../artifacts/preprocessed_reviews.csv", index=False, header=True)
print("Preprocessing complete!")
print(f"Final shape: {df.shape}")

Preprocessing complete!
Final shape: (746876, 4)


In [60]:
df.head()

Unnamed: 0,Comment,Rating,cleaned_text,lemmatized_text
0,I already have a background in naturopathic me...,4.0,i already have a background in naturopathic me...,background naturopathic medicine take course r...
1,Very good course and it suits for everyone who...,5.0,very good course and it suits for everyone who...,very good course suit like herbal medicinethe ...
2,A good introduction to herbal medicine. Not ve...,5.0,a good introduction to herbal medicine not ver...,good introduction herbal medicine not very use...
3,I'm glad to be a part of this course. As it ch...,5.0,im glad to be a part of this course as it chan...,m glad course change entire perspective utiliz...
4,Although I have already learned a lot regardin...,5.0,although i have already learned a lot regardin...,although learn lot botanical treatment get ver...
