In [1]:
import re
import pandas as pd
from langdetect import detect
import nltk
import spacy
import string
import os

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [2]:
import os

current_path = os.getcwd()
print("Current path:", current_path)


Current path: /home/breezy-s-pc/Study/senti_05_22/notebook


In [3]:
custom_directory = "../artifacts"

nltk.download('punkt', download_dir=custom_directory)
nltk.download('stopwords', download_dir=custom_directory)


[nltk_data] Downloading package punkt to ../artifacts...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to ../artifacts...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
#change the nltk base directory
nltk.data.path.append(custom_directory)

# Going to read the csv file

In [5]:
df = pd.read_csv("../artifacts/sample_sentiment_analysis.csv")

In [6]:
df

Unnamed: 0,id,Rating,Comment
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...
...,...,...,...
7915,7916,0,Live out loud #lol #liveoutloud #selfie #smile...
7916,7917,0,We would like to wish you an amazing day! Make...
7917,7918,0,Helping my lovely 90 year old neighbor with he...
7918,7919,0,Finally got my #smart #pocket #wifi stay conne...


In [48]:
df = df[["Comment","Rating"]]

In [49]:
df

Unnamed: 0,Comment,Rating
0,#fingerprint #Pregnancy Test https://goo.gl/h1...,0
1,Finally a transparant silicon case ^^ Thanks t...,0
2,We love this! Would you go? #talk #makememorie...,0
3,I'm wired I know I'm George I was made that wa...,0
4,What amazing service! Apple won't even talk to...,1
...,...,...
7915,Live out loud #lol #liveoutloud #selfie #smile...,0
7916,We would like to wish you an amazing day! Make...,0
7917,Helping my lovely 90 year old neighbor with he...,0
7918,Finally got my #smart #pocket #wifi stay conne...,0


# filtering the english review

In [50]:
!pip install joblib



<h3>import joblib for parallel processing</h3>
<p>use this for use all the cpu cores and threds</p>

In [51]:
from joblib import Parallel, delayed

# definition for english language detect

In [52]:
def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

# use the joblib parallel "is_english" function

In [53]:
english_flags = Parallel(n_jobs=-1)(
    delayed(is_english)(text) for text in df['Comment']
)

# remove the other language and filter the english

In [54]:
df['is_english'] = english_flags
df = df[df['is_english']].drop(columns='is_english').reset_index(drop=True)

In [55]:
df

Unnamed: 0,Comment,Rating
0,#fingerprint #Pregnancy Test https://goo.gl/h1...,0
1,Finally a transparant silicon case ^^ Thanks t...,0
2,We love this! Would you go? #talk #makememorie...,0
3,I'm wired I know I'm George I was made that wa...,0
4,What amazing service! Apple won't even talk to...,1
...,...,...
7698,Live out loud #lol #liveoutloud #selfie #smile...,0
7699,We would like to wish you an amazing day! Make...,0
7700,Helping my lovely 90 year old neighbor with he...,0
7701,Finally got my #smart #pocket #wifi stay conne...,0


# import the symspellpy

In [56]:
from symspellpy import SymSpell,Verbosity

# create spell check function

In [57]:
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt", term_index=0, count_index=1)

def correct_spelling(text):
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    return suggestions[0].term if suggestions else text

# df['Corrected'] = df['Comment'].apply(correct_spelling)

spell_checked = Parallel(n_jobs=-1)(
    delayed(correct_spelling)(text) for text in df['Comment'])

df['Corrected'] = spell_checked

2025-05-22 14:31:50,734: E symspellpy.symspellpy] Dictionary file not found at frequency_dictionary_en_82_765.txt.


# make solid copy to protect the work

In [58]:
df_copy =df.copy()

 # remove the stop words using joblib for fast

In [59]:
stop_words = set(stopwords.words('english'))
def clean_text(text):
    # lowercase
    text = text.lower()
    # remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # remove numbers
    text = re.sub(r'\d+', '', text)
    # remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['cleaned_text']  = Parallel(n_jobs=-1)(delayed(clean_text)(text) for text in df['Corrected'])

In [60]:
df

Unnamed: 0,Comment,Rating,Corrected,cleaned_text
0,#fingerprint #Pregnancy Test https://goo.gl/h1...,0,fingerprint pregnancy test https goo gl h1mfqv...,fingerprint pregnancy test goo gl hmfqv androi...
1,Finally a transparant silicon case ^^ Thanks t...,0,finally a transparant silicon case thanks to m...,finally a transparant silicon case thanks to m...
2,We love this! Would you go? #talk #makememorie...,0,we love this would you go talk makememories un...,we love this would you go talk makememories un...
3,I'm wired I know I'm George I was made that wa...,0,i'm wired i know i'm george i was made that wa...,im wired i know im george i was made that way ...
4,What amazing service! Apple won't even talk to...,1,what amazing service apple won't even talk to ...,what amazing service apple wont even talk to m...
...,...,...,...,...
7698,Live out loud #lol #liveoutloud #selfie #smile...,0,live out loud lol liveoutloud selfie smile son...,live out loud lol liveoutloud selfie smile son...
7699,We would like to wish you an amazing day! Make...,0,we would like to wish you an amazing day make ...,we would like to wish you an amazing day make ...
7700,Helping my lovely 90 year old neighbor with he...,0,helping my lovely 90 year old neighbor with he...,helping my lovely year old neighbor with her i...
7701,Finally got my #smart #pocket #wifi stay conne...,0,finally got my smart pocket wifi stay connecte...,finally got my smart pocket wifi stay connecte...


# using lemmatize text
<p>for more accuracy we use lemmatize text instead of stemming</p>
<p>nlp.pipe(n_process=…)is	Designed for spaCy for safe parallelism and it  keeps order	Only works with spaCy</p>

In [62]:
nlp = spacy.load("en_core_web_sm")

def lemmatize_doc(doc):
    return ' '.join([
        token.lemma_
        for token in doc
        if token.is_alpha and not token.is_stop
    ])

# Use nlp.pipe to process docs in parallel
docs = nlp.pipe(df['cleaned_text'], n_process=-1)

# Apply lemmatization
df['lemmatized_text'] = [lemmatize_doc(doc) for doc in docs]


In [63]:
df.to_csv("../artifacts/sample_checked.csv", index=True)

In [64]:
df

Unnamed: 0,Comment,Rating,Corrected,cleaned_text,lemmatized_text
0,#fingerprint #Pregnancy Test https://goo.gl/h1...,0,fingerprint pregnancy test https goo gl h1mfqv...,fingerprint pregnancy test goo gl hmfqv androi...,fingerprint pregnancy test goo gl hmfqv androi...
1,Finally a transparant silicon case ^^ Thanks t...,0,finally a transparant silicon case thanks to m...,finally a transparant silicon case thanks to m...,finally transparant silicon case thank uncle y...
2,We love this! Would you go? #talk #makememorie...,0,we love this would you go talk makememories un...,we love this would you go talk makememories un...,love talk makememorie unplug relax iphone smar...
3,I'm wired I know I'm George I was made that wa...,0,i'm wired i know i'm george i was made that wa...,im wired i know im george i was made that way ...,m wire know m george way iphone cute daventry ...
4,What amazing service! Apple won't even talk to...,1,what amazing service apple won't even talk to ...,what amazing service apple wont even talk to m...,amazing service apple will not talk question p...
...,...,...,...,...,...
7698,Live out loud #lol #liveoutloud #selfie #smile...,0,live out loud lol liveoutloud selfie smile son...,live out loud lol liveoutloud selfie smile son...,live loud lol liveoutloud selfie smile sony mu...
7699,We would like to wish you an amazing day! Make...,0,we would like to wish you an amazing day make ...,we would like to wish you an amazing day make ...,like wish amazing day minute count tls today i...
7700,Helping my lovely 90 year old neighbor with he...,0,helping my lovely 90 year old neighbor with he...,helping my lovely year old neighbor with her i...,help lovely year old neighbor ipad morning rea...
7701,Finally got my #smart #pocket #wifi stay conne...,0,finally got my smart pocket wifi stay connecte...,finally got my smart pocket wifi stay connecte...,finally get smart pocket wifi stay connect any...
