In [25]:
import re
import pandas as pd
from langdetect import detect
import nltk
import spacy
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [2]:
import os

current_path = os.getcwd()
print("Current path:", current_path)


Current path: /home/breezy-s-pc/Study/student_feedback_sentiment_analysis/notebook


In [3]:
custom_directory = "../artifacts"

nltk.download('punkt', download_dir=custom_directory)
nltk.download('stopwords', download_dir=custom_directory)


[nltk_data] Downloading package punkt to ../artifacts...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to ../artifacts...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
#change the nltk base directory
nltk.data.path.append(custom_directory)

# Going to read the csv file

In [5]:
df = pd.read_csv("../artifacts/allReviews.csv")

In [6]:
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,UserName,Date,Comment,Likes,Rating
0,0,0.0,Robert H,"Mar 3, 2020",I already have a background in naturopathic me...,14.0,4.0
1,1,1.0,MAKARANON W,"Apr 1, 2019",Very good course and it suits for everyone who...,6.0,5.0
2,2,2.0,Morgan,"Jun 26, 2019",A good introduction to herbal medicine. Not ve...,5.0,5.0
3,3,3.0,Javeria F,"Sep 16, 2020",I'm glad to be a part of this course. As it ch...,4.0,5.0
4,4,4.0,Zsuzsanna D,"Sep 21, 2020",Although I have already learned a lot regardin...,3.0,5.0
...,...,...,...,...,...,...,...
903299,903299,39.0,Christina S,"Oct 28, 2016",Great course to learn about the various web to...,0.0,4.0
903300,903300,40.0,Choo C S V,"Oct 25, 2016",Thank you for introducing me to so much Web 2....,0.0,4.0
903301,903301,41.0,Danial B O,"Oct 18, 2016",Free Tools for Interactive Classroom Learning,0.0,4.0
903302,903302,42.0,Cristia S L,"Mar 9, 2019",EXCELLENT,0.0,4.0


In [9]:
df = df[["Comment","Rating"]]

In [10]:
df

Unnamed: 0,Comment,Rating
0,I already have a background in naturopathic me...,4.0
1,Very good course and it suits for everyone who...,5.0
2,A good introduction to herbal medicine. Not ve...,5.0
3,I'm glad to be a part of this course. As it ch...,5.0
4,Although I have already learned a lot regardin...,5.0
...,...,...
903299,Great course to learn about the various web to...,4.0
903300,Thank you for introducing me to so much Web 2....,4.0
903301,Free Tools for Interactive Classroom Learning,4.0
903302,EXCELLENT,4.0


# filtering the english review

In [15]:
!pip install joblib



<h3>import joblib for parallel processing</h3>
<p>use this for use all the cpu cores and threds</p>

In [18]:
from joblib import Parallel, delayed

# definition for english language detect

In [19]:
def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

# use the joblib parallel "is_english" function

In [21]:
english_flags = Parallel(n_jobs=-1)(
    delayed(is_english)(text) for text in df['Comment']
)

# remove the other language and filter the english

In [22]:
df['is_english'] = english_flags
df = df[df['is_english']].drop(columns='is_english').reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_english'] = english_flags


In [23]:
df

Unnamed: 0,Comment,Rating
0,I already have a background in naturopathic me...,4.0
1,Very good course and it suits for everyone who...,5.0
2,A good introduction to herbal medicine. Not ve...,5.0
3,I'm glad to be a part of this course. As it ch...,5.0
4,Although I have already learned a lot regardin...,5.0
...,...,...
746995,Great course but the tools need updating. Amaz...,4.0
746996,Great course to learn about the various web to...,4.0
746997,Thank you for introducing me to so much Web 2....,4.0
746998,Free Tools for Interactive Classroom Learning,4.0


# make solid copy to protect the work

In [24]:
df_copy =df.copy()

 # remove the stop words using joblib for fast

In [26]:
stop_words = set(stopwords.words('english'))
def clean_text(text):
    # lowercase
    text = text.lower()
    # remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # remove numbers
    text = re.sub(r'\d+', '', text)
    # remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['cleaned_text']  = Parallel(n_jobs=-1)(delayed(clean_text)(text) for text in df['Comment'])

In [27]:
df

Unnamed: 0,Comment,Rating,cleaned_text
0,I already have a background in naturopathic me...,4.0,i already have a background in naturopathic me...
1,Very good course and it suits for everyone who...,5.0,very good course and it suits for everyone who...
2,A good introduction to herbal medicine. Not ve...,5.0,a good introduction to herbal medicine not ver...
3,I'm glad to be a part of this course. As it ch...,5.0,im glad to be a part of this course as it chan...
4,Although I have already learned a lot regardin...,5.0,although i have already learned a lot regardin...
...,...,...,...
746995,Great course but the tools need updating. Amaz...,4.0,great course but the tools need updating amazi...
746996,Great course to learn about the various web to...,4.0,great course to learn about the various web to...
746997,Thank you for introducing me to so much Web 2....,4.0,thank you for introducing me to so much web tools
746998,Free Tools for Interactive Classroom Learning,4.0,free tools for interactive classroom learning


# using lemmatize text
<p>for more accuracy we use lemmatize text instead of stemming</p>
<p>nlp.pipe(n_process=…)is	Designed for spaCy for safe parallelism and it  keeps order	Only works with spaCy</p>

In [31]:
nlp = spacy.load("en_core_web_sm")

def lemmatize_doc(doc):
    return ' '.join([
        token.lemma_
        for token in doc
        if token.is_alpha and not token.is_stop
    ])

# Use nlp.pipe to process docs in parallel
docs = nlp.pipe(df['cleaned_text'], n_process=-1)

# Apply lemmatization
df['lemmatized_text'] = [lemmatize_doc(doc) for doc in docs]


In [32]:
df.to_csv("../artifacts/cleaned_text.csv", index=True)

In [1]:
df

NameError: name 'df' is not defined