# Text Analytics - Exercise on Basic Text Processing

#### Name: Abdul Hakiim bin Ahmad Rosli (SW01081337)

In [1]:
# Before running the script, download the necessary NLTK data
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abdulhakiim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/abdulhakiim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abdulhakiim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [3]:
# Load the dataset
df = pd.read_csv('Reviews.csv')

# Remove duplicates
df.drop_duplicates(subset=['Text'], inplace=True)

In [4]:
# Remove punctuations, special characters, and digits
df['Text'] = df['Text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Text'] = df['Text'].apply(lambda x: re.sub(r'\d+', '', x))

In [5]:
# Convert to lowercase
df['Text'] = df['Text'].apply(lambda x: x.lower())

In [6]:
# Remove stopwords
stop_words = set(stopwords.words('english'))
df['Text'] = df['Text'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))

In [7]:
# Perform stemming & lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

df['Text_stemmed'] = df['Text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x)]))
df['Text_lemmatized'] = df['Text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(x)]))

In [8]:
# Display the first few rows of processed text
print(df[['Text', 'Text_stemmed', 'Text_lemmatized']].head())

                                                Text  \
0  bought several vitality canned dog food produc...   
1  product arrived labeled jumbo salted peanutsth...   
2  confection around centuries light pillowy citr...   
3  looking secret ingredient robitussin believe f...   
4  great taffy great price wide assortment yummy ...   

                                        Text_stemmed  \
0  bought sever vital can dog food product found ...   
1  product arriv label jumbo salt peanutsth peanu...   
2  confect around centuri light pillowi citru gel...   
3  look secret ingredi robitussin believ found go...   
4  great taffi great price wide assort yummi taff...   

                                     Text_lemmatized  
0  bought several vitality canned dog food produc...  
1  product arrived labeled jumbo salted peanutsth...  
2  confection around century light pillowy citrus...  
3  looking secret ingredient robitussin believe f...  
4  great taffy great price wide assortment yummy ..

In [9]:
df[['Text', 'Text_stemmed', 'Text_lemmatized']].to_csv('processed_reviews.csv', index=False)

### Other way

In [10]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 

# Download necessary NLTK resources (do this if you haven't before)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt') 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abdulhakiim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/abdulhakiim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abdulhakiim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
data = pd.read_csv("Reviews.csv")
# Focus on the 'Text' column
text_data = data['Text']

In [12]:
def preprocess_text(text):
    # 1. Remove Duplicates
    text = text.drop_duplicates()

    # 2. Remove Punctuations and Special Characters
    text = text.apply(lambda x: re.sub(r'[^\w\s]', '', x)) 

    # 3. Remove Digits
    text = text.apply(lambda x: re.sub(r'\d+', '', x)) 

    # 4. Convert to Lowercase
    text = text.apply(lambda x: x.lower())

    # 5. Remove Stop Words
    stop_words = set(stopwords.words('english'))
    text = text.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

    # 6. Perform Stemming 
    stemmer = PorterStemmer()
    text = text.apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

    # 7. Perform Lemmatization
    lemmatizer = WordNetLemmatizer()
    text = text.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

    return text

# Apply preprocessing
processed_text = preprocess_text(text_data.copy()) 

In [13]:
# To print a sample of the first few reviews:
for review in processed_text.head():
    print(review)

# To print all reviews:
for review in processed_text:
    print(review)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



nairn ginger oatcak simpli best around couldnt stop eat packet friend respons fabul healthi snack produc reliabl firm doesnt need addit toast plain oatcak benefit
wheatfre diet love nairn oatcak plain one tasti even better chees spread also remind scotland favorit foreign countri visit ginger oatcak die like ginger andor ginger cooki arent sweet addict eat also found wonder substitut graham cracker want make cracker crust pumpkin pie ginger cracker realli goe well pumpkin even friend famili arent wheatfre love pie crust made
buy time great meal snack keep calori count
absolut delici salt want good price box deliv right door happi purchas qualiti good fresh good great food product order case diet
buy dog help control weight salt free howev like peanut butter complaint never know go arriv broken bit last order seem much better last
far best rice cake tri realli crunchi dont styrofoami qualiti cheap rice cake also tast great salti sweet actual tast rice complaint crumbl lot care drop good

In [14]:
# Create a new DataFrame to hold the processed text
processed_data = pd.DataFrame({'Cleaned_Text': processed_text})

# Save the DataFrame to a CSV file
processed_data.to_csv('cleaned_reviews.csv', index=False) 