In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


# Download the NLTK tokenizer, stop-word, and WordNet data, in the same order
# as before.
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)



# Load the raw reviews. A failed load is reported and then re-raised: the
# statements below (and every later cell) need `df`, so swallowing the error
# would only surface later as an unrelated NameError.
try:
    df = pd.read_csv('amazon_reviews.csv', engine='python')
except FileNotFoundError:
    print("Error: 'amazon_reviews.csv' not found. Please make sure the file is in the correct directory.")
    raise
except Exception as e:
    print(f"An error occurred while reading the CSV file: {e}")
    raise
else:
    # Only reached when the read succeeded.
    print("Dataset loaded successfully!")
    print(f"Number of records: {len(df)}")


# Preview the first rows of the loaded frame.
print(df.head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Dataset loaded successfully!
Number of records: 21214
      Reviewer Name                     Profile Link Country Review Count  \
0        Eugene ath  /users/66e8185ff1598352d6b3701a      US     1 review   
1  Daniel ohalloran  /users/5d75e460200c1f6a6373648c      GB    9 reviews   
2          p fisher  /users/546cfcf1000064000197b88f      GB   90 reviews   
3         Greg Dunn  /users/62c35cdbacc0ea0012ccaffa      AU    5 reviews   
4     Sheila Hannah  /users/5ddbe429478d88251550610e      GB    8 reviews   

                Review Date                  Rating  \
0  2024-09-16T13:44:26.000Z  Rated 1 out of 5 stars   
1  2024-09-16T18:26:46.000Z  Rated 1 out of 5 stars   
2  2024-09-16T21:47:39.000Z  Rated 1 out of 5 stars   
3  2024-09-17T07:15:49.000Z  Rated 1 out of 5 stars   
4  2024-09-16T18:37:17.000Z  Rated 1 out of 5 stars   

                                      Review Title  \
0       A Store That Doesn't Want to Sell Anything   
1           Had multiple orders one turned u

In [3]:
# Remove rows with no review text, then rows whose review text repeats.
# Both steps mutate `df` in place, so the same object is reused by later cells.
text_column = ['Review Text']
df.dropna(subset=text_column, inplace=True)
df.drop_duplicates(subset=text_column, inplace=True)

print(f"Shape of dataframe after cleaning missing values and duplicates: {df.shape}")

Shape of dataframe after cleaning missing values and duplicates: (20407, 9)


In [7]:
# English stop words held in a set so the per-token membership test in
# preprocess_text is a hash lookup.
stop_words = {*stopwords.words('english')}

# Single lemmatizer instance shared by every preprocess_text call.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize one review into a space-joined string of lemmatized tokens.

    Steps: remove every character that is not a letter a-z/A-Z or whitespace,
    lowercase, tokenize with ``word_tokenize``, discard tokens found in
    ``stop_words``, and lemmatize the remaining tokens with ``lemmatizer``.

    Args:
        text: Raw review text. Any value that is not a ``str`` (``None``, a
            float NaN, ...) is treated as missing.

    Returns:
        The cleaned text, or ``""`` when the input is missing or contains no
        letters.
    """
    if not isinstance(text, str):
        return ""

    # The fourth positional argument of re.sub is `count`, not `flags`; passing
    # flag constants there capped the number of characters removed. No flags
    # are needed: the character class already lists both letter cases, and
    # ASCII-only `\s` would delete non-ASCII whitespace and fuse adjacent words.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Nothing left to tokenize (empty, or only whitespace remains).
    if not text.strip():
        return ""

    text = text.lower()

    tokens = word_tokenize(text)

    # NOTE(review): apostrophes were removed above, so contractions such as
    # "don't" arrive here as "dont" and may no longer match entries in
    # stop_words — confirm this is acceptable.
    cleaned_tokens = [
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    ]

    return " ".join(cleaned_tokens)


print("Starting text preprocessing...")
# Run each review through preprocess_text and store the result next to the raw text.
df['cleaned_review_text'] = df['Review Text'].map(preprocess_text)
print("Preprocessing complete!")

# Preview raw and cleaned text side by side for the first rows.
preview_columns = ['Review Text', 'cleaned_review_text']
print(df[preview_columns].head())

Starting text preprocessing...
Preprocessing complete!
                                         Review Text  \
0  I registered on the website, tried to order a ...   
1  Had multiple orders one turned up and driver h...   
2  I informed these reprobates that I WOULD NOT B...   
3  I have bought from Amazon before and no proble...   
4  If I could give a lower rate I would! I cancel...   

                                 cleaned_review_text  
0  registered website tried order laptop entered ...  
1  multiple order one turned driver phone door nu...  
2  informed reprobate would going visit sick rela...  
3  bought amazon problem happy service price amaz...  
4  could give lower rate would cancelled amazon p...  


In [8]:
# Keep only the rating plus the raw and cleaned review text for export.
export_columns = ['Rating', 'Review Text', 'cleaned_review_text']
final_df = df[export_columns]

# index=False leaves the DataFrame index out of the written file.
final_df.to_csv('cleaned_amazon_reviews.csv', index=False)

print("Cleaned dataset saved as 'cleaned_amazon_reviews.csv'")

Cleaned dataset saved as 'cleaned_amazon_reviews.csv'
