In [7]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [8]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/becca/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/becca/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/becca/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels .api as sm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
df_reviews_amazon = pd.read_csv('Reviews.csv')

In [11]:
df_reviews_amazon.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [12]:
print(df_reviews_amazon.shape)

(568454, 10)


## Pre processing 

### Text Cleaning

#### Remove stop words, punctuation, and convert text to lowercase.

In [16]:
df_text = df_reviews_amazon[['Text']]
df_text.head()

Unnamed: 0,Text
0,I have bought several of the Vitality canned d...
1,Product arrived labeled as Jumbo Salted Peanut...
2,This is a confection that has been around a fe...
3,If you are looking for the secret ingredient i...
4,Great taffy at a great price. There was a wid...


In [17]:
# Define stop words and punctuation
stop_words = set(stopwords.words('english'))
punctuation = string.punctuation

# Function to clean text
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', punctuation))
    # Remove stop words
    text = " ".join(word for word in text.split() if word not in stop_words)
    return text

# Apply cleaning function to the dataset
df_reviews_amazon['cleaned_text'] = df_reviews_amazon['Text'].apply(clean_text)

df_reviews_amazon.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,cleaned_text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanutsth...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,confection around centuries light pillowy citr...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...


#### Tokenization and lemmatization to standardize words

In [None]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to tokenize and lemmatize
def tokenize_and_lemmatize(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())  # Convert to lowercase before processing
    # Lemmatize each token
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Join back into a string
    return " ".join(lemmatized_tokens)

# Apply the function to the Text column
df_reviews_amazon['cleaned_text'] = df_reviews_amazon['Text'].apply(tokenize_and_lemmatize)
df_reviews_amazon.head()

### Feature Extraction

#### Use TF-IDF (Term Frequency-Inverse Document Frequency) to convert text into numerical features.
#### Alternatively, use pre-trained embeddings like Word2Vec, GloVe, or BERT embeddings for richer representations.

TF-IDF is a statistical measure to evaluate the importance of a word in a document relative to a collection of documents.

In [None]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text column
tfidf_matrix = tfidf_vectorizer.fit_transform(df_reviews_amazon['cleaned_text'])

# Convert to DataFrame for better visualization
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Display the TF-IDF feature DataFrame
print(tfidf_df)