In [2]:
import pandas as pd
import re

# Load data
train_data = pd.read_json('train.json')
test_data = pd.read_json('test.json')

# Inspect data
print("Training Data Sample:")
print(train_data.head())
print("\nTesting Data Sample:")
print(test_data.head())

# Check data structure
print("\nTraining Data Info:")
print(train_data.info())
print("\nTesting Data Info:")
print(test_data.info())

# Access reviews and sentiments
reviews = train_data['reviews']
sentiments = train_data['sentiments']
test_reviews = test_data['reviews']

Training Data Sample:
                                             reviews  sentiments
0  I bought this belt for my daughter in-law for ...           1
1  The size was perfect and so was the color.  It...           1
2  Fits and feels good, esp. for doing a swim rac...           1
3  These socks are absolutely the best. I take pi...           1
4  Thank you so much for the speedy delivery they...           1

Testing Data Sample:
                                             reviews
0  I bought 2 sleepers.  sleeper had holes in the...
1  I dare say these are just about the sexiest th...
2  everything about the transaction (price, deliv...
3  Not bad for just a shirt.  Very durable, and m...
4  These are truly wrinkle free and longer than t...

Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7401 entries, 0 to 7400
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reviews     7401 non-null   object
 

In [3]:
def clean_text(text):
    text = text.lower()  # convert to lowercase
    text = re.sub(r'<[^>]+>', '', text)  # remove HTML tags
    text = re.sub(r'[^a-z\s]', '', text)  # remove non-alphabet characters
    return text

train_data['cleaned_reviews'] = train_data['reviews'].apply(clean_text)
test_data['cleaned_reviews'] = test_data['reviews'].apply(clean_text)
print(train_data[['reviews', 'cleaned_reviews']].head())
print(test_data[['reviews', 'cleaned_reviews']].head())

                                             reviews  \
0  I bought this belt for my daughter in-law for ...   
1  The size was perfect and so was the color.  It...   
2  Fits and feels good, esp. for doing a swim rac...   
3  These socks are absolutely the best. I take pi...   
4  Thank you so much for the speedy delivery they...   

                                     cleaned_reviews  
0  i bought this belt for my daughter inlaw for c...  
1  the size was perfect and so was the color  it ...  
2  fits and feels good esp for doing a swim race ...  
3  these socks are absolutely the best i take pil...  
4  thank you so much for the speedy delivery they...  
                                             reviews  \
0  I bought 2 sleepers.  sleeper had holes in the...   
1  I dare say these are just about the sexiest th...   
2  everything about the transaction (price, deliv...   
3  Not bad for just a shirt.  Very durable, and m...   
4  These are truly wrinkle free and longer than t... 

In [4]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')  # Download tokenizer

train_data['tokenized_reviews'] = train_data['cleaned_reviews'].apply(word_tokenize)
test_data['tokenized_reviews'] = test_data['cleaned_reviews'].apply(word_tokenize)
print(train_data[['tokenized_reviews', 'cleaned_reviews']].head())
print(test_data[['tokenized_reviews', 'cleaned_reviews']].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\darre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                   tokenized_reviews  \
0  [i, bought, this, belt, for, my, daughter, inl...   
1  [the, size, was, perfect, and, so, was, the, c...   
2  [fits, and, feels, good, esp, for, doing, a, s...   
3  [these, socks, are, absolutely, the, best, i, ...   
4  [thank, you, so, much, for, the, speedy, deliv...   

                                     cleaned_reviews  
0  i bought this belt for my daughter inlaw for c...  
1  the size was perfect and so was the color  it ...  
2  fits and feels good esp for doing a swim race ...  
3  these socks are absolutely the best i take pil...  
4  thank you so much for the speedy delivery they...  
                                   tokenized_reviews  \
0  [i, bought, sleepers, sleeper, had, holes, in,...   
1  [i, dare, say, these, are, just, about, the, s...   
2  [everything, about, the, transaction, price, d...   
3  [not, bad, for, just, a, shirt, very, durable,...   
4  [these, are, truly, wrinkle, free, and, longer... 

In [5]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

train_data['tokenized_reviews'] = train_data['tokenized_reviews'].apply(lambda tokens: [t for t in tokens if t not in stop_words])
test_data['tokenized_reviews'] = test_data['tokenized_reviews'].apply(lambda tokens: [t for t in tokens if t not in stop_words])
print(train_data[['tokenized_reviews', 'cleaned_reviews']].head())
print(test_data[['tokenized_reviews', 'cleaned_reviews']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\darre\AppData\Roaming\nltk_data...


                                   tokenized_reviews  \
0  [bought, belt, daughter, inlaw, christmas, loved]   
1    [size, perfect, color, looked, like, web, page]   
2  [fits, feels, good, esp, swim, race, highly, r...   
3  [socks, absolutely, best, take, pilates, class...   
4  [thank, much, speedy, delivery, came, time, re...   

                                     cleaned_reviews  
0  i bought this belt for my daughter inlaw for c...  
1  the size was perfect and so was the color  it ...  
2  fits and feels good esp for doing a swim race ...  
3  these socks are absolutely the best i take pil...  
4  thank you so much for the speedy delivery they...  
                                   tokenized_reviews  \
0  [bought, sleepers, sleeper, holes, arm, pit, a...   
1  [dare, say, sexiest, things, ive, ever, worn, ...   
2  [everything, transaction, price, delivery, tim...   
3  [bad, shirt, durable, matched, teams, colors, ...   
4  [truly, wrinkle, free, longer, average, womans... 

[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download required NLTK resources (if you haven't already)
nltk.download('punkt')
nltk.download('wordnet')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to lemmatize text
def lemmatize_text(text):
    # Lemmatize each word and join them back into a single string
    lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
    return ' '.join(lemmatized_words)

# Apply the lemmatization function to the 'reviews' column
train_data['lemmatized_reviews'] = train_data['tokenized_reviews'].apply(lemmatize_text)
test_data['lemmatized_reviews'] = test_data['tokenized_reviews'].apply(lemmatize_text)

# Check the results
print(train_data[['tokenized_reviews', 'lemmatized_reviews']])
print(test_data[['tokenized_reviews', 'lemmatized_reviews']])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\darre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\darre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                      tokenized_reviews  \
0     [bought, belt, daughter, inlaw, christmas, loved]   
1       [size, perfect, color, looked, like, web, page]   
2     [fits, feels, good, esp, swim, race, highly, r...   
3     [socks, absolutely, best, take, pilates, class...   
4     [thank, much, speedy, delivery, came, time, re...   
...                                                 ...   
7396  [bought, shirts, black, medium, wear, daily, b...   
7397  [first, thought, scarf, might, good, quality, ...   
7398  [picky, comes, bras, want, something, support,...   
7399  [jacket, wind, water, resistant, waterproof, s...   
7400  [extremely, confortable, material, soft, cotto...   

                                     lemmatized_reviews  
0            bought belt daughter inlaw christmas loved  
1               size perfect color looked like web page  
2     fit feel good esp swim race highly recommend c...  
3     sock absolutely best take pilate class hot foo...  
4

In [1]:
# use train_data['lemmatized_reviews'] and test_data['lemmatized_reviews'] for model training & prediction