#### Import libraries

In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
file_path = '../data/Dataset-SA.csv'
# Load the dataset. If the CSV is missing, report it and re-raise: every later
# cell needs `df`, so carrying on would only fail further down with a
# confusing NameError instead of the real cause.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file at {file_path} was not found.")
    raise
else:
    # Only reached when the read succeeded, so `df` is guaranteed to exist.
    print("Dataset loaded successfully.")
    print(f"Initail shape of the dataset: {df.shape}")
    print("\nInitial Columns:")
    print(df.columns)

Dataset loaded successfully.
Initail shape of the dataset: (205052, 6)

Initial Columns:
Index(['product_name', 'product_price', 'Rate', 'Review', 'Summary',
       'Sentiment'],
      dtype='object')


#### Rename Columns and Handle Missing Data

In [3]:
# Give the review text and the target column more descriptive names.
column_renames = {'Review': 'review_text', 'Sentiment': 'Sentiment_label'}
df = df.rename(columns=column_renames)

# Count the nulls per column before any rows are removed.
missing_per_column = df.isnull().sum()
print("\nMissing values before cleaning:")
print(missing_per_column)


Missing values before cleaning:
product_name           0
product_price          0
Rate                   0
review_text        24664
Summary               11
Sentiment_label        0
dtype: int64


In [4]:
# Drop rows missing either text field. Rebinding `df` instead of passing
# inplace=True follows the pandas-recommended style and keeps the step
# chainable; the resulting rows are the same.
df = df.dropna(subset=['review_text', 'Summary'])

In [6]:
# Verify the drop: report the remaining shape and re-count nulls per column.
remaining_nulls = df.isnull().sum()
print(f"\nShape after dropping NaNs: {df.shape}")
print("\nFinal missing values check:")
print(remaining_nulls)


Shape after dropping NaNs: (180379, 6)

Final missing values check:
product_name       0
product_price      0
Rate               0
review_text        0
Summary            0
Sentiment_label    0
dtype: int64


In [7]:
# Peek at the first rows of the two free-text columns.
text_columns = ['review_text', 'Summary']
print(df[text_columns].head())

       review_text                                            Summary
0           super!  great cooler excellent air flow and for this p...
1          awesome              best budget 2 fit cooler nice cooling
2             fair  the quality is good but the power of air is de...
3  useless product                  very bad product its a only a fan
4             fair                                      ok ok product


In [8]:
# Count how many reviews carry each sentiment label (the target variable).
label_counts = df['Sentiment_label'].value_counts()
print("\nSentiment label distribution(target variable):")
print(label_counts)



Sentiment label distribution(target variable):
Sentiment_label
positive    147171
negative     24401
neutral       8807
Name: count, dtype: int64


In [11]:
# Express each label count as a percentage of all labelled reviews.
total_count = label_counts.sum()
label_percentages = label_counts.div(total_count).mul(100)
print("\nDistribution Percentage:")
print(label_percentages)


Distribution Percentage:
Sentiment_label
positive    81.589875
negative    13.527628
neutral      4.882497
Name: count, dtype: float64


#### Phase 2 - Text Preprocessing

In [None]:
import re
import nltk
from nltk.corpus import stopwords

In [16]:
# nltk.download() returns True on success and False on failure, so only
# report success when the download actually succeeded.
if nltk.download('stopwords'):
    print("NLTK 'stopwords' downloaded successfully.")
else:
    # Not fatal here: the corpus may already be installed locally.
    print("Warning: NLTK 'stopwords' download failed; "
          "a previously installed copy is required to continue.")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aryan\AppData\Roaming\nltk_data...


NLTK 'stopwords' downloaded successfully.


[nltk_data]   Unzipping corpora\stopwords.zip.


In [17]:
# Make sure the stopwords corpus is available before loading it.
# nltk.data.find() signals a missing resource by raising LookupError, so that
# is the exception that must trigger the download.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Stored as a set so the per-word membership test in clean_text is fast.
STOPWORDS = set(stopwords.words('english'))

In [20]:
def clean_text(text, stop_words=None):
    """Normalize a single review string for vectorization.

    The text is lower-cased, every character other than a-z or whitespace is
    replaced by a space, and stop words are removed. The remaining words are
    joined with single spaces.

    Args:
        text: The raw review. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing.
        stop_words: Optional collection of words to remove. When omitted, the
            module-level ``STOPWORDS`` set is used.

    Returns:
        The cleaned text, or an empty string when the input is missing or
        nothing survives cleaning.
    """
    # Without this guard a missing value would become the literal token "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Compare against None explicitly so an empty collection is honoured.
    if stop_words is None:
        stop_words = STOPWORDS
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-z\s]', ' ', lowered)
    # split() + join already collapses runs of whitespace and trims the ends.
    return ' '.join(word for word in letters_only.split() if word not in stop_words)


In [23]:
# Build the cleaned column from the raw review text.
df['cleaned_text'] = df['review_text'].apply(clean_text)

# Compare raw and cleaned text at both ends of the frame.
comparison = df[['review_text', 'cleaned_text']]
print("\nRaw Text vs. Cleaned Text:")
print(comparison.head())
print("----------------------")
print(comparison.tail())


Raw Text vs. Cleaned Text:
       review_text     cleaned_text
0           super!            super
1          awesome          awesome
2             fair             fair
3  useless product  useless product
4             fair             fair
----------------------
            review_text cleaned_text
205047        must buy!     must buy
205048           super!        super
205049             nice         nice
205050        just wow!          wow
205051  value-for-money  value money


#### Phase 3: Feature Engineering

##### Define X (features) and y (target)

In [24]:
# Features are the cleaned review text; the target is the sentiment label.
X, y = df['cleaned_text'], df['Sentiment_label']

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratify on the labels, and fix the
# seed so the split is repeatable.
split_options = dict(test_size=0.2, random_state=44, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [26]:
# Report the size of each feature split.
for split_name, split_data in (("X_train", X_train), ("X_test", X_test)):
    print(f"{split_name} shape: {split_data.shape}")

X_train shape: (144303,)
X_test shape: (36076,)


##### TF-IDF Vectorization

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, with the vocabulary limited through
# max_features=10000.
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
# Fit on the training text only; the test text is transformed with the
# already-fitted vocabulary.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)
# Backward-compatible alias: other cells in this notebook still refer to the
# original, misspelled variable name.
X_train_vectrorized = X_train_vectorized

#### Exporting and saving variables with joblib

In [28]:
import joblib
from scipy.sparse import save_npz

In [29]:
# --- Saving the labels and the fitted vectorizer ---
import os

# joblib.dump does not create missing directories, so make sure the output
# folder exists before writing into it.
os.makedirs('../model', exist_ok=True)
joblib.dump(y_train, '../model/y_train.joblib')
joblib.dump(y_test, '../model/y_test.joblib')
joblib.dump(vectorizer, '../model/vectorizer.joblib')


['../model/vectorizer.joblib']

In [30]:
# Persist both sparse TF-IDF matrices in SciPy's .npz format.
matrices_to_save = (
    ('../model/X_train_vectorized.npz', X_train_vectrorized),
    ('../model/X_test_vectorized.npz', X_test_vectorized),
)
for npz_path, sparse_matrix in matrices_to_save:
    save_npz(npz_path, sparse_matrix)