In [None]:
!pip install pandas transformers scikit-learn




In [None]:
import pandas as pd

# Load the raw review data; the path is relative to the notebook's working directory.
df = pd.read_csv('Womens Clothing E-Commerce Reviews.csv')

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly -- wrapping it in print() emits a stray "None" line.
df.info()

# Peek at the first rows to check that the columns were parsed as expected.
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23486 non-null  int64 
 1   Clothing ID              23486 non-null  int64 
 2   Age                      23486 non-null  int64 
 3   Title                    19676 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   23486 non-null  int64 
 6   Recommended IND          23486 non-null  int64 
 7   Positive Feedback Count  23486 non-null  int64 
 8   Division Name            23472 non-null  object
 9   Department Name          23472 non-null  object
 10  Class Name               23472 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB
None
   Unnamed: 0  Clothing ID  Age                    Title  \
0           0          767   33                      NaN   
1           1         1080 

In [None]:

# Rows without review text cannot be cleaned or vectorised, so drop them.
# The explicit .copy() makes the result an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
df = df.dropna(subset=['Review Text']).copy()

# A few rows have no category labels; keep those rows and mark the gap with a
# placeholder instead of leaving NaN in the categorical columns.
category_columns = ['Division Name', 'Department Name', 'Class Name']
df[category_columns] = df[category_columns].fillna('Unknown')

# Remaining null counts per column ('Title' is intentionally left untouched).
print(df.isnull().sum())


Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      2966
Review Text                   0
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                 0
Department Name               0
Class Name                    0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Division Name'] = df['Division Name'].fillna('Unknown')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Department Name'] = df['Department Name'].fillna('Unknown')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Class Name'] = df['Class Name'].fillna('Unknown')


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Fetch the NLTK resources used by the cleaning step: the stop-word list, the
# tokenizer models (punkt / punkt_tab) and the WordNet data for lemmatisation.
for resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    nltk.download(resource)


# Shared helpers, built once and reused by clean_text for every review.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    """Normalise one review into a space-separated string of lemmatised tokens.

    Steps: lowercase, replace every character that is not an ASCII letter or
    whitespace with a space, tokenise, drop English stop words, lemmatise.

    Args:
        text: The raw review text. Non-string values (e.g. NaN for a missing
            review) are treated as empty text.

    Returns:
        The cleaned text as a single string; '' when nothing remains.
    """
    # Missing reviews arrive as float NaN / None; there is nothing to clean.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Replace (rather than delete) punctuation and digits. Deleting them glues
    # contractions together ("i'm" -> "im", "i've" -> "ive"), producing tokens
    # that bypass the stop-word filter; with a space the fragments
    # ("i", "m", "ve", ...) can be matched against the stop-word list.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]
    return ' '.join(tokens)


# Run every review through clean_text and store the result next to the raw text.
df['Cleaned Review'] = df['Review Text'].map(clean_text)


# Show raw and cleaned text side by side for the first rows as a sanity check.
print(df.loc[:, ['Review Text', 'Cleaned Review']].head(5))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


                                         Review Text  \
0  Absolutely wonderful - silky and sexy and comf...   
1  Love this dress!  it's sooo pretty.  i happene...   
2  I had such high hopes for this dress and reall...   
3  I love, love, love this jumpsuit. it's fun, fl...   
4  This shirt is very flattering to all due to th...   

                                      Cleaned Review  
0        absolutely wonderful silky sexy comfortable  
1  love dress sooo pretty happened find store im ...  
2  high hope dress really wanted work initially o...  
3  love love love jumpsuit fun flirty fabulous ev...  
4  shirt flattering due adjustable front tie perf...  


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Binary target derived from the star rating: 4-5 -> 'positive', 1-3 -> 'negative'.
df['Sentiment'] = df['Rating'].apply(lambda x: 'positive' if x >= 4 else 'negative')
y = df['Sentiment']


# Split the cleaned text BEFORE vectorising. Fitting TF-IDF on the full corpus
# would let the test reviews influence the vocabulary and IDF weights, which
# leaks test information into training and inflates the reported scores.
text_train, text_test, y_train, y_test = train_test_split(
    df['Cleaned Review'], y, test_size=0.2, random_state=42
)

# Learn vocabulary / IDF from the training reviews only, then apply the same
# fitted transformation to the held-out reviews.
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Full-corpus matrix, one row per row of df in df's row order. Later cells
# that need a vector for every review read this variable.
X = tfidf.transform(df['Cleaned Review'])


model = LogisticRegression()
model.fit(X_train, y_train)


# Evaluate on reviews the vectoriser and the classifier have never seen.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

    negative       0.83      0.60      0.69      1045
    positive       0.89      0.96      0.92      3484

    accuracy                           0.88      4529
   macro avg       0.86      0.78      0.81      4529
weighted avg       0.87      0.88      0.87      4529



In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows of X (all reviews
# against all reviews). Row/column i refers to the i-th row of X by POSITION,
# not to a DataFrame index label.
# NOTE(review): the result is n x n for n reviews; with roughly 22.6k reviews
# that is presumably several GB as a dense float array -- confirm the kernel
# has enough memory, or compute similarities per query row instead.
cosine_sim = cosine_similarity(X, X)

def recommend_products(clothing_id, num_recommendations=5):
    """Return the reviews most similar to the first review of a product.

    Similarity is read from the precomputed ``cosine_sim`` matrix, whose rows
    and columns are addressed by row position in ``df``.

    Args:
        clothing_id: Value of the 'Clothing ID' column identifying the product.
        num_recommendations: Maximum number of similar reviews to return.

    Returns:
        A DataFrame with the 'Clothing ID' and 'Review Text' columns of the
        most similar reviews, most similar first.

    Raises:
        IndexError: If no row of ``df`` has the given clothing_id.
    """
    # cosine_sim is positional, while df keeps its original index labels (rows
    # were dropped earlier, so labels and positions no longer coincide).
    # Look up the row POSITION of the first matching review, not its label.
    matches = (df['Clothing ID'] == clothing_id).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"No review found for Clothing ID {clothing_id!r}")
    query_pos = int(matches[0])

    ranked = sorted(
        enumerate(cosine_sim[query_pos]), key=lambda pair: pair[1], reverse=True
    )

    # Exclude the query review by position instead of assuming it sorts first
    # (ties, or an all-zero vector, can put another row ahead of it).
    limit = max(num_recommendations, 0)
    neighbours = [pos for pos, _ in ranked if pos != query_pos][:limit]
    return df.iloc[neighbours][['Clothing ID', 'Review Text']]


# Example query: reviews most similar to the first review of product 123.
recommended = recommend_products(clothing_id=123)
print(recommended)


       Clothing ID                                        Review Text
16987         1056  Wish they had more colors in these pants. the ...
15970         1056  I looooove these pants. i was looking for a co...
1922          1066  The retailer store had mostly white left of th...
8171          1066  There wasn't much question as to whether or no...
19767         1062  First thing i would say these pants run large ...


In [None]:

# Score the held-out split again so the report appears next to the demo below.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


# Classify an unseen review with the same pipeline used for training:
# clean_text -> fitted TF-IDF vectoriser -> fitted classifier.
new_review = "I love this dress! The quality is amazing and the fit is perfect."
new_review_cleaned = clean_text(new_review)
new_review_vectorized = tfidf.transform([new_review_cleaned])
# predict() returns one label per input row; exactly one row was passed in.
(predicted_sentiment,) = model.predict(new_review_vectorized)
print(f"Predicted Sentiment for the review: {predicted_sentiment}")


              precision    recall  f1-score   support

    negative       0.83      0.60      0.69      1045
    positive       0.89      0.96      0.92      3484

    accuracy                           0.88      4529
   macro avg       0.86      0.78      0.81      4529
weighted avg       0.87      0.88      0.87      4529

Predicted Sentiment for the review: positive
