In [36]:
import pandas as pd
import nltk
nltk.download("vader_lexicon")
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/arca/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## DataFrame Preparation

In [37]:
# This file already has the "modcloth_final_data" processed. It is possible to input your own custom databases for sentiment analysis as well.

# Insert your dataframe (path) here:
dataframe = 'data/modcloth_final_data.json'

# Insert your text column here
textcolumn = "review_text"

In [38]:
mc_data = pd.read_json(dataframe, lines=True)
mc_data

Unnamed: 0,item_id,waist,size,quality,cup size,hips,bra size,category,bust,height,user_name,length,fit,user_id,shoe size,shoe width,review_summary,review_text
0,123373,29.0,7,5.0,d,38.0,34.0,new,36,5ft 6in,Emily,just right,small,991571,,,,
1,123373,31.0,13,3.0,b,30.0,36.0,new,,5ft 2in,sydneybraden2001,just right,small,587883,,,,
2,123373,30.0,7,2.0,b,,32.0,new,,5ft 7in,Ugggh,slightly long,small,395665,9.0,,,
3,123373,,21,5.0,dd/e,,,new,,,alexmeyer626,just right,fit,875643,,,,
4,123373,,18,5.0,b,,36.0,new,,5ft 2in,dberrones1,slightly long,small,944840,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82785,807722,,8,4.0,b,,36.0,outerwear,,5ft 8in,Jennifer,just right,fit,727820,8.5,average,Cute jacket!,Cute jacket!
82786,807722,,12,5.0,ddd/f,,34.0,outerwear,,5ft 5in,Kelli,slightly long,small,197040,,,It's a beautiful jacket.,It's a beautiful jacket. I love how it's knit ...
82787,807722,,12,5.0,dddd/g,36.0,32.0,outerwear,,5ft 4in,elacount,just right,fit,102493,,,I love this blazer. It is,I love this blazer. It is a great office piece...
82788,807722,,12,4.0,,,,outerwear,,5ft 3in,jennaklinner,just right,fit,756491,,,I love this blazer!! I wo,I love this blazer!! I wore it yesterday and g...


In [39]:
mc_data.reset_index(inplace=True)
mc_data

Unnamed: 0,index,item_id,waist,size,quality,cup size,hips,bra size,category,bust,height,user_name,length,fit,user_id,shoe size,shoe width,review_summary,review_text
0,0,123373,29.0,7,5.0,d,38.0,34.0,new,36,5ft 6in,Emily,just right,small,991571,,,,
1,1,123373,31.0,13,3.0,b,30.0,36.0,new,,5ft 2in,sydneybraden2001,just right,small,587883,,,,
2,2,123373,30.0,7,2.0,b,,32.0,new,,5ft 7in,Ugggh,slightly long,small,395665,9.0,,,
3,3,123373,,21,5.0,dd/e,,,new,,,alexmeyer626,just right,fit,875643,,,,
4,4,123373,,18,5.0,b,,36.0,new,,5ft 2in,dberrones1,slightly long,small,944840,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82785,82785,807722,,8,4.0,b,,36.0,outerwear,,5ft 8in,Jennifer,just right,fit,727820,8.5,average,Cute jacket!,Cute jacket!
82786,82786,807722,,12,5.0,ddd/f,,34.0,outerwear,,5ft 5in,Kelli,slightly long,small,197040,,,It's a beautiful jacket.,It's a beautiful jacket. I love how it's knit ...
82787,82787,807722,,12,5.0,dddd/g,36.0,32.0,outerwear,,5ft 4in,elacount,just right,fit,102493,,,I love this blazer. It is,I love this blazer. It is a great office piece...
82788,82788,807722,,12,4.0,,,,outerwear,,5ft 3in,jennaklinner,just right,fit,756491,,,I love this blazer!! I wo,I love this blazer!! I wore it yesterday and g...


In [40]:
#for demonstration purposes, the if-clause catches both "review_summary" and "review_text" of the dataframe, as review_sumamry is either a summary of the first 100 words of review_text.

if "review_summary" in mc_data.columns: 
    review_only = mc_data[["index","review_summary", textcolumn]]
else:
    review_only = mc_data[["index", textcolumn]]

review_only

Unnamed: 0,index,review_summary,review_text
0,0,,
1,1,,
2,2,,
3,3,,
4,4,,
...,...,...,...
82785,82785,Cute jacket!,Cute jacket!
82786,82786,It's a beautiful jacket.,It's a beautiful jacket. I love how it's knit ...
82787,82787,I love this blazer. It is,I love this blazer. It is a great office piece...
82788,82788,I love this blazer!! I wo,I love this blazer!! I wore it yesterday and g...


In [41]:
review_text_only = mc_data[["index",textcolumn]]
review_text_cleaned = review_text_only.dropna()
review_text_cleaned

Unnamed: 0,index,review_text
6725,6725,"I liked the color, the silhouette, and the fab..."
6726,6726,From the other reviews it seems like this dres...
6727,6727,I love the design and fit of this dress! I wo...
6728,6728,I bought this dress for work it is flattering...
6729,6729,This is a very professional look. It is Great ...
...,...,...
82785,82785,Cute jacket!
82786,82786,It's a beautiful jacket. I love how it's knit ...
82787,82787,I love this blazer. It is a great office piece...
82788,82788,I love this blazer!! I wore it yesterday and g...


In [42]:
empty_review_counts = review_only.isna().sum()
empty_review_counts

index                0
review_summary    6725
review_text       6725
dtype: int64

In [43]:
cleaned_review_only = review_only.dropna()
cleaned_review_only

Unnamed: 0,index,review_summary,review_text
6725,6725,Too much ruching,"I liked the color, the silhouette, and the fab..."
6726,6726,Suits my body type!,From the other reviews it seems like this dres...
6727,6727,I love the design and fit,I love the design and fit of this dress! I wo...
6728,6728,Beautiful Dress!,I bought this dress for work it is flattering...
6729,6729,This is a very profession,This is a very professional look. It is Great ...
...,...,...,...
82785,82785,Cute jacket!,Cute jacket!
82786,82786,It's a beautiful jacket.,It's a beautiful jacket. I love how it's knit ...
82787,82787,I love this blazer. It is,I love this blazer. It is a great office piece...
82788,82788,I love this blazer!! I wo,I love this blazer!! I wore it yesterday and g...


## Sentiment Analysis with VADER

In [44]:
sid = SentimentIntensityAnalyzer()

# Function to classify sentiment using VADER for 'review_summary'
# Note: it is normal if this block fails as 'review_summary' is not expected in all dataframes, and is only here to be able to compare review_summary and review_text

def classify_sentiment(review):
    scores = sid.polarity_scores(review)
    if scores['compound'] >= 0:
        return 'positive'
    elif scores['compound'] <= -0:
        return 'negative'

# Apply sentiment analysis to each review
cleaned_review_only['summary_sentiment'] = cleaned_review_only['review_summary'].apply(classify_sentiment)


print(cleaned_review_only[['review_summary', 'summary_sentiment']])

                  review_summary summary_sentiment
6725            Too much ruching          positive
6726         Suits my body type!          positive
6727   I love the design and fit          positive
6728            Beautiful Dress!          positive
6729   This is a very profession          positive
...                          ...               ...
82785               Cute jacket!          positive
82786   It's a beautiful jacket.          positive
82787  I love this blazer. It is          positive
82788  I love this blazer!! I wo          positive
82789  I love this piece. I'm re          positive

[76065 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_review_only['summary_sentiment'] = cleaned_review_only['review_summary'].apply(classify_sentiment)


In [45]:
# Apply sentiment analysis to the set textcolumn variable (by default it is review_text)
cleaned_review_only['predicted_vader_label'] = cleaned_review_only[textcolumn].apply(classify_sentiment)

print(cleaned_review_only[[textcolumn, 'predicted_vader_label']])

                                             review_text predicted_vader_label
6725   I liked the color, the silhouette, and the fab...              negative
6726   From the other reviews it seems like this dres...              positive
6727   I love the design and fit of this dress!  I wo...              negative
6728   I bought this dress for work  it is flattering...              positive
6729   This is a very professional look. It is Great ...              positive
...                                                  ...                   ...
82785                                       Cute jacket!              positive
82786  It's a beautiful jacket. I love how it's knit ...              positive
82787  I love this blazer. It is a great office piece...              positive
82788  I love this blazer!! I wore it yesterday and g...              positive
82789       I love this piece. I'm really happy with it!              positive

[76065 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_review_only['predicted_vader_label'] = cleaned_review_only[textcolumn].apply(classify_sentiment)


### Comparing review_summary and review_text from the Modcloth dataset

In [46]:
# Initialize counts
same = 0
different = 0

# Iterate over each row in the DataFrame and add counts appropiately
for index, row in cleaned_review_only.iterrows():
    if row['summary_sentiment'] == row['predicted_vader_label']:
        same += 1
    else:
        different += 1

# Print the counts
print("Number of rows where the values of summary_sentiment and text_sentiment are the same:", same)
print("Number of rows where the values of summary_sentiment and text_sentiment are different:", different)

Number of rows where the values of summary_sentiment and text_sentiment are the same: 70050
Number of rows where the values of summary_sentiment and text_sentiment are different: 6015


In [47]:
# Initialize counts dictionary
combination_dictionary = {}

# Iterate over each row in the DataFrame, get combination, and add to counts appropiately
for index, row in cleaned_review_only.iterrows():
    combination = (row['summary_sentiment'], row['predicted_vader_label'])
    
    if combination in combination_dictionary:
        combination_dictionary[combination] += 1
    else:
        combination_dictionary[combination] = 1


# Print result, first variable is from summary_sentiment, second variable is from text_sentiment
for combination, count in combination_dictionary.items():
    print("Combination:", combination, "count:", count)

Combination: ('positive', 'negative') count: 3699
Combination: ('positive', 'positive') count: 68746
Combination: ('negative', 'positive') count: 2316
Combination: ('negative', 'negative') count: 1304


## Sentiment Analysis with Latent Dirichlet Analysis (LDA)

In [48]:
# Convert text data into document-term matrix
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = vectorizer.fit_transform(review_text_cleaned[textcolumn])

# Fit LDA model
lda = LatentDirichletAllocation(n_components=2, random_state=42)
lda.fit(dtm)

# Get the dominant sentiment topic for each document
result = lda.transform(dtm)
review_text_cleaned['predicted_LDA_label'] = ['positive' if x[0] > x[1] else 'negative' for x in result]

print(review_text_cleaned)


       index                                        review_text  \
6725    6725  I liked the color, the silhouette, and the fab...   
6726    6726  From the other reviews it seems like this dres...   
6727    6727  I love the design and fit of this dress!  I wo...   
6728    6728  I bought this dress for work  it is flattering...   
6729    6729  This is a very professional look. It is Great ...   
...      ...                                                ...   
82785  82785                                       Cute jacket!   
82786  82786  It's a beautiful jacket. I love how it's knit ...   
82787  82787  I love this blazer. It is a great office piece...   
82788  82788  I love this blazer!! I wore it yesterday and g...   
82789  82789       I love this piece. I'm really happy with it!   

      predicted_LDA_label  
6725             negative  
6726             negative  
6727             positive  
6728             positive  
6729             positive  
...                   ...  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_text_cleaned['predicted_LDA_label'] = ['positive' if x[0] > x[1] else 'negative' for x in result]


## Sentiment Analysis with K-means

In [49]:
# Convert text data into TF-IDF vectors
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(review_text_cleaned[textcolumn])

# Fit K-means clustering
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(tfidf_matrix)

# Assign predicted labels based on cluster centroids
review_text_cleaned['predicted_Kmeans_label'] = ['positive' if label == 0 else 'negative' for label in kmeans.labels_]

# Display the DataFrame with predicted labels
print(review_text_cleaned)


       index                                        review_text  \
6725    6725  I liked the color, the silhouette, and the fab...   
6726    6726  From the other reviews it seems like this dres...   
6727    6727  I love the design and fit of this dress!  I wo...   
6728    6728  I bought this dress for work  it is flattering...   
6729    6729  This is a very professional look. It is Great ...   
...      ...                                                ...   
82785  82785                                       Cute jacket!   
82786  82786  It's a beautiful jacket. I love how it's knit ...   
82787  82787  I love this blazer. It is a great office piece...   
82788  82788  I love this blazer!! I wore it yesterday and g...   
82789  82789       I love this piece. I'm really happy with it!   

      predicted_LDA_label predicted_Kmeans_label  
6725             negative               positive  
6726             negative               positive  
6727             positive               po

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_text_cleaned['predicted_Kmeans_label'] = ['positive' if label == 0 else 'negative' for label in kmeans.labels_]


### Sentiment Analysis with a modified K-means model, and comparing their results

In [50]:
## K-means tuned

# Define K-means with adjusted parameters
kmeans = KMeans(n_clusters=2, 
                init='k-means++',    # Initialization method
                max_iter=300,        # Maximum number of iterations
                random_state=42)     # Random seed for reproducibility

# Fit K-means clustering
kmeans.fit(tfidf_matrix)

# Assign predicted labels based on cluster centroids
review_text_cleaned['predicted_Kmeans_tuned_label'] = ['positive' if label == 0 else 'negative' for label in kmeans.labels_]

print(review_text_cleaned)

       index                                        review_text  \
6725    6725  I liked the color, the silhouette, and the fab...   
6726    6726  From the other reviews it seems like this dres...   
6727    6727  I love the design and fit of this dress!  I wo...   
6728    6728  I bought this dress for work  it is flattering...   
6729    6729  This is a very professional look. It is Great ...   
...      ...                                                ...   
82785  82785                                       Cute jacket!   
82786  82786  It's a beautiful jacket. I love how it's knit ...   
82787  82787  I love this blazer. It is a great office piece...   
82788  82788  I love this blazer!! I wore it yesterday and g...   
82789  82789       I love this piece. I'm really happy with it!   

      predicted_LDA_label predicted_Kmeans_label predicted_Kmeans_tuned_label  
6725             negative               positive                     positive  
6726             negative          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_text_cleaned['predicted_Kmeans_tuned_label'] = ['positive' if label == 0 else 'negative' for label in kmeans.labels_]


In [51]:
# Comparison of the models

# Count and identify columns with same value
comparison_df = review_text_cleaned[["predicted_Kmeans_label","predicted_Kmeans_tuned_label"]]
# Get value counts for each column
value_counts_label = comparison_df['predicted_Kmeans_label'].value_counts()
value_counts_tuned_label = comparison_df['predicted_Kmeans_tuned_label'].value_counts()

# Check if the value counts are equal
if value_counts_label.equals(value_counts_tuned_label):
    print("Both columns have the same values.")
    print(value_counts_label)
else:
    print("Columns have different values.")

Both columns have the same values.
predicted_Kmeans_label
positive    73353
negative     2712
Name: count, dtype: int64


## Finalizing the dataset for use

In [52]:
merged_df = review_text_cleaned.merge(cleaned_review_only[['index', 'predicted_vader_label']], on='index', how='left')
merged_df

Unnamed: 0,index,review_text,predicted_LDA_label,predicted_Kmeans_label,predicted_Kmeans_tuned_label,predicted_vader_label
0,6725,"I liked the color, the silhouette, and the fab...",negative,positive,positive,negative
1,6726,From the other reviews it seems like this dres...,negative,positive,positive,positive
2,6727,I love the design and fit of this dress! I wo...,positive,positive,positive,negative
3,6728,I bought this dress for work it is flattering...,positive,positive,positive,positive
4,6729,This is a very professional look. It is Great ...,positive,positive,positive,positive
...,...,...,...,...,...,...
76060,82785,Cute jacket!,negative,positive,positive,positive
76061,82786,It's a beautiful jacket. I love how it's knit ...,positive,positive,positive,positive
76062,82787,I love this blazer. It is a great office piece...,positive,positive,positive,positive
76063,82788,I love this blazer!! I wore it yesterday and g...,positive,positive,positive,positive


In [53]:
merged_df.to_csv('predicted_reviews.csv', index=False)