In [32]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

In [33]:
def read_data():
    return pd.read_csv("amazon_reviews.tsv", sep="\t", header=0, on_bad_lines='skip', dtype={
        "marketplace": str,
        "customer_id":str,
        "review_id":str,
        "product_id":str,
        "product_parent":str,
        "product_title":str,
        "product_category":str,
        "star_rating":int,
        "helpful_votes":int,
        "total_votes":int,
        "vine":str,
        "verified_purchase":str,
        "review_headline":str,
        "review_body":str,
        "review_date":str
    })

data = read_data()

In [34]:
data

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,42746566,R1YK0WEHRU8FIZ,B00ALHZKS4,619458452,Jawbone Jambox Wireless Bluetooth Speaker - Bl...,Mobile_Electronics,5,0,0,N,Y,Good thing in a small package,I bought this as a gift for my son and he real...,3/9/13
1,US,19138768,RVT9KKPRIJQF6,B004N9446A,934947764,Green Silicone skin sport nano case for Apple ...,Mobile_Electronics,3,6,6,N,N,Good enough,I like this case as it is secure and I can han...,10/14/11
2,US,47124588,R379DZOJJ7AU4B,B00HBIA3LA,231171126,original brand new proscan TV remote control F...,Electronics,1,0,0,N,Y,... it to even though it was advertised to wor...,It showed up on time but the remote did not pr...,8/7/15
3,US,42442638,R1ZVJHCW5R9K1R,B00VZ4LQMQ,499379777,Bluetooth Headphones - Fit Acoustics Wireless ...,Electronics,5,1,1,N,Y,Excellent product excellent customer service,Excellent product excellent customer service. ...,6/10/15
4,US,28964420,R2N5NR7X0Y5GNC,B004HXA9JS,345900866,TsirTech 18 Items Luxury Accessory Bundle for ...,Mobile_Electronics,5,0,0,N,Y,"great bundle, love the texting glove,","Great bundle,everything works,very satisfied w...",1/16/11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99904,US,32577676,RG7205OK478AH,B002LTQFUM,474157700,DBTech 175 Watt Portable Micro Power Inverter ...,Mobile_Electronics,1,4,4,N,Y,NOT GOOD for Older cars. The tube is TOO BIG!,After a few major thunderstorms and electrical...,12/22/11
99905,US,42656096,R6NWGS6WOHDBR,B00J470LTA,95759985,Sony US18650VTC4 18650 VTC4 2100mAh 30A Rechar...,Electronics,1,1,1,N,Y,Waste of money,It doesn't hold a charge for more than an hour...,8/4/15
99906,US,18189611,R1KTD2TORH8TT5,B0057YC1L0,208272544,Pyramid - 4'' X 10'' 300 Watts Three-Way Speak...,Mobile_Electronics,4,0,0,N,Y,Pyramid - 4'' X 10'',"Their Pyramids not the best, but far from the ...",4/17/13
99907,US,36312341,R3QA3QLQSWI2IJ,B003L1ZYYM,617978254,AmazonBasics High-Speed HDMI Cable - 6.5 Feet ...,Electronics,5,0,0,N,Y,I love FireTV.,This is number three.. I love FireTV.,8/1/14


In [35]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from scipy.sparse import hstack

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Fill missing values in the relevant columns
data['product_title'] = data['product_title'].fillna('')
data['product_category'] = data['product_category'].fillna('')
data['review_body'] = data['review_body'].fillna('')

# Perform basic NLP preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.lower() not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

data['processed_review_body'] = data['review_body'].apply(preprocess_text)

# Sentiment analysis using VADER
analyzer = SentimentIntensityAnalyzer()

def get_sentiment_vader(review):
    score = analyzer.polarity_scores(review)
    return score['compound']

data['sentiment_score'] = data['processed_review_body'].apply(get_sentiment_vader)

print(data['sentiment_score'])


[nltk_data] Downloading package punkt to /Users/aaayush/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aaayush/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/aaayush/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0        0.8906
1        0.2709
2        0.2023
3        0.9001
4        0.8481
          ...  
99904    0.7767
99905   -0.4767
99906   -0.1280
99907    0.6705
99908    0.9892
Name: sentiment_score, Length: 99909, dtype: float64


In [36]:
# Combine features to create a single feature for recommendation
data['combined_text'] = data['product_title'] + " " + data['product_category'] + " " + data['processed_review_body']

# Aggregate data by product_id
aggregated_data = data.groupby('product_id').agg({
    'combined_text': ' '.join,
    'sentiment_score': 'mean',
    'star_rating': 'mean'
}).reset_index()

# Using TF-IDF Vectorizer to convert text data into vectors
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(aggregated_data['combined_text'])

# Combine TF-IDF vectors with sentiment scores
sentiment_scores = aggregated_data['sentiment_score'].values.reshape(-1, 1)
sentiment_scores_sparse = csr_matrix(sentiment_scores)  # Convert to sparse matrix
combined_features = hstack([csr_matrix(tfidf_matrix), sentiment_scores_sparse])

# Calculating the cosine similarity matrix on the combined features
cosine_sim = cosine_similarity(combined_features, combined_features)



In [53]:
# Creating a function to get product recommendations
def get_user_recommendations(user_id, cosine_sim=cosine_sim):
    user_data = data[data['customer_id'] == user_id]
    if user_data.empty:
        return "No data available for this user."
    
    user_product_ids = user_data['product_id'].unique()
    
    sim_scores = []
    for product_id in user_product_ids:
        idx = aggregated_data[aggregated_data['product_id'] == product_id].index[0]
        sim_scores.extend(list(enumerate(cosine_sim[idx])))
    
    # Sort and get top 10 recommendations
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[:10]  # Get top 10 recommendations
    product_indices = [i[0] for i in sim_scores]
    
    recommended_products = aggregated_data.iloc[product_indices]
    recommended_products = recommended_products[~recommended_products['product_id'].isin(user_product_ids)]
    
    return recommended_products


print(get_user_recommendations(data['customer_id'].iloc[0]))

       product_id                                      combined_text  \
27081  B00BI789J8  Jawbone Jambox Wireless Bluetooth Speaker Blac...   
34487  B00M2849C8  Jawbone JAMBOX Wireless Bluetooth Speaker (Bla...   
36018  B00PGQ5F1C  MANIFEST Portable Wireless Bluetooth Speaker w...   
37659  B012EDG356  Bluetooth Wireless Speaker(Blue) Mobile_Electr...   
8403   B001PNPOBQ  Black Bluetooth® Jawbone® II Headset Electroni...   
25364  B009ZIINMU  Protective Travel Carrying Case For Big Jawbon...   
34027  B00L3KW09K  JAM Street Rugged Portable Speaker Electronics...   
36049  B00PJ36URA  Jawbone Mini Jambox Wireless Speaker (Certifie...   
19229  B005KQ2O26  Logitech Bluetooth Wireless Speaker Electronic...   

       sentiment_score  star_rating  
27081         0.963600     5.000000  
34487         0.396209     3.454545  
36018         0.986100     5.000000  
37659         0.877883     5.000000  
8403          0.913100     2.000000  
25364         0.476714     4.285714  
34027        

In [54]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise import accuracy

# Load the dataset
# Assuming data is already loaded and preprocessed in variable `data`

# Fill missing values
data['star_rating'] = data['star_rating'].fillna(0)

# Aggregating the data by customer_id and product_id
data_aggregated = data.groupby(['customer_id', 'product_id'])['star_rating'].mean().reset_index()

# Define a reader with rating scale
reader = Reader(rating_scale=(1, 5))

# Load the data into the Surprise dataset
surprise_data = Dataset.load_from_df(data_aggregated[['customer_id', 'product_id', 'star_rating']], reader)

# Build the full trainset
trainset = surprise_data.build_full_trainset()

# Use SVD algorithm
svd = SVD()

# Train the algorithm on the full trainset
svd.fit(trainset)

# Function to get product recommendations for a specific customer_id
def get_recommendations_for_user(customer_id, svd, data_aggregated, n_recommendations=10):
    # Get the list of all product IDs
    all_product_ids = data_aggregated['product_id'].unique()
    
    # Get the list of products the user has already rated
    rated_products = data_aggregated[data_aggregated['customer_id'] == customer_id]['product_id'].unique()
    
    # Predict ratings for all products the user has not rated yet
    recommendations = []
    for product_id in all_product_ids:
        if product_id not in rated_products:
            predicted_rating = svd.predict(customer_id, product_id).est
            recommendations.append((product_id, predicted_rating))
    
    # Sort the predictions by rating in descending order
    recommendations.sort(key=lambda x: x[1], reverse=True)
    
    # Return the top N recommendations
    return recommendations[:n_recommendations]

# Example usage
customer_id = data['customer_id'].iloc[0]  # Replace with the actual customer_id you want recommendations for
recommendations = get_recommendations_for_user(customer_id, svd, data_aggregated, n_recommendations=10)
print(f"Top 10 recommendations for customer {customer_id}:")
for product_id, rating in recommendations:
    print(f"Product ID: {product_id}, Predicted Rating: {rating:.2f}")

Top 10 recommendations for customer 42746566:
Product ID: B00BEWF4R2, Predicted Rating: 5.00
Product ID: B00J46XO9U, Predicted Rating: 5.00
Product ID: B0052SCU8U, Predicted Rating: 4.95
Product ID: B000092TT0, Predicted Rating: 4.95
Product ID: B00172WYWM, Predicted Rating: 4.94
Product ID: B004LTEUDO, Predicted Rating: 4.94
Product ID: B004MKNJCU, Predicted Rating: 4.92
Product ID: B00B2HTMMW, Predicted Rating: 4.91
Product ID: B004JRYLG4, Predicted Rating: 4.89
Product ID: B00478O0JI, Predicted Rating: 4.89


In [56]:
def get_combined_recommendations(user_id, svd, data_aggregated, cosine_sim, n_recommendations=10):
    # Get recommendations from collaborative filtering
    cf_recommendations = get_recommendations_for_user(user_id, svd, data_aggregated, n_recommendations)
    cf_recommendation_ids = [rec[0] for rec in cf_recommendations]
    
    # Get recommendations from content-based filtering
    cb_recommendations = get_user_recommendations(user_id, cosine_sim)
    if isinstance(cb_recommendations, str):  # Handle case where no data is available for the user
        cb_recommendation_ids = []
    else:
        cb_recommendation_ids = cb_recommendations['product_id'].tolist()
    
    # Combine and get unique recommendations
    unique_recommendation_ids = list(set(cf_recommendation_ids + cb_recommendation_ids))
    
    # Limit the recommendations to the desired number
    unique_recommendation_ids = unique_recommendation_ids[:n_recommendations]
    
    return unique_recommendation_ids

In [57]:
# Example usage
customer_id = data['customer_id'].iloc[0]  # Replace with the actual customer_id you want recommendations for
combined_recommendations = get_combined_recommendations(customer_id, svd, data_aggregated, cosine_sim, n_recommendations=10)
print(f"Top 10 combined recommendations for customer {customer_id}:")
for product_id in combined_recommendations:
    print(f"Product ID: {product_id}")

Top 10 combined recommendations for customer 42746566:
Product ID: B001PNPOBQ
Product ID: B004LTEUDO
Product ID: B00BI789J8
Product ID: B000092TT0
Product ID: B0052SCU8U
Product ID: B00PJ36URA
Product ID: B00B2HTMMW
Product ID: B00M2849C8
Product ID: B009ZIINMU
Product ID: B004JRYLG4


In [58]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix
from math import sqrt

# Define a function to calculate RMSE
def calculate_rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))

# Split the data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Aggregate train and test data by product_id
train_aggregated = train_data.groupby('product_id').agg({
    'combined_text': ' '.join,
    'sentiment_score': 'mean',
    'star_rating': 'mean'
}).reset_index()

test_aggregated = test_data.groupby('product_id').agg({
    'combined_text': ' '.join,
    'sentiment_score': 'mean',
    'star_rating': 'mean'
}).reset_index()

# Using TF-IDF Vectorizer to convert text data into vectors
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_aggregated['combined_text'])
tfidf_matrix_test = tfidf_vectorizer.transform(test_aggregated['combined_text'])

# Model 1: Using only TF-IDF vectors
cosine_sim_tfidf_train = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)
cosine_sim_tfidf_test = cosine_similarity(tfidf_matrix_test, tfidf_matrix_train)

# Model 2: Using TF-IDF vectors combined with sentiment scores
sentiment_scores_train = train_aggregated['sentiment_score'].values.reshape(-1, 1)
sentiment_scores_test = test_aggregated['sentiment_score'].values.reshape(-1, 1)
sentiment_scores_train_sparse = csr_matrix(sentiment_scores_train)
sentiment_scores_test_sparse = csr_matrix(sentiment_scores_test)

combined_features_train = hstack([csr_matrix(tfidf_matrix_train), sentiment_scores_train_sparse])
combined_features_test = hstack([csr_matrix(tfidf_matrix_test), sentiment_scores_test_sparse])

cosine_sim_combined_train = cosine_similarity(combined_features_train, combined_features_train)
cosine_sim_combined_test = cosine_similarity(combined_features_test, combined_features_train)

# Predict ratings for test data using both models
def predict_ratings(test_aggregated, train_aggregated, cosine_sim):
    predicted_ratings = []
    for idx, row in test_aggregated.iterrows():
        similar_indices = cosine_sim[idx].argsort()[::-1][1:6]  # Top 5 similar items
        similar_ratings = train_aggregated.iloc[similar_indices]['star_rating']
        predicted_ratings.append(similar_ratings.mean())
    return np.array(predicted_ratings)

predicted_ratings_tfidf = predict_ratings(test_aggregated, train_aggregated, cosine_sim_tfidf_test)
predicted_ratings_combined = predict_ratings(test_aggregated, train_aggregated, cosine_sim_combined_test)

# Calculate RMSE for both models
rmse_tfidf = calculate_rmse(test_aggregated['star_rating'], predicted_ratings_tfidf)
rmse_combined = calculate_rmse(test_aggregated['star_rating'], predicted_ratings_combined)

print(f"RMSE (TF-IDF only): {rmse_tfidf}")
print(f"RMSE (TF-IDF + Sentiment): {rmse_combined}")


RMSE (TF-IDF only): 1.3642706735544725
RMSE (TF-IDF + Sentiment): 1.244985916086346


In [61]:
improvement_percentage = ((rmse_tfidf - rmse_combined) / rmse_tfidf) * 100

print(f"Improvement Percentage: {improvement_percentage:.2f}%")

Improvement Percentage: 8.74%
