In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Load data
data = pd.read_csv("/kaggle/input/amazon-us-customer-reviews-dataset/amazon_reviews_us_Electronics_v1_00.tsv",sep='\t',on_bad_lines='skip')

In [27]:
data = data.sample(1000)

In [42]:
data = data.reset_index(drop=True)
data

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,processed_title,processed_review
0,US,15609489,RHGPDF18ORCP9,B005QVY2YS,137224738,eforCity 406110 HDMI to 3-RCA Male Video Compo...,Electronics,1.00,0.000000,0.000000,N,Y,excellent product,This excellent product I brought to QUITO-ECUA...,2012-12-14,eforc hdmi rca male video compon cabl hdtv,excel product brought quitoecuador south ameri...
1,US,46799215,RHEOA0CR068J,B009WA2NKY,348306861,BlueRigger DVI Male to DVI Male Digital Dual-L...,Electronics,1.00,0.000000,0.000000,N,Y,Nicely Done,Great product. Worked exactly as expected.<br ...,2014-03-17,bluerigg dvi male dvi male digit duallink cabl,great product work exactli expectedbr br need ...
2,US,48501625,R3N3T8P6W7FVKY,B003CK8TZ8,509448955,Griffin Technology GC17096 Mini DisplayPort to...,Electronics,1.00,0.004630,0.004329,N,N,Perfect,I bought this to use with a late 2009 iMac (24...,2010-05-15,griffin technolog gc mini displayport hdmi dvi...,bought use late imac pass video audio pass aud...
3,US,47940339,RUBC3VZ85L30A,B0002861MG,646687419,"Da-Lite 40932 Wall Mount Brackets, Pack of 2 #...",Electronics,1.00,0.000000,0.000000,N,Y,Easy to create,It was very easy to put together you put one w...,2014-09-02,dalit wall mount bracket pack l mount extens b...,easi put togeth put one washer wool anoth top ...
4,US,48911947,R1EI772NYCKFLL,B007T0W5CA,182221938,RCA RCD30 Alarm Clock with LED display,Electronics,0.75,0.000000,0.000000,N,Y,"Basic Clock, a little smaller than expected",This is a basic clock and works fine. When I ...,2013-10-02,rca rcd alarm clock led display,basic clock work fine receiv littl smaller exp...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,US,21547372,R2X7WWKK0XUJHR,B000J14JG6,898350267,"SportBand with case for iPod Video, Microsoft ...",Electronics,0.75,0.000000,0.000000,N,N,Not intended for NANO's,"The product is very nice, but advertized as be...",2007-11-21,sportband case ipod video microsoft zune creat...,product nice advert nano true big nano bounc a...
996,US,16095785,R26P4RUF7R3ZJ,B000MAR0EC,334059333,(3 Kit) Leather Case Car Charger USB Cable for...,Electronics,0.00,0.000000,0.000000,N,N,(3 Kit) Leather Case Car Charger USB Cable for...,Ordered it for the leather case - which is not...,2007-07-18,kit leather case car charger usb cabl lg cu,order leather case realli leather case went ba...
997,US,22983395,R2ZH356JA70RDU,B00BBG0V06,646671040,"SanDisk Sansa Fuze+ 4GB MP3 Player with 2.4"" L...",Electronics,0.75,0.000000,0.000000,N,Y,Sansa MP3 - Refurbished,Really good product for the price! Prime is th...,2013-08-12,sandisk sansa fuze gb mp player lcd screen tou...,realli good product price prime way go mp orde...
998,US,44102524,R3BCWXS2S8Z4ND,B009A6CZ26,951356451,"Sony MDR-IF245RK Wireless IF Headphone,",Electronics,1.00,0.009259,0.008658,N,Y,Sony MDR-IF 245RK WirelessIF Headphone is a gr...,"If you have a person who is hard of hearing, t...",2015-04-06,soni mdrifrk wireless headphon,person hard hear fantast buy would go crazi li...


In [43]:
def preprocess_text(text):
    import re
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower() 
    words = text.split()  
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

In [44]:
data['processed_title'] = data['product_title'].apply(preprocess_text)
data['processed_review'] = data['review_body'].apply(preprocess_text)

In [45]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
review_tfidf = tfidf_vectorizer.fit_transform(data['review_body'])

In [46]:
scaler = MinMaxScaler()
data[['star_rating', 'helpful_votes', 'total_votes']] = scaler.fit_transform(
    data[['star_rating', 'helpful_votes', 'total_votes']]
)

In [47]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=10, random_state=42)  # 10 topics
lda_features = lda.fit_transform(review_tfidf)

In [48]:
lda_features

array([[0.02403888, 0.02403896, 0.02403803, ..., 0.02403832, 0.02403781,
        0.02403955],
       [0.01843361, 0.01843544, 0.01843608, ..., 0.83408882, 0.01843377,
        0.01843562],
       [0.01500527, 0.01501389, 0.01501053, ..., 0.01501972, 0.0150049 ,
        0.01502172],
       ...,
       [0.02126232, 0.02127057, 0.02126685, ..., 0.02127122, 0.02127159,
        0.02126807],
       [0.02219486, 0.02219468, 0.02219917, ..., 0.02220176, 0.02219498,
        0.02219492],
       [0.0283684 , 0.02836772, 0.02836812, ..., 0.02836966, 0.02836606,
        0.0283666 ]])

In [49]:
from scipy.sparse import csr_matrix
user_ids = data['customer_id'].astype('category').cat.codes
item_ids = data['product_id'].astype('category').cat.codes

user_mapping = dict(enumerate(data['customer_id'].astype('category').cat.categories))
item_mapping = dict(enumerate(data['product_id'].astype('category').cat.categories))

interaction_matrix = csr_matrix((data['star_rating'], (user_ids, item_ids)))
interaction_matrix_dense = interaction_matrix.toarray()


In [50]:
from sklearn.decomposition import TruncatedSVD

# Perform SVD for matrix factorization
n_factors = 10
svd = TruncatedSVD(n_components=n_factors, random_state=42)
user_factors = svd.fit_transform(interaction_matrix)
item_factors = svd.components_.T                      


In [51]:
from scipy.special import softmax


item_factors_transformed = softmax(item_factors, axis=1)

def alignment_loss(latent_factors, lda_topics):
    return np.sum((latent_factors - lda_topics) ** 2)


In [52]:
# Map LDA topic distributions to items by averaging per product_id
unique_item_ids = data['product_id'].unique()
aggregated_lda = []

In [53]:
unique_item_ids = data['product_id'].unique()

aggregated_lda = []

for item_id in unique_item_ids:
    indices = data[data['product_id'] == item_id].index.tolist()
    
    aggregated_lda.append(lda_features[indices].mean(axis=0))

aggregated_lda = np.array(aggregated_lda)

# Check the dimensions
print("Shape of item_factors:", item_factors.shape)
print("Shape of aggregated_lda:", aggregated_lda.shape)


Shape of item_factors: (911, 10)
Shape of aggregated_lda: (911, 10)


In [54]:
n_iterations = 1000
observed = interaction_matrix_dense > 0
# Adam parameters
beta1, beta2 = 0.9, 0.999  # Decay rates for moment estimates
epsilon = 1e-8  # Small value to prevent division by zero

# Initialize Adam moment vectors for user and item factors
m_user, v_user = np.zeros_like(user_factors), np.zeros_like(user_factors)
m_item, v_item = np.zeros_like(item_factors), np.zeros_like(item_factors)

# Adam Gradient Descent Loop
for iteration in range(n_iterations):
    # Predict ratings
    prediction = user_factors @ item_factors.T

    # Compute gradients
    error = (interaction_matrix_dense - prediction) * observed  # Masked error
    grad_user_factors = -error @ item_factors + 2 * user_factors  # Add regularization gradient
    grad_item_factors = -error.T @ user_factors + 2 * item_factors

    # Alignment gradient
    exp_factors = np.exp(item_factors - np.max(item_factors, axis=1, keepdims=True))
    softmax_item_factors = exp_factors / exp_factors.sum(axis=1, keepdims=True)
    alignment_grad_item_factors = 2 * (softmax_item_factors - aggregated_lda)
    grad_item_factors += 0.1 * alignment_grad_item_factors

    # Adam updates for user_factors
    m_user = beta1 * m_user + (1 - beta1) * grad_user_factors
    v_user = beta2 * v_user + (1 - beta2) * (grad_user_factors ** 2)
    m_user_hat = m_user / (1 - beta1 ** (iteration + 1))  # Bias correction
    v_user_hat = v_user / (1 - beta2 ** (iteration + 1))  # Bias correction
    user_factors -= learning_rate * m_user_hat / (np.sqrt(v_user_hat) + epsilon)

    # Adam updates for item_factors
    m_item = beta1 * m_item + (1 - beta1) * grad_item_factors
    v_item = beta2 * v_item + (1 - beta2) * (grad_item_factors ** 2)
    m_item_hat = m_item / (1 - beta1 ** (iteration + 1))  # Bias correction
    v_item_hat = v_item / (1 - beta2 ** (iteration + 1))  # Bias correction
    item_factors -= learning_rate * m_item_hat / (np.sqrt(v_item_hat) + epsilon)


In [56]:
def recommend_for_user(user_id, top_n=5):
    user_idx = list(user_mapping.keys())[list(user_mapping.values()).index(user_id)]
    predictions = user_factors[user_idx] @ item_factors.T
    top_items = np.argsort(predictions)[::-1][:top_n]
    return [item_mapping[item] for item in top_items]

# Example: Recommend for a user
user_id = 15609489
recommended_products = [
    data[data['product_id'] == pid]['product_title'].iloc[0]
    for pid in recommend_for_user(user_id, top_n=5)
]
user_products = data[data['customer_id']==user_id]['product_title']
print(recommended_products)
print(user_products)

['CABTE High speed HDMI 1.4 HDMI cable 10ft 1080p with mesh&filters supports 3D&blue ray', 'Harman Kardon Soundsticks III 2.1 Channel Multimedia Speaker System with Subwoofer', 'Apple iPod classic 160 GB (7th Generation) NEWEST MODEL', 'Sony ICFCDK50 Under Cabinet Kitchen CD Clock Radio', 'Eforcity HDMI Cable, 10 feet']
0    eforCity 406110 HDMI to 3-RCA Male Video Compo...
Name: product_title, dtype: object
