# **Import Library**

In [1]:
from pathlib import Path
import joblib
import pandas as pd
from sklearn.pipeline import make_pipeline
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# **Load Dataset**

In [2]:
data_path = Path.cwd().parent / 'data' / 'cleaned_amazon_reviews.csv'

amazon_df = pd.read_csv(data_path)

In [3]:
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    sentiment_score = analyzer.polarity_scores(text)
    return 1 if sentiment_score['compound'] >= 0.05 else 0 

amazon_df['sentiment'] = amazon_df['reviews.text'].apply(analyze_sentiment)

In [None]:
sentiment_summary = amazon_df.groupby('asins').agg({
    'sentiment': ['sum', 'count']
}).reset_index()
sentiment_summary.columns = ['asins', 'positive_count', 'total_reviews']
sentiment_summary['positive_ratio'] = sentiment_summary['positive_count'] / sentiment_summary['total_reviews'] * 100

avg_rating_dict = amazon_df.groupby('asins')['reviews.rating'].mean().to_dict()

high_quality_products = sentiment_summary[sentiment_summary['positive_ratio'] > 80]

In [None]:
reviews_text_per_asin = amazon_df.groupby('asins')['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

tfidf_data = reviews_text_per_asin.merge(high_quality_products[['asins']], on='asins', how='inner')

tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(tfidf_data['reviews.text'])

similarity_matrix_tfidf = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
def get_recommendations(asin, keyword=None, top_n=5):
    if asin not in high_quality_products['asins'].values:
        return [{"error": f"ASIN {asin} not found or has low positive ratio (<80%)."}]

    asin_idx = tfidf_data[tfidf_data['asins'] == asin].index[0]
    sim_scores = list(enumerate(similarity_matrix_tfidf[asin_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = [score for score in sim_scores if score[0] != asin_idx][:top_n]

    recommendations = []
    for idx, score in sim_scores:
        rec_asin = tfidf_data.iloc[idx]['asins']
        product_data = amazon_df[amazon_df['asins'] == rec_asin].iloc[0]
        recommendations.append({
            'asins': rec_asin,
            'name': product_data['name'],
            'categories': product_data['categories'],
            'average_rating': round(avg_rating_dict.get(rec_asin, 0), 2),
            'positive_ratio': round(
                high_quality_products[high_quality_products['asins'] == rec_asin]['positive_ratio'].iloc[0], 2),
            'similarity_score': round(score, 2)
        })

    if keyword:
        keyword_tfidf = tfidf.transform([keyword])
        keyword_sim = cosine_similarity(keyword_tfidf, tfidf_matrix)
        keyword_scores = list(enumerate(keyword_sim[0]))
        keyword_scores = sorted(keyword_scores, key=lambda x: x[1], reverse=True)
        keyword_recommendations = [tfidf_data.iloc[idx]['asins'] for idx, _ in keyword_scores[:top_n]]
        recommendations = [rec for rec in recommendations if rec['asins'] in keyword_recommendations]

    return recommendations[:top_n]  

In [7]:
sample_asin = 'B01AHB9CN2'
recommendations = get_recommendations(sample_asin, keyword="tablet", top_n=3)
print("Recommendations for ASIN:", sample_asin)
for rec in recommendations:
    print(rec)

Recommendations for ASIN: B01AHB9CN2
{'asins': 'B018Y229OU', 'name': 'Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes Special Offers, Magenta', 'categories': 'Fire Tablets', 'average_rating': 4.45, 'positive_ratio': np.float64(89.52), 'similarity_score': np.float64(0.97)}
{'asins': 'B018Y225IA', 'name': 'Brand New Amazon Kindle Fire 16gb 7 Ips Display Tablet Wifi 16 Gb Blue,,,', 'categories': 'Computers/Tablets & Networking', 'average_rating': 4.5, 'positive_ratio': np.float64(86.99), 'similarity_score': np.float64(0.96)}


In [8]:
sentiment_summary_path = Path.cwd().parent / 'data' / 'sentiment_summary.csv'
high_quality_products_path = Path.cwd().parent / 'data' / 'high_quality_products.csv'

sentiment_summary.to_csv(sentiment_summary_path, index=False)
high_quality_products.to_csv(high_quality_products_path, index=False)

# **Save Vectorizer & Matrix**

In [9]:
vectorizer_path = Path.cwd().parent / 'model' / 'tfidf_vectorizer_product.pkl'
matrix_path = Path.cwd().parent / 'model' / 'tfidf_matrix_product.pkl'

joblib.dump(tfidf, vectorizer_path)
joblib.dump(tfidf_matrix, matrix_path)

print("TF-IDF Vectorizer Saved To: model/tfidf_vectorizer_product.pkl")
print("TF-IDF Matrix Saved To: model/tfidf_matrix_product.pkl")

TF-IDF Vectorizer Saved To: model/tfidf_vectorizer_product.pkl
TF-IDF Matrix Saved To: model/tfidf_matrix_product.pkl
