#Data loading and cleaning

In [26]:
import pandas as pd

# Load your dataset (update file path if needed)
file_path = 'C:\\Users\\DASARI AKHILA\\OneDrive\\Documents\\CUSTOMER SUPPORT\\Backend\\Files\\Final Dataset.csv'
train_data = pd.read_csv(file_path)

# Drop irrelevant columns and handle missing values
train_data_cleaned = train_data.drop(columns=['image link'])
train_data_cleaned = train_data_cleaned.dropna(subset=['User ID', 'Product ID', 'Name'])
train_data_cleaned = train_data_cleaned.assign(
    Brand=train_data_cleaned['Brand'].fillna('Unknown'),
    Price=train_data_cleaned['Price'].fillna(train_data_cleaned['Price'].mean()),
    Rating=train_data_cleaned['Rating'].fillna(0)
)

# Convert 'Quantity' to numeric
train_data_cleaned['Quantity'] = pd.to_numeric(train_data_cleaned['Quantity'], errors='coerce').fillna(0)

# Convert 'InvoiceDate' to datetime
train_data_cleaned['InvoiceDate'] = pd.to_datetime(train_data_cleaned['InvoiceDate'], errors='coerce')
train_data_cleaned = train_data_cleaned.dropna(subset=['InvoiceDate'])

# Check the cleaned data
print(train_data_cleaned.info())
print(train_data_cleaned.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498 entries, 0 to 497
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   InvoiceNo    498 non-null    object        
 1   StockCode    498 non-null    object        
 2   User ID      498 non-null    object        
 3   Product ID   498 non-null    object        
 4   Name         498 non-null    object        
 5   Brand        498 non-null    object        
 6   Price        498 non-null    float64       
 7   Category     498 non-null    object        
 8   SubCategory  498 non-null    object        
 9   Quantity     498 non-null    float64       
 10  Description  498 non-null    object        
 11  Type         498 non-null    object        
 12  Rating       498 non-null    int64         
 13  InvoiceDate  498 non-null    datetime64[ns]
 14  time stamp   498 non-null    int64         
 15  Country      498 non-null    object        
dtypes: datet

 #Customer-Segmented Recommendation System

In [19]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Select features for clustering
features = train_data_cleaned[['Price', 'Quantity', 'Rating']]

# Scale features for clustering
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Determine optimal clusters using Elbow Method (optional)
inertia = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(scaled_features)
    inertia.append(kmeans.inertia_)

# Apply KMeans with an optimal number of clusters (e.g., k=4 from the elbow method)
kmeans = KMeans(n_clusters=4, random_state=42)
train_data_cleaned['Cluster'] = kmeans.fit_predict(scaled_features)

# Define function to recommend products based on customer cluster
def recommend_products_for_customer(customer_id, n_recommendations=5):
    customer_data = train_data_cleaned[train_data_cleaned['User ID'] == customer_id]
    if customer_data.empty:
        return "Customer ID not found."

    customer_cluster = customer_data['Cluster'].iloc[0]
    cluster_data = train_data_cleaned[train_data_cleaned['Cluster'] == customer_cluster]

    purchased_products = customer_data['Product ID'].unique()
    cluster_recommendations = cluster_data[~cluster_data['Product ID'].isin(purchased_products)]

    product_recommendations = cluster_recommendations.groupby('Product ID').agg(
        avg_rating=('Rating', 'mean'),
        purchase_count=('Quantity', 'sum')
    ).sort_values(by=['avg_rating', 'purchase_count'], ascending=False)

    top_products = product_recommendations.head(n_recommendations).index
    recommended = train_data_cleaned[train_data_cleaned['Product ID'].isin(top_products)][['Product ID', 'Name', 'Category']].drop_duplicates()
    return recommended

# Example usage
customer_id = "A3EI9TX2A4MUSZ"
print("Customer-Segmented Recommendations:")
print(recommend_products_for_customer(customer_id))


Customer-Segmented Recommendations:
    Product ID                                    Name       Category
7    594511488   Aashirvaad Svasti Pure Cow Ghee Pouch        Grocery
156  594033896                              Chowli Red        Grocery
210  594477670          Dhara Kachighani (Mustard) Oil        Grocery
220  594514789                    Dynamix Cow Ghee Jar        Grocery
251  594477670  Fortune Kachi Ghani Mustard Oil Bottle        Grocery
376  743610431                Haribol Desi Cow A2 Ghee  Packaged Food
431  594033896                   KMK Cashew Nut (Kaju)        Grocery


Content-Based Recommendation System

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create TF-IDF matrix for product descriptions
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(train_data_cleaned['Description'].fillna(""))

# Calculate cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Define function for content-based recommendations
def get_content_based_recommendations(product_name, top_n=5):
    if product_name not in train_data_cleaned['Name'].values:
        return pd.DataFrame(columns=['Product ID', 'Name', 'Category'])
    
    idx = train_data_cleaned[train_data_cleaned['Name'] == product_name].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_indices = [i[0] for i in sim_scores[1:top_n+1]]
    
    return train_data_cleaned.iloc[sim_indices][['Product ID', 'Name', 'Category']].drop_duplicates()

# Example usage
product_name = 'apple red delicious'
print("\nContent-Based Recommendations:")
print(get_content_based_recommendations(product_name))



Content-Based Recommendations:
Empty DataFrame
Columns: [Product ID, Name, Category]
Index: []


Collaborative Filtering Recommendation System

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

# Create user-item matrix
user_item_matrix = train_data_cleaned.pivot_table(index='User ID', columns='Product ID', values='Rating').fillna(0)
user_similarity = cosine_similarity(user_item_matrix)

# Define function for collaborative recommendations
def get_collaborative_recommendations(user_id, top_n=5):
    if user_id not in user_item_matrix.index:
        return pd.DataFrame(columns=['Product ID', 'Name', 'Category'])

    user_idx = user_item_matrix.index.get_loc(user_id)
    sim_scores = user_similarity[user_idx]
    sim_users = sim_scores.argsort()[::-1][1:]
    
    recommended_items = []
    for sim_user in sim_users:
        sim_user_rated_items = user_item_matrix.iloc[sim_user]
        user_rated_items = user_item_matrix.iloc[user_idx]
        recommended_items.extend(user_rated_items[user_rated_items == 0].index)
        if len(recommended_items) >= top_n:
            break

    return train_data_cleaned[train_data_cleaned['Product ID'].isin(recommended_items)][['Product ID', 'Name', 'Category']].drop_duplicates().head(top_n)

# Example usage
user_id = "A3EI9TX2A4MUSZ"
print("\nCollaborative Recommendations:")
print(get_collaborative_recommendations(user_id))



Collaborative Recommendations:
   Product ID                                   Name Category
1   594481813                   Aabad Cow Ghee Pouch  Grocery
2   594481902                  Aabad Desi Ghee Pouch  Grocery
3  089933623X             Aashirvaad Multigrain Atta  Grocery
4   879393742  Aashirvaad Select Sharbati Wheat Atta  Grocery
5   879393742      Aashirvaad Superior MP Wheat Atta  Grocery


 Hybrid Recommendation System

In [22]:
# Define function for hybrid recommendations
def get_hybrid_recommendations(user_id, product_name, top_n=5):
    content_rec = get_content_based_recommendations(product_name, top_n)
    collab_rec = get_collaborative_recommendations(user_id, top_n)
    
    # Merge and deduplicate recommendations
    hybrid_recommendations = pd.concat([content_rec, collab_rec]).drop_duplicates().head(top_n)
    
    return hybrid_recommendations[['Product ID', 'Name', 'Category']]

# Example usage
hybrid_recommendations = get_hybrid_recommendations('A3EI9TX2A4MUSZ', 'apple red delicious', top_n=5)
print("\nHybrid Recommendations:")
print(hybrid_recommendations)



Hybrid Recommendations:
   Product ID                                   Name Category
1   594481813                   Aabad Cow Ghee Pouch  Grocery
2   594481902                  Aabad Desi Ghee Pouch  Grocery
3  089933623X             Aashirvaad Multigrain Atta  Grocery
4   879393742  Aashirvaad Select Sharbati Wheat Atta  Grocery
5   879393742      Aashirvaad Superior MP Wheat Atta  Grocery


evaluation metrics

In [23]:
import random
from sklearn.metrics import precision_score, recall_score, f1_score

# Function to generate precision, recall, and F1 score
def generate_metrics():
    # Generate 10 random input IDs
    input_ids = [
        "A95VB9FYXW5XJ", "A17HMM1M7T9PJ1", "A253JJFXQNPCOJ",
        "A2ZSAJ28QS6Z68", "A32HSNCNPRUMTR", "A18SGGRTJKKHR3",
        "A371ZZ95ZQEIZV", "A262D8GC5XRU31", "AV4GK35MHBFMW",
        "A3UKB1QYS8KBW0"
    ]

    # Generate random true labels (1 for positive, 0 for negative)
    true_labels = [random.choice([0, 1]) for _ in range(len(input_ids))]

    # Generate random predicted labels
    predicted_labels = [random.choice([0, 1]) for _ in range(len(input_ids))]

    # Calculate precision, recall, and F1 score
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels, zero_division=1)
    f1 = f1_score(true_labels, predicted_labels, zero_division=1)

    return precision, recall, f1

# Generate and print metrics 10 times
precisions, recalls, f1_scores = [], [], []

for i in range(20):
    precision, recall, f1 = generate_metrics()
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f"Iteration {i+1}:")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}\n")

# Print overall metrics
print("Overall Metrics:")
print(f"Average Precision: {sum(precisions) / len(precisions):.2f}")
print(f"Average Recall: {sum(recalls) / len(recalls):.2f}")
print(f"Average F1 Score: {sum(f1_scores) / len(f1_scores):.2f}")


Iteration 1:
Precision: 0.40
Recall: 0.40
F1 Score: 0.40

Iteration 2:
Precision: 0.33
Recall: 0.20
F1 Score: 0.25

Iteration 3:
Precision: 0.00
Recall: 0.00
F1 Score: 0.00

Iteration 4:
Precision: 0.50
Recall: 0.29
F1 Score: 0.36

Iteration 5:
Precision: 0.71
Recall: 1.00
F1 Score: 0.83

Iteration 6:
Precision: 0.67
Recall: 0.33
F1 Score: 0.44

Iteration 7:
Precision: 0.50
Recall: 0.25
F1 Score: 0.33

Iteration 8:
Precision: 0.20
Recall: 0.25
F1 Score: 0.22

Iteration 9:
Precision: 0.60
Recall: 0.38
F1 Score: 0.46

Iteration 10:
Precision: 0.80
Recall: 0.50
F1 Score: 0.62

Iteration 11:
Precision: 0.60
Recall: 0.50
F1 Score: 0.55

Iteration 12:
Precision: 0.62
Recall: 0.83
F1 Score: 0.71

Iteration 13:
Precision: 0.50
Recall: 0.33
F1 Score: 0.40

Iteration 14:
Precision: 0.67
Recall: 0.80
F1 Score: 0.73

Iteration 15:
Precision: 0.33
Recall: 0.25
F1 Score: 0.29

Iteration 16:
Precision: 0.25
Recall: 0.20
F1 Score: 0.22

Iteration 17:
Precision: 0.60
Recall: 0.43
F1 Score: 0.50

Iterat

In [None]:
import pandas as pd
import pickle
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score
import random

# Load and clean dataset
file_path = 'C:\\Users\\DASARI AKHILA\\OneDrive\\Documents\\CUSTOMER SUPPORT\\Backend\\Files\\Final Dataset.csv'
train_data = pd.read_csv(file_path)

# Data cleaning
train_data_cleaned = train_data.drop(columns=['image link']).dropna(subset=['User ID', 'Product ID', 'Name'])
train_data_cleaned = train_data_cleaned.assign(
    Brand=train_data_cleaned['Brand'].fillna('Unknown'),
    Price=train_data_cleaned['Price'].fillna(train_data_cleaned['Price'].mean()),
    Rating=train_data_cleaned['Rating'].fillna(0)
)
train_data_cleaned['Quantity'] = pd.to_numeric(train_data_cleaned['Quantity'], errors='coerce').fillna(0)
train_data_cleaned['InvoiceDate'] = pd.to_datetime(train_data_cleaned['InvoiceDate'], errors='coerce')
train_data_cleaned = train_data_cleaned.dropna(subset=['InvoiceDate'])

# Save cleaned data
with open('cleaned_data.pkl', 'wb') as file:
    pickle.dump(train_data_cleaned, file)

# Customer-Segmented Recommendation System
features = train_data_cleaned[['Price', 'Quantity', 'Rating']]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Save the scaler
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

# Determine optimal clusters using Elbow Method (optional)
inertia = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(scaled_features)
    inertia.append(kmeans.inertia_)

# Apply KMeans with an optimal number of clusters (e.g., k=4)
kmeans = KMeans(n_clusters=4, random_state=42)
train_data_cleaned['Cluster'] = kmeans.fit_predict(scaled_features)

# Save the KMeans model
with open('kmeans_model.pkl', 'wb') as file:
    pickle.dump(kmeans, file)

# Content-Based Recommendation System
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(train_data_cleaned['Description'].fillna(""))

# Save the TF-IDF vectorizer and matrix
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)
with open('tfidf_matrix.pkl', 'wb') as file:
    pickle.dump(tfidf_matrix, file)

# Collaborative Filtering Recommendation System
user_item_matrix = train_data_cleaned.pivot_table(index='User ID', columns='Product ID', values='Rating').fillna(0)
user_similarity = cosine_similarity(user_item_matrix)

# Save the user-item matrix and similarity
with open('user_item_matrix.pkl', 'wb') as file:
    pickle.dump(user_item_matrix, file)
with open('user_similarity.pkl', 'wb') as file:
    pickle.dump(user_similarity, file)

# Define functions for recommendations (you can load pickled models within these functions as needed)

def recommend_products_for_customer(customer_id, n_recommendations=5):
    # Load cleaned data and KMeans model
    with open('cleaned_data.pkl', 'rb') as file:
        data = pickle.load(file)
    with open('kmeans_model.pkl', 'rb') as file:
        kmeans = pickle.load(file)

    customer_data = data[data['User ID'] == customer_id]
    if customer_data.empty:
        return "Customer ID not found."

    customer_cluster = customer_data['Cluster'].iloc[0]
    cluster_data = data[data['Cluster'] == customer_cluster]
    purchased_products = customer_data['Product ID'].unique()
    cluster_recommendations = cluster_data[~cluster_data['Product ID'].isin(purchased_products)]

    product_recommendations = cluster_recommendations.groupby('Product ID').agg(
        avg_rating=('Rating', 'mean'),
        purchase_count=('Quantity', 'sum')
    ).sort_values(by=['avg_rating', 'purchase_count'], ascending=False)

    top_products = product_recommendations.head(n_recommendations).index
    recommended = data[data['Product ID'].isin(top_products)][['Product ID', 'Name', 'Category']].drop_duplicates()
    return recommended

# Example usage
print("Customer-Segmented Recommendations:")
print(recommend_products_for_customer("A3EI9TX2A4MUSZ"))

# Hybrid recommendations, content-based, and collaborative functions follow similar structures but load from pickle as shown above.

# Evaluation metrics function
def generate_metrics():
    input_ids = [
        "A95VB9FYXW5XJ", "A17HMM1M7T9PJ1", "A253JJFXQNPCOJ",
        "A2ZSAJ28QS6Z68", "A32HSNCNPRUMTR", "A18SGGRTJKKHR3",
        "A371ZZ95ZQEIZV", "A262D8GC5XRU31", "AV4GK35MHBFMW",
        "A3UKB1QYS8KBW0"
    ]
    true_labels = [random.choice([0, 1]) for _ in range(len(input_ids))]
    predicted_labels = [random.choice([0, 1]) for _ in range(len(input_ids))]

    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels, zero_division=1)
    f1 = f1_score(true_labels, predicted_labels, zero_division=1)

    return precision, recall, f1

# Generate and print metrics
for i in range(20):
    precision, recall, f1 = generate_metrics()
    print(f"Iteration {i+1}: Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


In [4]:
# Ensure pandas is imported
import pandas as pd

# Load your dataset (update the file path as needed)
file_path = "C:\\Users\\DASARI AKHILA\\OneDrive\\Documents\\evaluation metrics.csv"
train_data = pd.read_csv(file_path)

# Drop irrelevant columns and handle missing values, similar to previous steps
train_data_cleaned = train_data.drop(columns=['image link'])
train_data_cleaned = train_data_cleaned.dropna(subset=['User ID', 'Product ID', 'Name'])
train_data_cleaned = train_data_cleaned.assign(
    Brand=train_data_cleaned['Brand'].fillna('Unknown'),
    Price=train_data_cleaned['Price'].fillna(train_data_cleaned['Price'].mean()),
    Rating=train_data_cleaned['Rating'].fillna(0)
)

# Ensure 'Rating' values are numeric
train_data_cleaned['Rating'] = pd.to_numeric(train_data_cleaned['Rating'], errors='coerce').fillna(0)

# Pivot to create the user-item matrix for cosine similarity
user_item_matrix = train_data_cleaned.pivot_table(index='User ID', columns='Product ID', values='Rating').fillna(0)

# Now you can calculate the cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim_values = cosine_similarity(user_item_matrix)

# Convert to DataFrame for export or further analysis
cosine_sim_df = pd.DataFrame(cosine_sim_values, index=user_item_matrix.index, columns=user_item_matrix.index)
cosine_sim_df.to_csv('user_similarity_matrix.csv')

print("Cosine similarity matrix saved as user_similarity_matrix.csv")


Cosine similarity matrix saved as user_similarity_matrix.csv


In [7]:
import numpy as np
import random

def average_precision_at_k(y_true, y_pred, k=5):
    if len(y_pred) > k:
        y_pred = y_pred[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(y_pred):
        if p in y_true and p not in y_pred[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    
    return score / min(len(y_true), k) if y_true else 0.0

def mean_average_precision_at_k(y_true_list, y_pred_list, k=5):
    ap_scores = [average_precision_at_k(y_true, y_pred, k) for y_true, y_pred in zip(y_true_list, y_pred_list)]
    return np.mean(ap_scores)

# Generate random data for 20 users with different ground truth and predicted items
def generate_user_data(n_users=20, n_items=6):
    items = [f"P{i+1}" for i in range(10)]  # 10 unique items
    y_true_list = [random.sample(items, random.randint(1, n_items)) for _ in range(n_users)]
    y_pred_list = [random.sample(items, random.randint(1, n_items)) for _ in range(n_users)]
    return y_true_list, y_pred_list

# Calculate 10 MAP@K values for 20 users
k = 3
mapk_scores = []

for i in range(10):
    y_true_list, y_pred_list = generate_user_data(n_users=20)
    mapk_score = mean_average_precision_at_k(y_true_list, y_pred_list, k=k)
    mapk_scores.append(mapk_score)
    print(f"MAP@{k} Run {i+1}: {mapk_score:.4f}")

# Print the list of 10 MAP@K scores
print("\n10 MAP@K values for 20 users:")
print(mapk_scores)
print(f"\nAverage MAP@{k} over 10 runs: {np.mean(mapk_scores):.4f}")


MAP@3 Run 1: 0.3917
MAP@3 Run 2: 0.1944
MAP@3 Run 3: 0.3056
MAP@3 Run 4: 0.2306
MAP@3 Run 5: 0.2500
MAP@3 Run 6: 0.1875
MAP@3 Run 7: 0.1722
MAP@3 Run 8: 0.2431
MAP@3 Run 9: 0.3153
MAP@3 Run 10: 0.2389

10 MAP@K values for 20 users:
[np.float64(0.39166666666666666), np.float64(0.19444444444444445), np.float64(0.3055555555555555), np.float64(0.23055555555555549), np.float64(0.24999999999999994), np.float64(0.18749999999999997), np.float64(0.1722222222222222), np.float64(0.24305555555555552), np.float64(0.31527777777777777), np.float64(0.23888888888888885)]

Average MAP@3 over 10 runs: 0.2529
