In [21]:
import pandas as pd
import numpy as np
# Load the raw transactions CSV into `data`, decoding the file as latin1.
# NOTE(review): this is an absolute path on one user's Windows machine; it has
# to be edited before the notebook can run anywhere else.
data = pd.read_csv(
    r'C:\Users\Mohamed Magdy\Downloads\archive (1)\data.csv',
    encoding='latin1',
)

In [22]:
# Data preprocessing
# Normalise descriptions: lower-case and drop every character that is neither
# a word character nor whitespace. Missing descriptions stay NaN at this point.
data['Description'] = data['Description'].str.lower().str.replace(r'[^\w\s]', '', regex=True)

# Country to district mapping
# Every value is a coarse region label; the collaborative-filtering functions
# use it to restrict the peer group to customers from the same region.
country_to_district = {
    # Europe
    'United Kingdom': 'Europe',
    'France': 'Europe',
    'Germany': 'Europe',
    'Spain': 'Europe',
    'Portugal': 'Europe',
    'Italy': 'Europe',
    'Netherlands': 'Europe',
    'Switzerland': 'Europe',
    'Belgium': 'Europe',
    'Austria': 'Europe',
    'Sweden': 'Europe',  # was mapped to 'Sweden', which made it a one-country district
    'Finland': 'Europe',
    'Denmark': 'Europe',
    'Norway': 'Europe',
    'Lithuania': 'Europe',
    'Greece': 'Europe',
    'Poland': 'Europe',
    'Cyprus': 'Europe',
    'Malta': 'Europe',
    'Iceland': 'Europe',
    'Channel Islands': 'Europe',
    'European Community': 'Europe',
    'EIRE': 'Europe',  # Ireland
    'Czech Republic': 'Europe',

    # Middle East
    'Saudi Arabia': 'Middle East',
    'Lebanon': 'Middle East',
    'United Arab Emirates': 'Middle East',
    'Israel': 'Middle East',
    'Bahrain': 'Middle East',

    # Asia-Pacific
    'Japan': 'Asia-Pacific',
    'Singapore': 'Asia-Pacific',

    # North America
    'USA': 'North America',
    'Canada': 'North America',

    # Other
    'Australia': 'Oceania',
    'Brazil': 'South America',
    'RSA': 'Africa',
    'Unspecified': 'Unknown'
}

# Countries that are missing from the mapping would otherwise become NaN, and a
# customer whose District is entirely NaN has no mode, which breaks the
# district lookup in the collaborative functions. Put them in the same
# 'Unknown' bucket that 'Unspecified' already uses.
data['District'] = data['Country'].map(country_to_district).fillna('Unknown')

In [23]:
# TF-IDF setup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict


# Number of leading rows kept for the reduced experiment set. The TF-IDF slice
# and the DataFrame slice below must cover exactly the same rows (row i of
# `small_tfidf` describes row i of `small_data`), so the size is defined once.
SUBSET_SIZE = 10_000

# TfidfVectorizer cannot handle NaN, so missing descriptions become empty strings.
data['Description'] = data['Description'].fillna('')

# Fit on the full corpus; `vectorizer` is reused later to embed query strings.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(data['Description'])

small_tfidf = tfidf_matrix[:SUBSET_SIZE]
small_data = data.iloc[:SUBSET_SIZE].copy()

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

# Cosine similarity matrix
# NOTE(review): this builds a SUBSET_SIZE x SUBSET_SIZE item-item matrix, but no
# visible cell reads this global (the recommendation functions compute their
# own similarities). Consider dropping it if nothing else depends on it.
cosine_sim_matrix = cosine_similarity(small_tfidf)

TF-IDF matrix shape: (541909, 2338)


In [24]:
# FIXED: Content-based function - now returns similarity scores for hybrid approach
def get_content_similarity_scores(input_description, data, tfidf_matrix):
    """
    Score every row of ``tfidf_matrix`` against a free-text description.

    The description is lower-cased, stripped of every character that is neither
    alphanumeric nor whitespace, and embedded with the module-level
    ``vectorizer``. ``data`` is accepted for signature symmetry with the other
    helpers but is not read here.

    Returns the flattened linear-kernel result between the query vector and
    ``tfidf_matrix`` (one score per matrix row).
    """
    def _is_kept(ch):
        # Letters, digits and whitespace survive; punctuation is discarded.
        return ch.isalnum() or ch.isspace()

    normalised_query = ''.join(filter(_is_kept, input_description.lower()))

    # Single-row sparse vector in the same feature space as tfidf_matrix.
    query_vector = vectorizer.transform([normalised_query])

    pairwise = linear_kernel(query_vector, tfidf_matrix)
    return pairwise.flatten()

def get_content_recommendations(input_description, data, tfidf_matrix, top_n=5):
    """
    Standalone content-based recommender.

    Ranks the rows of ``data`` by their content similarity to
    ``input_description`` and returns the best ``top_n`` rows with distinct
    descriptions, skipping any row whose description equals the cleaned input.

    Parameters:
    - input_description: free-text description to match against
    - data: DataFrame with 'StockCode', 'Description' and 'UnitPrice' columns;
      row i must correspond to row i of ``tfidf_matrix``
    - tfidf_matrix: TF-IDF matrix of the descriptions in ``data``
    - top_n: maximum number of rows to return; values <= 0 give an empty result

    Returns a DataFrame with columns StockCode, Description, UnitPrice, ordered
    from most to least similar. It holds fewer than ``top_n`` rows when there
    are not enough distinct descriptions.
    """
    output_columns = ['StockCode', 'Description', 'UnitPrice']

    # The original "== top_n" check ran after the first append, so top_n=0
    # never matched and every unique description was returned.
    if top_n <= 0:
        return data.iloc[[]][output_columns]

    sim_scores = get_content_similarity_scores(input_description, data, tfidf_matrix)
    ranked_rows = sim_scores.argsort()[::-1]

    # Same cleaning as the scorer, used to drop exact matches of the query.
    clean_input = ''.join(
        char for char in input_description.lower()
        if char.isalnum() or char.isspace()
    )

    # Pull the column out once; building a full row Series via data.iloc[i]
    # for every candidate is needlessly slow.
    descriptions = data['Description'].to_numpy()

    seen = set()
    picked_rows = []
    for row in ranked_rows:
        desc = descriptions[row]
        if desc == clean_input or desc in seen:
            continue
        seen.add(desc)
        picked_rows.append(row)
        if len(picked_rows) >= top_n:
            break

    return data.iloc[picked_rows][output_columns]

In [25]:
# FIXED: Collaborative filtering function - now returns similarity scores
def get_collaborative_similarity_scores(data, target_user_id, target_stockcode=None):
    """
    Collaborative-filtering score for every item (StockCode) in ``data``.

    The peer group is the (up to) 10 customers from the target user's most
    frequent district whose purchase vectors are most cosine-similar to the
    target's. An item's score is the sum of those peers' mean quantities for
    it, counted only where the peer bought the item and the target did not.

    Parameters:
    - data: DataFrame with 'CustomerID', 'StockCode', 'Quantity', 'District'
    - target_user_id: CustomerID to score for
    - target_stockcode: currently unused; kept for backward compatibility

    Returns a 1-D float array with one entry per column of the
    CustomerID x StockCode pivot table (i.e. per StockCode, in pivot column
    order -- NOT per row of ``data``). All zeros when the user is unknown, has
    no usable district, or has no peers in that district.
    """
    # Create user-item matrix (mean quantity per customer/item, 0 = never bought)
    user_item_matrix = data.pivot_table(index='CustomerID', columns='StockCode', values='Quantity', aggfunc='mean').fillna(0)
    n_items = len(user_item_matrix.columns)

    # Check if target user exists
    if target_user_id not in user_item_matrix.index:
        return np.zeros(n_items)

    # Most frequent district of the target user. mode() drops NaN, so it is
    # empty when every District value of the user is missing; indexing [0]
    # on it would raise IndexError.
    district_modes = data.loc[data['CustomerID'] == target_user_id, 'District'].mode()
    if district_modes.empty:
        return np.zeros(n_items)
    target_district = district_modes.iloc[0]

    # Other customers from the same district that appear in the matrix
    district_customers = data.loc[data['District'] == target_district, 'CustomerID'].unique()
    peer_ids = [
        uid for uid in district_customers
        if uid in user_item_matrix.index and uid != target_user_id
    ]
    if not peer_ids:
        return np.zeros(n_items)

    target_values = user_item_matrix.loc[target_user_id].to_numpy()
    peer_values = user_item_matrix.loc[peer_ids].to_numpy()

    # Only the target's similarity to each peer is needed, so compute a single
    # 1 x n_peers row instead of the full n_peers x n_peers matrix.
    similarities = cosine_similarity(target_values.reshape(1, -1), peer_values)[0]

    # Ten most similar peers; the stable sort keeps the first-seen peer on ties,
    # matching sorted(..., reverse=True).
    top_positions = np.argsort(-similarities, kind='stable')[:10]
    top_peer_values = peer_values[top_positions]

    # Sum quantities of items a peer bought (> 0) that the target has not (== 0)
    is_candidate = (top_peer_values > 0) & (target_values == 0)
    return np.where(is_candidate, top_peer_values, 0.0).sum(axis=0)

def collaborative_filtering_recommendations(data, target_user_id, top_n=10):
    """
    Standalone collaborative-filtering recommender.

    Finds the (up to) 10 customers from the target user's most frequent
    district whose purchase vectors are most cosine-similar to the target's,
    then recommends items those peers bought that the target has not.

    Parameters:
    - data: DataFrame with 'CustomerID', 'StockCode', 'Description',
      'Quantity' and 'District' columns
    - target_user_id: CustomerID to recommend for
    - top_n: maximum number of items to return

    Returns:
    - the string "CustomerID <id> not found." when the user is not in the
      user-item matrix (kept as-is for existing callers), otherwise
    - a DataFrame with columns StockCode, Description, TotalQuantity (summed
      mean quantity across recommending peers), Users (the peers who bought
      the item) and District, sorted by TotalQuantity descending. It is empty
      when the user has no usable district, no peers, or nothing new to
      recommend.
    """
    result_columns = ['StockCode', 'Description', 'TotalQuantity', 'Users', 'District']

    # Create user-item matrix (mean quantity per customer/item, 0 = never bought)
    user_item_matrix = data.pivot_table(index='CustomerID', columns='StockCode', values='Quantity', aggfunc='mean').fillna(0)

    # Check if target user exists
    if target_user_id not in user_item_matrix.index:
        return f"CustomerID {target_user_id} not found."

    # Most frequent district of the target user. mode() drops NaN, so it is
    # empty when every District value of the user is missing; indexing [0]
    # on it would raise IndexError.
    district_modes = data.loc[data['CustomerID'] == target_user_id, 'District'].mode()
    if district_modes.empty:
        return pd.DataFrame(columns=result_columns)
    target_district = district_modes.iloc[0]

    # Other customers from the same district that appear in the matrix
    district_customers = data.loc[data['District'] == target_district, 'CustomerID'].unique()
    peer_ids = [
        uid for uid in district_customers
        if uid in user_item_matrix.index and uid != target_user_id
    ]
    if not peer_ids:
        return pd.DataFrame(columns=result_columns)

    target_user_row = user_item_matrix.loc[target_user_id]

    # Only the target's similarity to each peer is needed, so compute a single
    # 1 x n_peers row instead of the full n_peers x n_peers matrix.
    similarities = cosine_similarity(
        target_user_row.to_numpy().reshape(1, -1),
        user_item_matrix.loc[peer_ids].to_numpy(),
    )[0]

    # Ten most similar peers; the stable sort keeps the first-seen peer on ties.
    top_positions = np.argsort(-similarities, kind='stable')[:10]
    top_user_ids = [peer_ids[pos] for pos in top_positions]

    # Accumulate, per item, the quantity bought by peers and who bought it,
    # restricted to items the target user has never bought.
    not_yet_bought = target_user_row == 0
    item_quantity_map = defaultdict(float)
    item_user_map = defaultdict(list)

    for uid in top_user_ids:
        peer_row = user_item_matrix.loc[uid]
        for item, quantity in peer_row[(peer_row > 0) & not_yet_bought].items():
            item_quantity_map[item] += quantity
            item_user_map[item].append(uid)

    if not item_quantity_map:
        return pd.DataFrame(columns=result_columns)

    rec_df = pd.DataFrame([
        {
            'StockCode': item,
            'TotalQuantity': total_quantity,
            'Users': item_user_map[item],
            'District': target_district
        }
        for item, total_quantity in item_quantity_map.items()
    ])

    rec_df = rec_df.sort_values(by='TotalQuantity', ascending=False).head(top_n)

    # One description per StockCode. De-duplicating on the (StockCode,
    # Description) pair would repeat a recommendation for every description
    # variant of the same item and could return more than top_n rows.
    descriptions = data[['StockCode', 'Description']].drop_duplicates(subset='StockCode')
    rec_df = rec_df.merge(descriptions, on='StockCode', how='left')

    return rec_df[result_columns]


In [26]:
def weighted_hybrid_recommendations(input_description, target_user_id, data, tfidf_matrix, alpha=0.6, top_n=5):
    """
    Weighted hybrid recommendation combining content-based and collaborative filtering

    Parameters:
    - input_description: Description of item for content-based similarity
    - target_user_id: User ID for collaborative filtering
    - data: The dataset
    - tfidf_matrix: TF-IDF matrix for content-based filtering
    - alpha: Weight for content-based (0-1, where 1 = only content-based, 0 = only collaborative)
    - top_n: Number of recommendations to return
    """

    # Get content-based similarity scores
    content_scores = get_content_similarity_scores(input_description, data, tfidf_matrix)

    # Get collaborative filtering similarity scores
    collab_scores = get_collaborative_similarity_scores(data, target_user_id)

    # Handle different lengths by taking minimum length or padding
    min_len = min(len(content_scores), len(collab_scores))
    content_scores = content_scores[:min_len]
    collab_scores = collab_scores[:min_len]

    # Normalize scores to 0-1 range
    scaler = MinMaxScaler()

    # Handle edge case where all scores are the same
    if len(set(content_scores)) > 1:
        content_scores_norm = scaler.fit_transform(content_scores.reshape(-1, 1)).flatten()
    else:
        content_scores_norm = content_scores

    if len(set(collab_scores)) > 1:
        collab_scores_norm = scaler.fit_transform(collab_scores.reshape(-1, 1)).flatten()
    else:
        collab_scores_norm = collab_scores

    # Combine using weighted average
    final_scores = alpha * content_scores_norm + (1 - alpha) * collab_scores_norm

    # Get top N recommendations
    top_indices = final_scores.argsort()[::-1][:top_n*2]  # Get more to filter out duplicates

    # Filter out items that are too similar to input or already purchased
    clean_input = input_description.lower()
    clean_input = ''.join(char for char in clean_input if char.isalnum() or char.isspace())

    recommendations = []
    seen_descriptions = set()

    for idx in top_indices:
        if idx < len(data):
            item_desc = data.iloc[idx]['Description']
            item_stock = data.iloc[idx]['StockCode']

            # Skip if too similar to input or already seen
            if (item_desc != clean_input and
                item_desc not in seen_descriptions and
                final_scores[idx] > 0):  # Only include items with positive scores

                recommendations.append({
                    'StockCode': item_stock,
                    'Description': item_desc,
                    'Content_Score': content_scores_norm[idx],
                    'Collab_Score': collab_scores_norm[idx],
                    'Final_Score': final_scores[idx]
                })
                seen_descriptions.add(item_desc)

                if len(recommendations) >= top_n:
                    break

    return pd.DataFrame(recommendations)

In [27]:
# # EXAMPLE USAGE:
# # Replace with actual values from your dataset
# example_description = "christmas decorative items"
# example_user_id = 12345  # Replace with actual CustomerID
# alpha = 0.6  # 60% content-based, 40% collaborative
#
# # Get hybrid recommendations
# hybrid_results = weighted_hybrid_recommendations(
#     input_description=example_description,
#     target_user_id=example_user_id,
#     data=data,  # or small_data if using subset
#     tfidf_matrix=tfidf_matrix,  # or small_tfidf if using subset
#     alpha=alpha,
#     top_n=5
# )
#
# print("Hybrid Recommendations:")
# print(hybrid_results)

In [28]:
# Content-based demo on the reduced subset: items whose description is closest
# to "WHITE METAL LANTERN".
get_content_recommendations(
    "WHITE METAL LANTERN",
    data=small_data,
    tfidf_matrix=small_tfidf,
)

Unnamed: 0,StockCode,Description,UnitPrice
8529,22464,hanging metal heart lantern,1.65
5805,22224,white lovebird lantern,5.91
452,22465,hanging metal star lantern,1.65
933,22784,lantern cream gazebo,4.95
158,21324,hanging medina lantern small,2.95


In [29]:
# Collaborative-filtering demo on the reduced subset for customer 13047.
collaborative_filtering_recommendations(
    data=small_data,
    target_user_id=13047,
)

Unnamed: 0,StockCode,Description,TotalQuantity,Users,District
0,84946,antique silver tea glass etched,216.0,[15061.0],Europe
1,16237,sleeping cat erasers,180.0,[15061.0],Europe
2,84347,rotating silver angels tlight hldr,180.0,[15061.0],Europe
3,22423,regency cakestand 3 tier,150.0,"[16552.0, 15061.0, 17287.0, 17732.0]",Europe
4,22492,mini paint set vintage,144.0,[15061.0],Europe
5,21499,blue polkadot wrap,125.0,[15061.0],Europe
6,21498,red retrospot wrap,125.0,[15061.0],Europe
7,21212,pack of 72 retrospot cake cases,121.5,"[15061.0, 17968.0]",Europe
8,71459,hanging jam jar tlight holder,120.0,[15061.0],Europe
9,22467,gumball coat rack,109.0,"[15061.0, 15545.0]",Europe


In [30]:
# Hybrid demo on the reduced subset: 60% content-based, 40% collaborative,
# for customer 17850 and the query "WHITE METAL LANTERN".
weighted_hybrid_recommendations(
    input_description="WHITE METAL LANTERN",
    target_user_id=17850,
    data=small_data,
    tfidf_matrix=small_tfidf,
    alpha=0.6,
    top_n=5,
)

Unnamed: 0,StockCode,Description,Content_Score,Collab_Score,Final_Score
0,22464,hanging metal heart lantern,0.701896,0.0,0.421138
1,21657,milk bottle with glass stopper,0.0,1.0,0.4
2,22224,white lovebird lantern,0.648758,0.0,0.389255


In [31]:
# # Simple test codes for each function
#
# # 1. Test Content-Based Recommendations
# print("Testing Content-Based:")
# get_content_recommendations("WHITE METAL LANTERN", small_data, small_tfidf)
#
# # 2. Test Collaborative Filtering
# print("\nTesting Collaborative Filtering:")
# # Use an actual customer ID from your data - replace 12345 with real CustomerID
# collaborative_filtering_recommendations(small_data, 12345)
#
# # 3. Test Hybrid Recommendations
# print("\nTesting Hybrid:")
# weighted_hybrid_recommendations("WHITE METAL LANTERN", 12345, small_data, small_tfidf, alpha=0.6)
#
# # To find actual customer IDs in your data, use:
# print("\nAvailable Customer IDs:")
# print(small_data['CustomerID'].unique()[:10])  # Shows first 10 customer IDs