In [20]:
import kagglehub
import pandas as pd
import os
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

In [21]:
path = kagglehub.dataset_download("carrie1/ecommerce-data")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/ecommerce-data


In [22]:
csv_files = [file for file in os.listdir(path) if file.endswith('.csv')]
print("CSV files:", csv_files)

if csv_files:
    df = pd.read_csv(os.path.join(path, csv_files[0]), encoding='ISO-8859-1')  # or UTF-8 depending on file
    print("Loaded DataFrame shape:", df.shape)
else:
    print("No CSV file found in the dataset.")

CSV files: ['data.csv']
Loaded DataFrame shape: (541909, 8)


In [23]:
# Drop rows with missing CustomerID and create a new DataFrame
df = df.dropna(subset=['CustomerID','Description']).copy()

# Now safely convert to int
df['CustomerID'] = df['CustomerID'].astype(int)

In [24]:
df['Description'].nunique()

3896

In [6]:
df.columns = df.columns.str.strip()  # removes leading/trailing spaces

In [7]:
# Filter rows where StockCode contains any non-digit character
non_numeric_stockcodes = df[~df['StockCode'].str.isdigit()]

# Show unique string values in StockCode
unique_string_values = non_numeric_stockcodes['StockCode'].unique()
print(unique_string_values)

['85123A' '84406B' '84029G' '84029E' 'POST' '82494L' '85099C' '84997B'
 '84997C' '84519A' '85183B' '85071B' '37444A' '37444C' '84971S' '15056BL'
 '15056N' 'D' '35004C' '85049A' '85099B' '35004G' '85014B' '85014A'
 '84970S' '84030E' '35004B' '85049E' '17091A' '84509A' '84510A' '84709B'
 '84625C' '84625A' '47570B' '85049C' '85049D' '85049G' '84970L' '90199C'
 '90129F' '90210B' '72802C' '85169B' '85099F' '85184C' '35591T' '84032B'
 '85049H' '72800E' '84849B' '90200B' '90059B' '90185C' '90059E' '90059C'
 '90200C' '90200D' '90200A' '16258A' '85231B' '85231G' '48173C' '47563A'
 '84558A' '46000M' '71406C' '84985A' '84596E' '84997D' '47599A' '47599B'
 '85035B' '84968C' '72800B' '84563A' '47504H' '17164B' '15044B' '84569B'
 '85114B' '85114C' '85199L' '85199S' '85019A' '85019C' '85071A' '85071C'
 '85135B' '85136A' '85136C' 'C2' '79144B' '46000R' '46000S' '84508A'
 '85232B' '79066K' '84884A' '51014C' '51014L' '51014A' '79302M' '84509B'
 '84870C' '84870B' 'M' '85032D' '84760S' '35598D' '35598B' '1

In [8]:
import pandas as pd

# Filter StockCodes with letters only (no numbers)
letters_only = df[df['StockCode'].str.match(r'^[A-Za-z]+$', na=False)]

# Count and percentage
letters_only_count = letters_only.shape[0]
total_count = df.shape[0]
percentage_letters_only = (letters_only_count / total_count) * 100

# Output
print(f"StockCodes with letters only: {letters_only_count} ({percentage_letters_only:.2f}%)")

StockCodes with letters only: 1774 (0.44%)


In [9]:
# Drop rows where StockCode contains only letters (no numbers)
df = df[~df['StockCode'].str.match(r'^[A-Za-z]+$', na=False)]

# Optional: check shape before and after
print("Original shape:", df.shape)
print("New shape after dropping letter-only StockCodes:", df.shape)

Original shape: (405055, 8)
New shape after dropping letter-only StockCodes: (405055, 8)


In [10]:
import pandas as pd


# Drop nulls, remove duplicates, group by StockCode, and take the first description
df = df[df['StockCode'] != 'BANK CHARGES']
# Display all rows
print(df.head())

  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

      InvoiceDate  UnitPrice  CustomerID         Country  
0  12/1/2010 8:26       2.55       17850  United Kingdom  
1  12/1/2010 8:26       3.39       17850  United Kingdom  
2  12/1/2010 8:26       2.75       17850  United Kingdom  
3  12/1/2010 8:26       3.39       17850  United Kingdom  
4  12/1/2010 8:26       3.39       17850  United Kingdom  


In [11]:
df['Country'].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Greece', 'Singapore', 'Lebanon',
       'United Arab Emirates', 'Saudi Arabia', 'Czech Republic', 'Canada',
       'Unspecified', 'Brazil', 'USA', 'European Community', 'Bahrain',
       'Malta', 'RSA'], dtype=object)

In [12]:
country_to_district = {
    # Europe
    'United Kingdom': 'Europe',
    'France': 'Europe',
    'Germany': 'Europe',
    'Spain': 'Europe',
    'Portugal': 'Europe',
    'Italy': 'Europe',
    'Netherlands': 'Europe',
    'Switzerland': 'Europe',
    'Belgium': 'Europe',
    'Austria': 'Europe',
    'Sweden': 'Europe',
    'Finland': 'Europe',
    'Denmark': 'Europe',
    'Norway': 'Europe',
    'Lithuania': 'Europe',
    'Greece': 'Europe',
    'Poland': 'Europe',
    'Cyprus': 'Europe',
    'Malta': 'Europe',
    'Iceland': 'Europe',
    'Channel Islands': 'Europe',
    'European Community': 'Europe',

    # Middle East
    'Saudi Arabia': 'Middle East',
    'Lebanon': 'Middle East',
    'United Arab Emirates': 'Middle East',
    'Israel': 'Middle East',
    'Bahrain': 'Middle East',

    # Asia-Pacific
    'Japan': 'Asia-Pacific',
    'Singapore': 'Asia-Pacific',

    # North America
    'USA': 'North America',
    'Canada': 'North America',

    # Other
    'Australia': 'Oceania',
    'EIRE': 'Europe',
    'Brazil': 'South America',
    'RSA': 'Africa',
    'Czech Republic': 'Europe',
    'Unspecified': 'Unknown'
}
df['District'] = df['Country'].map(country_to_district)

In [13]:
df['District'].value_counts()

Unnamed: 0_level_0,count
District,Unnamed: 1_level_1
Europe,402053
Oceania,1257
Asia-Pacific,570
North America,441
Middle East,389
Unknown,244
Africa,57
South America,32


In [14]:
df.drop_duplicates(inplace=True)

In [15]:
df.shape

(399823, 9)

# Categorization

In [16]:
!pip install transformers -U



In [17]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
descriptions = df['Description']
clean_descriptions = [desc for desc in descriptions if isinstance(desc, str) and len(desc.strip()) > 5]

embeddings = model.encode(clean_descriptions, show_progress_bar=True)
cos_sim_matrix = cosine_similarity(embeddings)


similarity_threshold = 0.60
n = len(clean_descriptions)
group_labels = [-1] * n
current_group = 0


for i in range(n):
    if group_labels[i] == -1:
        group_labels[i] = current_group
        for j in range(i + 1, n):
            if group_labels[j] == -1 and cos_sim_matrix[i][j] >= similarity_threshold:
                group_labels[j] = current_group
        current_group += 1


desc_to_group = dict(zip(clean_descriptions, group_labels))
df['Description_Categorize'] = df['Description'].map(desc_to_group)
print(df[['Description', 'Description_Categorize']])

# Collaborative Filtering

In [16]:
df.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country', 'District'],
      dtype='object')

In [17]:
# # Prepare final mapping: user_id → list of recommended StockCodes
# user_recommendations = {}

# # Get all similar users' indices and their actual CustomerIDs
# similar_user_ids = user_item_matrix.index[similar_user_indices]

# # For quick access to all similar users' purchase data
# similar_users_matrix = user_item_matrix.loc[similar_user_ids]

# # Include target user in the source of recommendations
# source_users = pd.concat([similar_users_matrix, user_item_matrix.iloc[[target_user_index]]])

# # For each similar user, recommend products bought by others but not by them
# for user_id in similar_user_ids:
#     user_vector = user_item_matrix.loc[user_id]

#     # Other users (excluding the current user)
#     other_users = source_users.drop(index=user_id, errors='ignore')

#     # Items with Q > 0 in other users
#     popular_items = (other_users > 0).any(axis=0)

#     # Items with Q == 0 for current user
#     not_owned = user_vector == 0

#     # Recommend items that are popular in others and missing in this user
#     recommended_items = popular_items[popular_items & not_owned].index.tolist()

#     user_recommendations[user_id] = recommended_items

# # 🔍 Display all recommendations
# for user, items in user_recommendations.items():
#     print(f"User {user} recommended items: {items}")

In [18]:
df['CustomerID'].head()

Unnamed: 0,CustomerID
0,17850
1,17850
2,17850
3,17850
4,17850


In [19]:
def collaborative_filtering_recommendations(df, target_user_id, top_n=10):
    # Create user-item matrix
    user_item_matrix = df.pivot_table(index='CustomerID', columns='StockCode', values='Quantity', aggfunc='mean').fillna(0)

    # Check if target user exists
    if target_user_id not in user_item_matrix.index:
        return f"CustomerID {target_user_id} not found."

    # Get target user’s district
    target_district = df[df['CustomerID'] == target_user_id]['District'].mode().values[0]

    # Filter to only customers in the same district
    same_district_users = df[df['District'] == target_district]['CustomerID'].unique()
    same_district_users = [uid for uid in same_district_users if uid in user_item_matrix.index and uid != target_user_id]

    # Create new matrix only for these users
    filtered_user_item_matrix = user_item_matrix.loc[[target_user_id] + same_district_users]

    # Compute cosine similarity
    similarity_matrix = cosine_similarity(filtered_user_item_matrix)
    target_index = filtered_user_item_matrix.index.get_loc(target_user_id)
    similarities = similarity_matrix[target_index]

    # Get top 10 similar users (excluding self)
    similar_users = [(uid, similarities[i]) for i, uid in enumerate(filtered_user_item_matrix.index) if uid != target_user_id]
    top_similar_users = sorted(similar_users, key=lambda x: x[1], reverse=True)[:10]
    top_user_ids = [uid for uid, _ in top_similar_users]

    # Recommendation logic
    target_user_row = user_item_matrix.loc[target_user_id]
    item_quantity_map = defaultdict(float)
    item_user_map = defaultdict(list)

    for uid in top_user_ids:
        similar_user_row = user_item_matrix.loc[uid]
        recommended_mask = (similar_user_row > 0) & (target_user_row == 0)
        recommended_items = user_item_matrix.columns[recommended_mask]

        for item in recommended_items:
            item_quantity_map[item] += similar_user_row[item]
            item_user_map[item].append(uid)

    # Build result
    if not item_quantity_map:
        return pd.DataFrame(columns=['StockCode', 'Description', 'TotalQuantity', 'Users', 'District'])

    rec_df = pd.DataFrame([
        {
            'StockCode': item,
            'TotalQuantity': item_quantity_map[item],
            'Users': item_user_map[item],
            'District': target_district
        }
        for item in item_quantity_map
    ])

    rec_df = rec_df.sort_values(by='TotalQuantity', ascending=False).head(top_n)
    rec_df = rec_df.merge(df[['StockCode', 'Description']].drop_duplicates(), on='StockCode', how='left')

    return rec_df[['StockCode', 'Description', 'TotalQuantity', 'Users', 'District']]

In [20]:
target_user_id = 17850
top_n = 5
result = collaborative_filtering_recommendations(df, target_user_id, top_n)
result

Unnamed: 0,StockCode,Description,TotalQuantity,Users,District
0,21733,RED HANGING HEART T-LIGHT HOLDER,64.0,"[13831, 14548]",Europe
1,84077,WORLD WAR 2 GLIDERS ASSTD DESIGNS,48.0,[13831],Europe
2,23103,BELL HEART DECORATION,24.0,[13831],Europe
3,23103,JINGLE BELL HEART DECORATION,24.0,[13831],Europe
4,22716,CARD CIRCUS PARADE,12.0,[16690],Europe
5,84946,ANTIQUE SILVER TEA GLASS ETCHED,12.0,[16690],Europe
6,84946,ANTIQUE SILVER T-LIGHT GLASS,12.0,[16690],Europe


# Content-Based

In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['Description'])

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

TF-IDF matrix shape: (399823, 1970)


In [68]:
def get_recommendations(input_description, data, tfidf_matrix, top_n=5):
    from sklearn.metrics.pairwise import linear_kernel

    # Preprocess input description
    clean_input = input_description.lower()
    clean_input = ''.join(char for char in clean_input if char.isalnum() or char.isspace())

    # Vectorize input
    input_vec = vectorizer.transform([clean_input])

    # Compute similarity between input and all items
    sim_scores = linear_kernel(input_vec, tfidf_matrix).flatten()
    sim_indices = sim_scores.argsort()[::-1]

    # Filter out exact duplicates of input description
    seen = set()
    recommendations = []
    for i in sim_indices:
        desc = data.iloc[i]['Description']
        if desc != clean_input and desc not in seen:
            seen.add(desc)
            recommendations.append(i)
        if len(recommendations) == top_n:
            break

    return data.iloc[recommendations][['Description']]

# CrewAI

In [21]:
%pip install -U --quiet crewai==0.76.2
%pip install -U --quiet langchain langgraph
%pip install transformers -U
%pip install groq

Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Using cached tokenizers-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3:
      Successfully uninstalled tokenizers-0.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chromadb 0.5.23 requires tokenizers<=0.20.3,>=0.13.2, but you have tokenizers 0.21.2 which is incompatible.[0m[31m
[0mSuccessfully installed tokenizers-0.21.2


In [24]:
!pip install crewai-tools

Collecting tokenizers<=0.20.3,>=0.13.2 (from chromadb>=0.4.22->crewai-tools)
  Using cached tokenizers-0.20.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached tokenizers-0.20.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.2
    Uninstalling tokenizers-0.21.2:
      Successfully uninstalled tokenizers-0.21.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 4.53.2 requires tokenizers<0.22,>=0.21, but you have tokenizers 0.20.3 which is incompatible.[0m[31m
[0mSuccessfully installed tokenizers-0.20.3


In [60]:
!pip install langchain_groq

Collecting langchain_groq
  Downloading langchain_groq-0.3.6-py3-none-any.whl.metadata (2.6 kB)
Downloading langchain_groq-0.3.6-py3-none-any.whl (16 kB)
Installing collected packages: langchain_groq
Successfully installed langchain_groq-0.3.6


In [44]:
from google.colab import userdata
os.environ["GROQ_API_KEY"] = userdata.get("Recomm")

In [63]:
from crewai import Agent, Task, Crew, LLM
from groq import Groq
from langchain_groq import ChatGroq

# llm = ChatGroq(
#     model="llama-3.1-70b-versatile",
#     temperature=0.5,
#     max_completion_tokens=1024,
#     top_p=0.9,
#     stop=None,
#     stream=False,
# )
llm=ChatGroq(temperature=0,
             model_name="llama3-70b-8192",
             api_key='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')

In [71]:
from langchain.tools import Tool

# Define agents

data_engineer = Agent(
    role='Data Engineer',
    goal='Prepare and optimize retail transaction data',
    backstory='Expert in data preprocessing and feature engineering',
    verbose=True,
    llm=llm,
    allow_delegation=False
)

data_prep_task = Task(
    description="""
    Prepare the retail transaction data for analysis:
    1. Handle missing values
    2. Ensure proper data types
    3. Validate district information
    4. Create any necessary derived features
    """,
    agent=data_engineer,
    expected_output="Cleaned DataFrame ready for recommendation analysis",
    output_file="preprocessed_data.csv"
)

# Create a Tool from the collaborative filtering function
cf_tool = Tool(
    name="collaborative_filtering_recommendations",
    func=collaborative_filtering_recommendations,
    description="Generates product recommendations using user-based collaborative filtering with district constraints"
)

# Define the CF specialist agent with the proper tool
cf_specialist = Agent(
    role='Collaborative Filtering Specialist',
    goal='Generate personalized product recommendations',
    backstory='Data scientist specializing in neighborhood-based recommendation systems',
    verbose=True,
    llm=llm,
    tools=[cf_tool]  # Now using the properly formatted Tool object
)

results_analyst = Agent(
    role='Recommendation Analyst',
    goal='Analyze and validate recommendation quality',
    backstory='Business analyst with expertise in evaluating recommendation systems',
    llm=llm,
    verbose=True
)

cf_recommendation_task = Task(
    description="""
    Generate product recommendations using collaborative filtering:
    1. Implement user-based filtering with district constraints
    2. Calculate user similarities within the same district
    3. Generate top product recommendations
    4. Include metadata about recommendation sources
    """,
    agent=cf_specialist,
    expected_output="DataFrame with top product recommendations including supporting metrics",
    context=[data_prep_task],
    output_file="cf_recommendations.csv"
)

analysis_task = Task(
    description="""
    Analyze the recommendation results:
    1. Evaluate recommendation diversity
    2. Check for business validity
    3. Suggest improvements to the algorithm
    4. Prepare summary statistics
    """,
    agent=results_analyst,
    expected_output="Report with recommendation analysis and improvement suggestions",
    context=[cf_recommendation_task],
    output_file="recommendation_analysis.md"
)

recommendation_crew = Crew(
    agents=[data_engineer, cf_specialist, results_analyst],
    tasks=[data_prep_task, cf_recommendation_task, analysis_task],
    verbose=True  # Changed from 2 to True
)

# Execute with sample input
inputs = {
    'df': df,
    'target_user_id': 17850  # Example customer from North district
}

results = recommendation_crew.kickoff(inputs=inputs)

# Print final recommendations
print("\n=== Final Recommendations ===")
print(results)

ERROR:root:Failed to get supported params: argument of type 'NoneType' is not iterable
ERROR:root:Failed to get supported params: argument of type 'NoneType' is not iterable
ERROR:root:Failed to get supported params: argument of type 'NoneType' is not iterable
ERROR:root:Failed to get supported params: argument of type 'NoneType' is not iterable
ERROR:root:Failed to get supported params: argument of type 'NoneType' is not iterable
ERROR:root:Failed to get supported params: argument of type 'NoneType' is not iterable
ERROR:root:Failed to get supported params: argument of type 'NoneType' is not iterable
ERROR:root:Failed to get supported params: argument of type 'NoneType' is not iterable
ERROR:root:Failed to get supported params: argument of type 'NoneType' is not iterable
ERROR:root:Failed to get supported params: argument of type 'NoneType' is not iterable
ERROR:root:LiteLLM call failed: litellm.BadRequestError: LLM Provider NOT provided. Pass in the LLM provider you are trying to cal


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m

[1m[95m# Agent:[00m [1m[92mData Engineer[00m
[95m## Task:[00m [92m
    Prepare the retail transaction data for analysis:
    1. Handle missing values
    2. Ensure proper data types
    3. Validate district information
    4. Create any necessary derived features
    [00m

[1;31mProvider List: https://docs.litellm.ai/

ERROR:root:LiteLLM call failed: litellm.BadRequestError: LLM Provider NOT provided. Pass in the LLM provider you are trying to call. You passed model=llama3-70b-8192
 Pass model as E.g. For 'Huggingface' inference endpoints pass in `completion(model='huggingface/starcoder',..)` Learn more: https://docs.litellm.ai/docs/providers


BadRequestError: litellm.BadRequestError: LLM Provider NOT provided. Pass in the LLM provider you are trying to call. You passed model=llama3-70b-8192
 Pass model as E.g. For 'Huggingface' inference endpoints pass in `completion(model='huggingface/starcoder',..)` Learn more: https://docs.litellm.ai/docs/providers