In [4]:
import pandas as pd
import numpy as np
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# data loading

def load_data(data_dir='data'):
    """Load the five CSV tables used by the recommender.

    Reads ``clients.csv``, ``products.csv``, ``transactions.csv``,
    ``stores.csv`` and ``stocks.csv`` from *data_dir* and parses the
    ``SaleTransactionDate`` column of the transactions as datetimes.

    Args:
        data_dir: Directory holding the CSV files. Defaults to ``'data'``,
            the location that used to be hard-coded.

    Returns:
        Tuple ``(clients, products, transactions, stores, stocks)`` of
        DataFrames.
    """
    # Local import keeps this definition self-contained.
    from pathlib import Path

    base = Path(data_dir)
    clients, products, transactions, stores, stocks = (
        pd.read_csv(base / f'{name}.csv')
        for name in ('clients', 'products', 'transactions', 'stores', 'stocks')
    )

    transactions['SaleTransactionDate'] = pd.to_datetime(
        transactions['SaleTransactionDate']
    )
    return clients, products, transactions, stores, stocks



# improved recommender

class FastRecommenderImproved:
    """Cold-start recommender based on recency-weighted profile popularity.

    Every client gets a profile string built from country, gender, age
    group and segment. Products are scored by how much the clients sharing
    that profile bought them, each transaction weighted by
    ``exp(-days_ago / 180)``. When the full profile has no purchases, the
    profile without the age group is used instead (scores scaled by 0.7).
    In-stock products sharing a family with one of the profile's five best
    sellers receive a flat 0.3, and the global popularity ranking fills any
    remaining slots.

    Missing country, gender, segment or age values are labelled
    ``'Unknown'``. A bare string concatenation would turn the whole profile
    into NaN, which silently excludes the client (and its transactions)
    from every profile table.
    """

    @staticmethod
    def _as_label(column):
        """Return *column* as strings, with missing values as 'Unknown'."""
        return (
            column.astype(object)
            .where(column.notna(), 'Unknown')
            .astype(str)
        )

    @staticmethod
    def _index_scores(pop, key):
        """Map each value of ``pop[key]`` to its (ProductID, score) pairs."""
        index = defaultdict(list)
        for name, pid, score in zip(pop[key], pop['ProductID'], pop['score']):
            index[name].append((pid, score))
        return dict(index)

    def __init__(self, clients, products, transactions, stocks):
        """Precompute profiles, popularity tables and stock availability.

        Args:
            clients: DataFrame with ``ClientID``, ``ClientCountry``,
                ``ClientGender``, ``Age`` and ``ClientSegment`` columns.
            products: DataFrame with ``ProductID``, ``FamilyLevel1`` and
                ``FamilyLevel2`` columns; ``ProductID`` must be unique.
            transactions: DataFrame with ``ClientID``, ``ProductID`` and a
                datetime ``SaleTransactionDate`` column.
            stocks: DataFrame with ``StoreCountry``, ``ProductID`` and
                ``Quantity`` columns.

        The inputs are copied; the caller's frames are not modified.
        """
        self.clients = clients.copy()
        self.products = products.copy()
        self.transactions = transactions.copy()
        self.stocks = stocks.copy()

        # fill missing age values
        self.clients['Age'] = self.clients['Age'].fillna(-1)

        # detailed profiles; ages outside (0, 100], including the -1
        # sentinel, fall into no bin and are labelled 'Unknown'
        self.clients['Age_Group'] = self._as_label(
            pd.cut(
                self.clients['Age'],
                bins=[0, 25, 35, 50, 100],
                labels=['18-25', '26-35', '36-50', '50+']
            )
        )

        country = self._as_label(self.clients['ClientCountry'])
        gender = self._as_label(self.clients['ClientGender'])
        segment = self._as_label(self.clients['ClientSegment'])

        self.clients['Profile'] = (
            country + '_' + gender + '_' +
            self.clients['Age_Group'] + '_' + segment
        )
        self.clients['Profile_NoAge'] = (
            country + '_' + gender + '_' + segment
        )

        # recency-weighted popularity
        now = self.transactions['SaleTransactionDate'].max()
        self.transactions['days_ago'] = (
            now - self.transactions['SaleTransactionDate']
        ).dt.days
        self.transactions['weight'] = np.exp(
            -self.transactions['days_ago'] / 180
        )

        # one merge feeds both profile tables; transactions of clients
        # missing from `clients` get NaN profiles and drop out of the groupby
        with_profiles = self.transactions.merge(
            self.clients[['ClientID', 'Profile', 'Profile_NoAge']],
            on='ClientID',
            how='left'
        )

        # popularity by full profile
        self.profile_pop = (
            with_profiles
            .groupby(['Profile', 'ProductID'])['weight']
            .sum()
            .reset_index(name='score')
        )

        # popularity by profile without age
        self.profile_pop_noage = (
            with_profiles
            .groupby(['Profile_NoAge', 'ProductID'])['weight']
            .sum()
            .reset_index(name='score')
        )

        # global popularity
        self.global_pop = (
            self.transactions
            .groupby('ProductID')['weight']
            .sum()
            .to_dict()
        )

        # product similarity
        self.prod_family = (
            products
            .set_index('ProductID')[['FamilyLevel1', 'FamilyLevel2']]
            .to_dict(orient='index')
        )

        # available products by country
        stock_avail = (
            stocks
            .groupby(['StoreCountry', 'ProductID'])
            .agg({'Quantity': 'sum'})
            .reset_index()
        )
        stock_avail = stock_avail[stock_avail['Quantity'] > 0]

        self.stock_by_country = (
            stock_avail
            .groupby('StoreCountry')['ProductID']
            .apply(set)
            .to_dict()
        )

        # lookup structures so predict() never rescans a DataFrame
        self._profile_scores = self._index_scores(self.profile_pop, 'Profile')
        self._noage_scores = self._index_scores(
            self.profile_pop_noage, 'Profile_NoAge'
        )
        self._global_ranked = [
            pid for pid, _ in sorted(
                self.global_pop.items(),
                key=lambda item: item[1],
                reverse=True
            )
        ]
        self._all_products = set(self.products['ProductID'].unique())

        # first row wins when a ClientID is duplicated
        first_rows = self.clients.drop_duplicates('ClientID')
        self._client_info = dict(zip(
            first_rows['ClientID'],
            zip(
                first_rows['ClientCountry'],
                first_rows['Profile'],
                first_rows['Profile_NoAge']
            )
        ))

    def predict(self, prospect_id, top_n=10):
        """Return up to *top_n* recommended ProductIDs for a client.

        Args:
            prospect_id: ``ClientID`` of the client to recommend for.
            top_n: Maximum number of products to return.

        Returns:
            List of ProductIDs, best first. Empty when the client is
            unknown or *top_n* is not positive. May be shorter than *top_n*
            when too few products are available.
        """
        info = self._client_info.get(prospect_id)
        if info is None or top_n <= 0:
            return []
        country, profile, profile_noage = info

        # countries without any stock record are not restricted
        available = self.stock_by_country.get(country, self._all_products)

        scores = {}

        # full profile popularity, falling back to the profile without age
        prof_prods = self._profile_scores.get(profile)
        scale = 1.0
        if not prof_prods:
            prof_prods = self._noage_scores.get(profile_noage, [])
            scale = 0.7

        if prof_prods:
            max_score = max(score for _, score in prof_prods)
            # guard against an all-zero table (e.g. only undated sales)
            norm = max_score if max_score > 0 else 1.0
            for pid, score in prof_prods:
                if pid in available:
                    scores[pid] = scale * score / norm

        # fast product similarity
        top_profile_products = [
            pid for pid, _ in sorted(
                prof_prods, key=lambda pair: pair[1], reverse=True
            )[:5]
        ]

        for pid in top_profile_products:
            fam = self.prod_family.get(pid, {})
            level1 = fam.get('FamilyLevel1')
            level2 = fam.get('FamilyLevel2')
            for other_pid, f in self.prod_family.items():
                if other_pid not in available or other_pid in scores:
                    continue
                if (
                    f.get('FamilyLevel1') == level1
                    or f.get('FamilyLevel2') == level2
                ):
                    # only unscored products get here, so the bonus is
                    # granted once and never accumulates
                    scores[other_pid] = 0.3

        # ranking (stable sort: ties keep insertion order)
        ranked = sorted(
            scores.items(),
            key=lambda item: item[1],
            reverse=True
        )
        recs = [pid for pid, _ in ranked[:top_n]]

        # global fallback
        if len(recs) < top_n:
            chosen = set(recs)
            for pid in self._global_ranked:
                if pid in chosen or pid not in available:
                    continue
                recs.append(pid)
                chosen.add(pid)
                if len(recs) >= top_n:
                    break

        return recs[:top_n]


# cold start evaluation on loyal clients

def eval_cold_start(model, clients, transactions, top_n=10, sample=200,
                    seed=None):
    """Estimate hit rate@top_n on loyal clients with purchase history.

    A client is eligible when its segment is ``'LOYAL'`` and it has at
    least three transactions. For every sampled client the products of its
    five earliest transactions are the ground truth; a hit is counted when
    ``model.predict`` returns at least one of them.

    NOTE(review): if *model* was fitted on the same *transactions*, the
    ground truth is part of its training data and the score is optimistic.
    Fit the model without the evaluated purchases for a true cold-start
    estimate.

    Args:
        model: Object exposing ``predict(client_id, top_n=...)``.
        clients: DataFrame with ``ClientID`` and ``ClientSegment`` columns.
        transactions: DataFrame with ``ClientID``, ``ProductID`` and
            ``SaleTransactionDate`` columns.
        top_n: Number of recommendations requested per client.
        sample: Maximum number of clients to evaluate.
        seed: Optional seed for the client sample. ``None`` keeps the
            previous behaviour of drawing from NumPy's global random state.

    Returns:
        Fraction of evaluated clients with at least one hit (0.0 when no
        client could be evaluated). The result is also printed.
    """
    counts = transactions.groupby('ClientID').size()
    multi = counts[counts >= 3].index

    loyal = clients[
        (clients['ClientSegment'] == 'LOYAL') &
        (clients['ClientID'].isin(multi))
    ]['ClientID'].values

    n_test = min(sample, len(loyal))
    if n_test <= 0:
        test_clients = loyal[:0]
    elif seed is None:
        test_clients = np.random.choice(loyal, n_test, replace=False)
    else:
        test_clients = np.random.default_rng(seed).choice(
            loyal, n_test, replace=False
        )

    # One pass over the transactions, not one full scan per client.
    # The stable sort keeps file order for same-day purchases.
    truth_by_client = {}
    if len(test_clients):
        first_purchases = (
            transactions[transactions['ClientID'].isin(test_clients)]
            .sort_values('SaleTransactionDate', kind='stable')
            .groupby('ClientID', sort=False)
            .head(5)
        )
        truth_by_client = {
            cid: bought.unique()
            for cid, bought in first_purchases.groupby('ClientID')['ProductID']
        }

    hits = []
    for cid in test_clients:
        ground_truth = truth_by_client.get(cid)
        if ground_truth is None or len(ground_truth) == 0:
            continue

        preds = set(model.predict(cid, top_n=top_n))
        hits.append(int(any(p in preds for p in ground_truth)))

    hit_rate = np.mean(hits) if hits else 0.0
    print(
        f"\nhit rate@{top_n}: "
        f"{hit_rate:.3f} ({hit_rate*100:.1f}%) "
        f"on {len(hits)} test clients"
    )
    return hit_rate


# main function

def main():
    """Run the demo end to end and return ``(model, hit_rate)``.

    Loads the data, fits the recommender, prints hit rate@10 on 400 sampled
    loyal clients, then prints the top 5 recommendations for one randomly
    drawn prospect.
    """
    print("\ncold start recommender")

    print("loading data...")
    clients, products, transactions, _stores, stocks = load_data()
    n_clients, n_transactions = len(clients), len(transactions)
    print(f"   {n_clients:,} clients")
    print(f"   {n_transactions:,} transactions")

    print("\nbuilding model...")
    model = FastRecommenderImproved(clients, products, transactions, stocks)
    print("   model ready")

    hit_rate = eval_cold_start(
        model, clients, transactions, top_n=10, sample=400
    )

    # pick one prospect at random for a qualitative look
    is_prospect = clients['ClientSegment'] == 'PROSPECT'
    prospect = clients[is_prospect].sample(1).iloc[0]
    prospect_id = prospect['ClientID']

    print(f"\nexample recommendations for prospect {prospect_id}:")
    print(
        f"country: {prospect['ClientCountry']}, "
        f"gender: {prospect['ClientGender']}, "
        f"age: {prospect['Age']}"
    )

    for rank, pid in enumerate(model.predict(prospect_id, top_n=5), 1):
        matches = products[products['ProductID'] == pid]
        if matches.empty:
            # unknown product id: skip the line but keep the rank number
            continue
        details = matches.iloc[0]
        print(f"   {rank}. {details['Category']} - {details['FamilyLevel2']}")

    print("\nfinished")
    return model, hit_rate


# Run the demo only when executed as a script or notebook cell, keeping the
# fitted model and its hit rate in module-level names for later inspection.
if __name__ == "__main__":
    model, hit_rate = main()



cold start recommender
loading data...
   424,037 clients
   1,177,175 transactions

building model...
   model ready

hit rate@10: 0.150 (15.0%) on 400 test clients

example recommendations for prospect 2612415748174957955:
country: FRA, gender: nan, age: nan
   1. Football - Puma Third Jersey
   2. Basketball - Spalding NBA Official Game Ball
   3. Football - Nike Dri-FIT
   4. Football - Nike Ordem V
   5. Football - Puma Future

finished


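Same version as above, but printing the top 10 recommendations for the example prospect (and evaluating on 200 sampled clients instead of 400).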

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# data loading

def load_data(data_dir='data'):
    """Load the five CSV tables used by the recommender.

    Reads ``clients.csv``, ``products.csv``, ``transactions.csv``,
    ``stores.csv`` and ``stocks.csv`` from *data_dir* and parses the
    ``SaleTransactionDate`` column of the transactions as datetimes.

    Args:
        data_dir: Directory holding the CSV files. Defaults to ``'data'``,
            the location that used to be hard-coded.

    Returns:
        Tuple ``(clients, products, transactions, stores, stocks)`` of
        DataFrames.
    """
    # Local import keeps this definition self-contained.
    from pathlib import Path

    base = Path(data_dir)
    clients, products, transactions, stores, stocks = (
        pd.read_csv(base / f'{name}.csv')
        for name in ('clients', 'products', 'transactions', 'stores', 'stocks')
    )

    transactions['SaleTransactionDate'] = pd.to_datetime(
        transactions['SaleTransactionDate']
    )
    return clients, products, transactions, stores, stocks



# improved recommender

class FastRecommenderImproved:
    """Cold-start recommender based on recency-weighted profile popularity.

    Every client gets a profile string built from country, gender, age
    group and segment. Products are scored by how much the clients sharing
    that profile bought them, each transaction weighted by
    ``exp(-days_ago / 180)``. When the full profile has no purchases, the
    profile without the age group is used instead (scores scaled by 0.7).
    In-stock products sharing a family with one of the profile's five best
    sellers receive a flat 0.3, and the global popularity ranking fills any
    remaining slots.

    Missing country, gender, segment or age values are labelled
    ``'Unknown'``. A bare string concatenation would turn the whole profile
    into NaN, which silently excludes the client (and its transactions)
    from every profile table.
    """

    @staticmethod
    def _as_label(column):
        """Return *column* as strings, with missing values as 'Unknown'."""
        return (
            column.astype(object)
            .where(column.notna(), 'Unknown')
            .astype(str)
        )

    @staticmethod
    def _index_scores(pop, key):
        """Map each value of ``pop[key]`` to its (ProductID, score) pairs."""
        index = defaultdict(list)
        for name, pid, score in zip(pop[key], pop['ProductID'], pop['score']):
            index[name].append((pid, score))
        return dict(index)

    def __init__(self, clients, products, transactions, stocks):
        """Precompute profiles, popularity tables and stock availability.

        Args:
            clients: DataFrame with ``ClientID``, ``ClientCountry``,
                ``ClientGender``, ``Age`` and ``ClientSegment`` columns.
            products: DataFrame with ``ProductID``, ``FamilyLevel1`` and
                ``FamilyLevel2`` columns; ``ProductID`` must be unique.
            transactions: DataFrame with ``ClientID``, ``ProductID`` and a
                datetime ``SaleTransactionDate`` column.
            stocks: DataFrame with ``StoreCountry``, ``ProductID`` and
                ``Quantity`` columns.

        The inputs are copied; the caller's frames are not modified.
        """
        self.clients = clients.copy()
        self.products = products.copy()
        self.transactions = transactions.copy()
        self.stocks = stocks.copy()

        # fill missing age values
        self.clients['Age'] = self.clients['Age'].fillna(-1)

        # detailed profiles; ages outside (0, 100], including the -1
        # sentinel, fall into no bin and are labelled 'Unknown'
        self.clients['Age_Group'] = self._as_label(
            pd.cut(
                self.clients['Age'],
                bins=[0, 25, 35, 50, 100],
                labels=['18-25', '26-35', '36-50', '50+']
            )
        )

        country = self._as_label(self.clients['ClientCountry'])
        gender = self._as_label(self.clients['ClientGender'])
        segment = self._as_label(self.clients['ClientSegment'])

        self.clients['Profile'] = (
            country + '_' + gender + '_' +
            self.clients['Age_Group'] + '_' + segment
        )
        self.clients['Profile_NoAge'] = (
            country + '_' + gender + '_' + segment
        )

        # recency-weighted popularity
        now = self.transactions['SaleTransactionDate'].max()
        self.transactions['days_ago'] = (
            now - self.transactions['SaleTransactionDate']
        ).dt.days
        self.transactions['weight'] = np.exp(
            -self.transactions['days_ago'] / 180
        )

        # one merge feeds both profile tables; transactions of clients
        # missing from `clients` get NaN profiles and drop out of the groupby
        with_profiles = self.transactions.merge(
            self.clients[['ClientID', 'Profile', 'Profile_NoAge']],
            on='ClientID',
            how='left'
        )

        # popularity by full profile
        self.profile_pop = (
            with_profiles
            .groupby(['Profile', 'ProductID'])['weight']
            .sum()
            .reset_index(name='score')
        )

        # popularity by profile without age
        self.profile_pop_noage = (
            with_profiles
            .groupby(['Profile_NoAge', 'ProductID'])['weight']
            .sum()
            .reset_index(name='score')
        )

        # global popularity
        self.global_pop = (
            self.transactions
            .groupby('ProductID')['weight']
            .sum()
            .to_dict()
        )

        # product similarity
        self.prod_family = (
            products
            .set_index('ProductID')[['FamilyLevel1', 'FamilyLevel2']]
            .to_dict(orient='index')
        )

        # available products by country
        stock_avail = (
            stocks
            .groupby(['StoreCountry', 'ProductID'])
            .agg({'Quantity': 'sum'})
            .reset_index()
        )
        stock_avail = stock_avail[stock_avail['Quantity'] > 0]

        self.stock_by_country = (
            stock_avail
            .groupby('StoreCountry')['ProductID']
            .apply(set)
            .to_dict()
        )

        # lookup structures so predict() never rescans a DataFrame
        self._profile_scores = self._index_scores(self.profile_pop, 'Profile')
        self._noage_scores = self._index_scores(
            self.profile_pop_noage, 'Profile_NoAge'
        )
        self._global_ranked = [
            pid for pid, _ in sorted(
                self.global_pop.items(),
                key=lambda item: item[1],
                reverse=True
            )
        ]
        self._all_products = set(self.products['ProductID'].unique())

        # first row wins when a ClientID is duplicated
        first_rows = self.clients.drop_duplicates('ClientID')
        self._client_info = dict(zip(
            first_rows['ClientID'],
            zip(
                first_rows['ClientCountry'],
                first_rows['Profile'],
                first_rows['Profile_NoAge']
            )
        ))

    def predict(self, prospect_id, top_n=10):
        """Return up to *top_n* recommended ProductIDs for a client.

        Args:
            prospect_id: ``ClientID`` of the client to recommend for.
            top_n: Maximum number of products to return.

        Returns:
            List of ProductIDs, best first. Empty when the client is
            unknown or *top_n* is not positive. May be shorter than *top_n*
            when too few products are available.
        """
        info = self._client_info.get(prospect_id)
        if info is None or top_n <= 0:
            return []
        country, profile, profile_noage = info

        # countries without any stock record are not restricted
        available = self.stock_by_country.get(country, self._all_products)

        scores = {}

        # full profile popularity, falling back to the profile without age
        prof_prods = self._profile_scores.get(profile)
        scale = 1.0
        if not prof_prods:
            prof_prods = self._noage_scores.get(profile_noage, [])
            scale = 0.7

        if prof_prods:
            max_score = max(score for _, score in prof_prods)
            # guard against an all-zero table (e.g. only undated sales)
            norm = max_score if max_score > 0 else 1.0
            for pid, score in prof_prods:
                if pid in available:
                    scores[pid] = scale * score / norm

        # fast product similarity
        top_profile_products = [
            pid for pid, _ in sorted(
                prof_prods, key=lambda pair: pair[1], reverse=True
            )[:5]
        ]

        for pid in top_profile_products:
            fam = self.prod_family.get(pid, {})
            level1 = fam.get('FamilyLevel1')
            level2 = fam.get('FamilyLevel2')
            for other_pid, f in self.prod_family.items():
                if other_pid not in available or other_pid in scores:
                    continue
                if (
                    f.get('FamilyLevel1') == level1
                    or f.get('FamilyLevel2') == level2
                ):
                    # only unscored products get here, so the bonus is
                    # granted once and never accumulates
                    scores[other_pid] = 0.3

        # ranking (stable sort: ties keep insertion order)
        ranked = sorted(
            scores.items(),
            key=lambda item: item[1],
            reverse=True
        )
        recs = [pid for pid, _ in ranked[:top_n]]

        # global fallback
        if len(recs) < top_n:
            chosen = set(recs)
            for pid in self._global_ranked:
                if pid in chosen or pid not in available:
                    continue
                recs.append(pid)
                chosen.add(pid)
                if len(recs) >= top_n:
                    break

        return recs[:top_n]



# cold start evaluation on loyal clients

def eval_cold_start(model, clients, transactions, top_n=10, sample=200,
                    seed=None):
    """Estimate hit rate@top_n on loyal clients with purchase history.

    A client is eligible when its segment is ``'LOYAL'`` and it has at
    least three transactions. For every sampled client the products of its
    five earliest transactions are the ground truth; a hit is counted when
    ``model.predict`` returns at least one of them.

    NOTE(review): if *model* was fitted on the same *transactions*, the
    ground truth is part of its training data and the score is optimistic.
    Fit the model without the evaluated purchases for a true cold-start
    estimate.

    Args:
        model: Object exposing ``predict(client_id, top_n=...)``.
        clients: DataFrame with ``ClientID`` and ``ClientSegment`` columns.
        transactions: DataFrame with ``ClientID``, ``ProductID`` and
            ``SaleTransactionDate`` columns.
        top_n: Number of recommendations requested per client.
        sample: Maximum number of clients to evaluate.
        seed: Optional seed for the client sample. ``None`` keeps the
            previous behaviour of drawing from NumPy's global random state.

    Returns:
        Fraction of evaluated clients with at least one hit (0.0 when no
        client could be evaluated). The result is also printed.
    """
    counts = transactions.groupby('ClientID').size()
    multi = counts[counts >= 3].index

    loyal = clients[
        (clients['ClientSegment'] == 'LOYAL') &
        (clients['ClientID'].isin(multi))
    ]['ClientID'].values

    n_test = min(sample, len(loyal))
    if n_test <= 0:
        test_clients = loyal[:0]
    elif seed is None:
        test_clients = np.random.choice(loyal, n_test, replace=False)
    else:
        test_clients = np.random.default_rng(seed).choice(
            loyal, n_test, replace=False
        )

    # One pass over the transactions, not one full scan per client.
    # The stable sort keeps file order for same-day purchases.
    truth_by_client = {}
    if len(test_clients):
        first_purchases = (
            transactions[transactions['ClientID'].isin(test_clients)]
            .sort_values('SaleTransactionDate', kind='stable')
            .groupby('ClientID', sort=False)
            .head(5)
        )
        truth_by_client = {
            cid: bought.unique()
            for cid, bought in first_purchases.groupby('ClientID')['ProductID']
        }

    hits = []
    for cid in test_clients:
        ground_truth = truth_by_client.get(cid)
        if ground_truth is None or len(ground_truth) == 0:
            continue

        preds = set(model.predict(cid, top_n=top_n))
        hits.append(int(any(p in preds for p in ground_truth)))

    hit_rate = np.mean(hits) if hits else 0.0
    print(
        f"\nhit rate@{top_n}: "
        f"{hit_rate:.3f} ({hit_rate*100:.1f}%) "
        f"on {len(hits)} test clients"
    )
    return hit_rate


# main function

def main():
    """Run the demo end to end and return ``(model, hit_rate)``.

    Loads the data, fits the recommender, prints hit rate@10 on 200 sampled
    loyal clients, then prints the top 10 recommendations for one randomly
    drawn prospect.
    """
    print("\ncold start recommender")

    print("loading data...")
    clients, products, transactions, _stores, stocks = load_data()
    n_clients, n_transactions = len(clients), len(transactions)
    print(f"   {n_clients:,} clients")
    print(f"   {n_transactions:,} transactions")

    print("\nbuilding model...")
    model = FastRecommenderImproved(clients, products, transactions, stocks)
    print("   model ready")

    hit_rate = eval_cold_start(
        model, clients, transactions, top_n=10, sample=200
    )

    # pick one prospect at random for a qualitative look
    is_prospect = clients['ClientSegment'] == 'PROSPECT'
    prospect = clients[is_prospect].sample(1).iloc[0]
    prospect_id = prospect['ClientID']

    print(f"\nexample recommendations for prospect {prospect_id}:")
    print(
        f"country: {prospect['ClientCountry']}, "
        f"gender: {prospect['ClientGender']}, "
        f"age: {prospect['Age']}"
    )

    for rank, pid in enumerate(model.predict(prospect_id, top_n=10), 1):
        matches = products[products['ProductID'] == pid]
        if matches.empty:
            # unknown product id: skip the line but keep the rank number
            continue
        details = matches.iloc[0]
        print(f"   {rank}. {details['Category']} - {details['FamilyLevel2']}")

    print("\nfinished")
    return model, hit_rate


# Run the demo only when executed as a script or notebook cell, keeping the
# fitted model and its hit rate in module-level names for later inspection.
if __name__ == "__main__":
    model, hit_rate = main()



cold start recommender
loading data...
   424,037 clients
   1,177,175 transactions

building model...
   model ready

hit rate@10: 0.135 (13.5%) on 200 test clients

example recommendations for prospect 3766080025391802322:
country: FRA, gender: F, age: nan
   1. Football - Puma Third Jersey
   2. Basketball - Spalding NBA Official Game Ball
   3. Football - Nike Dri-FIT
   4. Football - Nike Ordem V
   5. Football - Puma Future
   6. Basketball - Wilson Evolution Basketball
   7. Football - Adidas Squadra 21
   8. Tennis - Wilson US Open
   9. Tennis - Wilson US Open
   10. Football - Puma Third Jersey

finished
