In [None]:
!pip install torch torch-geometric pandas numpy scikit-learn scipy flask

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch_geometric.nn import LGConv
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle
import json
import os
from datetime import datetime, timedelta
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')
import random
from datasets import load_dataset

In [None]:
ds_small = load_dataset(
    "mercari-us/merrec",
    split="train[:500000]",
    columns=[ "item_id", "name"]
)
df = pd.DataFrame(ds_small)
print(df.head())


Resolving data files:   0%|          | 0/2170 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/2170 [00:00<?, ?files/s]

20230601/000000000059.parquet:   0%|          | 0.00/76.8M [00:00<?, ?B/s]

20230601/000000000060.parquet:   0%|          | 0.00/71.3M [00:00<?, ?B/s]

20230601/000000000061.parquet:   0%|          | 0.00/71.8M [00:00<?, ?B/s]

20230601/000000000062.parquet:   0%|          | 0.00/73.1M [00:00<?, ?B/s]

20230601/000000000063.parquet:   0%|          | 0.00/73.6M [00:00<?, ?B/s]

20230601/000000000064.parquet:   0%|          | 0.00/76.9M [00:00<?, ?B/s]

20230601/000000000065.parquet:   0%|          | 0.00/74.7M [00:00<?, ?B/s]

20230601/000000000066.parquet:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

20230601/000000000067.parquet:   0%|          | 0.00/71.8M [00:00<?, ?B/s]

20230601/000000000068.parquet:   0%|          | 0.00/71.3M [00:00<?, ?B/s]

20230601/000000000069.parquet:   0%|          | 0.00/78.0M [00:00<?, ?B/s]

20230601/000000000070.parquet:   0%|          | 0.00/75.3M [00:00<?, ?B/s]

20230601/000000000071.parquet:   0%|          | 0.00/73.0M [00:00<?, ?B/s]

20230601/000000000072.parquet:   0%|          | 0.00/72.9M [00:00<?, ?B/s]

20230601/000000000073.parquet:   0%|          | 0.00/82.2M [00:00<?, ?B/s]

20230601/000000000074.parquet:   0%|          | 0.00/76.1M [00:00<?, ?B/s]

20230601/000000000075.parquet:   0%|          | 0.00/86.2M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
ds_stream = load_dataset(
    "mercari-us/merrec",
    split="train",
    streaming=True
)

# İlk 500k kayıt item_id ve name kolonlarıyla al
from itertools import islice
sample = list(islice(ds_stream, 500_000))
df = pd.DataFrame(sample)[["user_id","item_id", "name"]]
df_items_unique = df.drop_duplicates(subset=["item_id"])
print(df.head())
print(df_items_unique.head())
print("Toplam unique item sayısı:", len(df_items_unique))

README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/2170 [00:00<?, ?it/s]

   user_id    item_id                                    name
0       45  238357102      Kaleidos Comet Catcher Highlighter
1       50  153080686   Gourmia French Door Xl Air Fryer Oven
2       56  228645576             Gold Heart Pendant Necklace
3       56   90449801     Lululemon Hotty Hot Shorts Low Rise
4       79  192635445  Nintendo 3DS XL PIKACHU YELLOW EDITION
   user_id    item_id                                    name
0       45  238357102      Kaleidos Comet Catcher Highlighter
1       50  153080686   Gourmia French Door Xl Air Fryer Oven
2       56  228645576             Gold Heart Pendant Necklace
3       56   90449801     Lululemon Hotty Hot Shorts Low Rise
4       79  192635445  Nintendo 3DS XL PIKACHU YELLOW EDITION
Toplam unique item sayısı: 360091


In [None]:
df_items_unique.to_csv("merrec_items.csv", index=False)


In [None]:
from google.colab import drive
drive.mount('/content/drive')
df_items_unique.to_csv("/content/drive/MyDrive/merrec_items.csv", index=False)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df_items = df[["item_id", "name"]]

df_items_unique = df_items.drop_duplicates(subset=["item_id"])

print(df_items_unique.head())
print("Toplam unique item sayısı:", len(df_items_unique))

In [None]:
df.to_csv("merrec_sample.csv", index=False)


# Main Recommendation System Class

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

class LightGCN(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=64, num_layers=3):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.convs = nn.ModuleList([nn.Linear(embedding_dim, embedding_dim) for _ in range(num_layers)])

    def forward(self, edge_index=None):
        user_emb = self.user_embedding.weight
        item_emb = self.item_embedding.weight
        all_emb = torch.cat([user_emb, item_emb], dim=0)
        embs = [all_emb]
        for conv in self.convs:
            all_emb = conv(all_emb)
            embs.append(all_emb)
        final_emb = torch.stack(embs, dim=0).mean(dim=0)
        return final_emb[:self.num_users], final_emb[self.num_users:]

    def predict(self, users, items, user_emb, item_emb):
        return torch.sum(user_emb[users] * item_emb[items], dim=1)


class RecommendationSystem:
    def __init__(self, df):
        self.df = df.copy()
        self.user_encoder = LabelEncoder()
        self.item_encoder = LabelEncoder()
        self.model = None
        self.edge_index = None
        self.inter_dict = {}
        self.num_users = 0
        self.num_items = 0

    def prepare_data(self, test_size=0.2):
        print("Veri hazırlama başlıyor...")

        # Encode
        self.df['user_id_enc'] = self.user_encoder.fit_transform(self.df['user_id'])
        self.df['item_id_enc'] = self.item_encoder.fit_transform(self.df['item_id'])

        # Store dimensions
        self.num_users = self.df['user_id_enc'].nunique()
        self.num_items = self.df['item_id_enc'].nunique()

        print(f"Toplam kullanıcı: {self.num_users}, Toplam ürün: {self.num_items}")

        # Minimum 2 etkileşim filtreleme
        user_counts = self.df['user_id_enc'].value_counts()
        valid_users = user_counts[user_counts >= 2].index
        self.df = self.df[self.df['user_id_enc'].isin(valid_users)]

        print(f"2+ etkileşimi olan kullanıcı sayısı: {len(valid_users)}")
        print(f"Filtreleme sonrası etkileşim sayısı: {len(self.df)}")

        # Kullanıcı -> ürün sözlüğü
        self.inter_dict = self.df.groupby('user_id_enc')['item_id_enc'].apply(list).to_dict()

        # Train-test split
        train_df, test_df = train_test_split(
            self.df, test_size=test_size, random_state=42, stratify=self.df['user_id_enc']
        )

        # Edge index for graph
        train_user_item = torch.stack([
            torch.tensor(train_df['user_id_enc'].values, dtype=torch.long),
            torch.tensor(train_df['item_id_enc'].values + self.num_users, dtype=torch.long)  # Offset items
        ])

        train_item_user = torch.stack([
            torch.tensor(train_df['item_id_enc'].values + self.num_users, dtype=torch.long),  # Offset items
            torch.tensor(train_df['user_id_enc'].values, dtype=torch.long)
        ])

        self.edge_index = torch.cat([train_user_item, train_item_user], dim=1)
        self.data = Data(edge_index=self.edge_index)

        return {
            'num_users': self.num_users,
            'num_items': self.num_items,
            'train_edges': self.edge_index,
            'test_edges': None
        }

    def _bpr_loss(self, user_emb, item_emb, batch_size=1024):
        device = user_emb.device

        valid_users = list(self.inter_dict.keys())
        # Valid user indices should be within bounds
        valid_users = [u for u in valid_users if u < self.num_users]

        if len(valid_users) == 0:
            return torch.tensor(0.0, device=device, requires_grad=True)

        batch_size = min(batch_size, len(valid_users))
        users_np = np.random.choice(valid_users, batch_size, replace=True)
        users = torch.tensor(users_np, device=device, dtype=torch.long)

        # Pozitif item seçimi
        pos_items_np = []
        for u in users_np:
            # Valid items within item embedding bounds
            valid_items = [i for i in self.inter_dict[u] if i < self.num_items]
            if len(valid_items) == 0:
                valid_items = [random.randint(0, self.num_items-1)]
            pos_items_np.append(random.choice(valid_items))
        pos_items = torch.tensor(pos_items_np, device=device, dtype=torch.long)

        # Negatif item seçimi - FIX: Ensure within bounds
        neg_items_np = []
        for u in users_np:
            max_attempts = 10
            attempts = 0
            while attempts < max_attempts:
                neg_i = random.randint(0, self.num_items-1)
                if neg_i not in self.inter_dict[u]:
                    break
                attempts += 1
            # Fallback: just use a random valid item if we can't find negative
            if attempts == max_attempts:
                neg_i = random.randint(0, self.num_items-1)
            neg_items_np.append(neg_i)
        neg_items = torch.tensor(neg_items_np, device=device, dtype=torch.long)

        # Ensure indices are within bounds
        users = torch.clamp(users, 0, self.num_users-1)
        pos_items = torch.clamp(pos_items, 0, self.num_items-1)
        neg_items = torch.clamp(neg_items, 0, self.num_items-1)

        pos_scores = self.model.predict(users, pos_items, user_emb, item_emb)
        neg_scores = self.model.predict(users, neg_items, user_emb, item_emb)

        loss = -F.logsigmoid(pos_scores - neg_scores).mean()
        return loss

    def train_model(self, embedding_dim=64, num_layers=3, epochs=50, lr=0.001, batch_size=1024):
        # Force CPU to avoid CUDA issues
        torch.backends.cudnn.enabled = False
        device = torch.device('cpu')
        torch.cuda.set_device(-1)  # Disable CUDA completely

        self.model = LightGCN(self.num_users, self.num_items, embedding_dim, num_layers)
        # Ensure model is on CPU
        self.model = self.model.cpu()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

        print(f"Model oluşturuldu - Users: {self.num_users}, Items: {self.num_items}")
        print("Eğitim başlıyor...")

        for epoch in range(epochs):
            self.model.train()
            optimizer.zero_grad()
            user_emb, item_emb = self.model()
            loss = self._bpr_loss(user_emb, item_emb, batch_size)
            loss.backward()
            optimizer.step()

            if epoch % 10 == 0:
                print(f"Epoch {epoch:3d}/{epochs}, Loss: {loss.item():.4f}")

        self.model.eval()
        print("Eğitim tamamlandı!")
        return self.model

    def get_recommendations(self, user_id_original, k=5):
        """Orijinal user_id için öneri döndür"""
        if self.model is None:
            raise ValueError("Model henüz eğitilmedi!")

        device = torch.device('cpu')

        # Encode user_id
        try:
            user_encoded = self.user_encoder.transform([user_id_original])[0]
        except ValueError:
            print(f"Kullanıcı {user_id_original} eğitim setinde bulunamadı!")
            return []

        if user_encoded >= self.num_users:
            print(f"Encoded user ID {user_encoded} model boyutunu aşıyor!")
            return []

        # Get embeddings
        user_emb, item_emb = self.model()
        user_emb = user_emb.cpu()
        item_emb = item_emb.cpu()

        # Calculate scores
        user_vec = user_emb[user_encoded].unsqueeze(0)
        scores = torch.matmul(user_vec, item_emb.t()).squeeze().detach().numpy()

        # Remove interacted items
        if user_encoded in self.inter_dict:
            interacted_items = set(self.inter_dict[user_encoded])
            for i in interacted_items:
                if i < len(scores):
                    scores[i] = -np.inf

        # Get top-k
        top_k_idx = np.argsort(scores)[::-1][:k]

        recommendations = []
        for idx in top_k_idx:
            if idx < self.num_items:
                original_id = self.item_encoder.inverse_transform([idx])[0]
                score = scores[idx]
                if score > -np.inf:
                    recommendations.append({
                        'item_id': original_id,
                        'score': float(score)
                    })

        return recommendations


In [None]:
def main(df):
    """Ana pipeline"""
    print("LightGCN Öneri Sistemi - Pipeline Başlatılıyor")
    print("=" * 60)

    # Sistem oluştur
    recommender = RecommendationSystem(df)

    # 1. Veri hazırlama
    print("\n1. VERİ HAZIRLAMA")
    print("-" * 30)
    data_info = recommender.prepare_data(test_size=0.2)

    print(f"Hazırlama tamamlandı:")
    print(f"   - Kullanıcı sayısı: {data_info['num_users']:,}")
    print(f"   - Ürün sayısı: {data_info['num_items']:,}")
    print(f"   - Eğitim kenarı: {data_info['train_edges'].shape[1]:,}")

    # 2. Model eğitimi
    print(f"\n2. MODEL EĞİTİMİ")
    print("-" * 30)
    recommender.train_model(
        embedding_dim=64,  # Smaller for faster training
        num_layers=2,      # Fewer layers
        epochs=100,         # Fewer epochs for demo
        lr=0.001,
        batch_size=512     # Smaller batch
    )

    # 3. Örnek öneriler
    print(f"\n3. ÖRNEK ÖNERİLER")
    print("-" * 30)

    # Sample user from original IDs
    sample_original_user = df['user_id'].iloc[0]
    print(f"Örnek kullanıcı (orijinal ID): {sample_original_user}")

    recommendations = recommender.get_recommendations(sample_original_user, k=5)

    if recommendations:
        print(f"\n🌟 Top-5 önerilen ürün:")
        for i, rec in enumerate(recommendations, 1):
            print(f"   {i}. Product ID: {rec['item_id']} | Skor: {rec['score']:.3f}")
    else:
        print("Bu kullanıcı için öneri bulunamadı")

    print(f"\nPipeline tamamlandı!")
    return recommender


In [None]:
# Test etmek için örnek veri
if __name__ == "__main__":
    print("Test verisi ile çalışılıyor...")
    recommender = main(df)

Test verisi ile çalışılıyor...
LightGCN Öneri Sistemi - Pipeline Başlatılıyor

1. VERİ HAZIRLAMA
------------------------------
Veri hazırlama başlıyor...
Toplam kullanıcı: 7494, Toplam ürün: 360091
2+ etkileşimi olan kullanıcı sayısı: 6435
Filtreleme sonrası etkileşim sayısı: 498941
Hazırlama tamamlandı:
   - Kullanıcı sayısı: 7,494
   - Ürün sayısı: 360,091
   - Eğitim kenarı: 798,304

2. MODEL EĞİTİMİ
------------------------------
Model oluşturuldu - Users: 7494, Items: 360091
Eğitim başlıyor...
Epoch   0/100, Loss: 1.2099
Epoch  10/100, Loss: 1.0158
Epoch  20/100, Loss: 1.0642
Epoch  30/100, Loss: 1.0772
Epoch  40/100, Loss: 0.9628
Epoch  50/100, Loss: 0.9581
Epoch  60/100, Loss: 0.8488
Epoch  70/100, Loss: 0.8737
Epoch  80/100, Loss: 0.9083
Epoch  90/100, Loss: 0.8588
Eğitim tamamlandı!

3. ÖRNEK ÖNERİLER
------------------------------
Örnek kullanıcı (orijinal ID): 45

🌟 Top-5 önerilen ürün:
   1. Product ID: 164799885 | Skor: 4.143
   2. Product ID: 23607574 | Skor: 4.059
   3.

In [None]:
torch.save(recommender.model.state_dict(), "lightgcnv2_model.pt")
print("Model kaydedildi: lightgcnv2_model.pt")

Model kaydedildi: lightgcnv2_model.pt


In [None]:
rec_sys.model = loaded_model  # RecommendationSystem içinde modeli değiştir
recommendations = rec_sys.get_recommendations(user_id_original=123, k=5)
print(recommendations)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Modeli Drive içine kaydet
torch.save(recommender.model.state_dict(), "/content/drive/MyDrive/lightgcnv2_model.pt")
print("Model Google Drive'a kaydedildi!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model Google Drive'a kaydedildi!
