In [1]:
import gzip
import json
from collections import Counter
import pandas as pd

In [2]:

# def get_shared_users(source_path, target_path):
#     print("Kaynak domain kullanıcıları taranıyor...")
#     source_users = set()
#     with gzip.open(source_path, 'rt', encoding='utf-8') as f:
#         for line in f:
#             data = json.loads(line)
#             source_users.add(data['user_id'])
            
#     print("Hedef domain kullanıcıları ve çakışmalar taranıyor...")
#     shared_users = set()
#     with gzip.open(target_path, 'rt', encoding='utf-8') as f:
#         for line in f:
#             data = json.loads(line)
#             if data['user_id'] in source_users:
#                 shared_users.add(data['user_id'])
    
#     print(f"Toplam Ortak Kullanıcı Sayısı: {len(shared_users)}")
#     return shared_users

# # Dosya yollarını güncelleyin
# source_domain_file = "Books.jsonl.gz"
# target_domain_file = "Electronics.jsonl.gz"

# shared_users = get_shared_users(source_domain_file, target_domain_file)

In [3]:


# def filter_and_save_data(path, shared_users, output_name, min_inter=20): # min_iter=5
#     extracted_data = []
#     user_counts = Counter()
    
#     print(f"{output_name} işleniyor...")
#     with gzip.open(path, 'rt', encoding='utf-8') as f:
#         for line in f:
#             data = json.loads(line)
#             # Sadece ortak kullanıcıları veya hedef domaindeki etkileşimleri al
#             if data['user_id'] in shared_users:
#                 extracted_data.append({
#                     'user_id': data['user_id'],
#                     'item_id': data['parent_asin'], # Amazon 2023'te parent_asin kullanımı önerilir
#                     'rating': data['rating'],
#                     'timestamp': data['timestamp']
#                 })
#                 user_counts[data['user_id']] += 1
                
#     df = pd.DataFrame(extracted_data)
    
#     # User-activity filter: keep only users with at least `min_inter` interactions (default 20, not 5; items are not filtered)
#     df = df[df.groupby('user_id')['user_id'].transform('count') >= min_inter]
    
#     df.to_csv(f"{output_name}_filtered.csv", index=False)
#     print(f"{output_name} kaydedildi. Satır sayısı: {len(df)}")

# # Kullanım:
# filter_and_save_data(source_domain_file, shared_users, "source_books")
# filter_and_save_data(target_domain_file, shared_users, "target_electronics")

In [4]:
def analyze_sparsity(csv_path):
    """Print user/item/interaction counts and the sparsity of an interaction CSV.

    Sparsity is reported as the percentage of empty cells in the user x item
    matrix: ``(1 - interactions / (users * items)) * 100``.

    Args:
        csv_path: Path of a CSV file with ``user_id`` and ``item_id`` columns,
            one row per interaction.

    Returns:
        None. The summary is printed to stdout.
    """
    df = pd.read_csv(csv_path)
    u = df['user_id'].nunique()
    i = df['item_id'].nunique()
    r = len(df)
    print(f"\n--- {csv_path} Analizi ---")
    print(f"Kullanıcı: {u} | Ürün: {i} | Etkileşim: {r}")

    matrix_cells = u * i
    if matrix_cells == 0:
        # No users or no items: the ratio is undefined, and dividing would raise
        # ZeroDivisionError, so report that instead of crashing.
        print("Sparsity: tanımsız (veri boş)")
        return

    sparsity = (1 - (r / matrix_cells)) * 100
    print(f"Sparsity: %{sparsity:.4f}")

# Report the sparsity of both filtered interaction files: source (books) first,
# then target (electronics). Both CSVs must already exist in the working directory.
analyze_sparsity("source_books_filtered.csv")
analyze_sparsity("target_electronics_filtered.csv")


--- source_books_filtered.csv Analizi ---
Kullanıcı: 88749 | Ürün: 1459222 | Etkileşim: 4287071
Sparsity: %99.9967

--- target_electronics_filtered.csv Analizi ---
Kullanıcı: 80650 | Ürün: 489371 | Etkileşim: 2801916
Sparsity: %99.9929


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, HeteroConv
from torch.autograd import Function

# 1. Gradient Reversal Layer (GRL) - Adversarial Learning için
class GRL(Function):
    """Gradient reversal layer for adversarial training.

    The forward pass returns the input unchanged; the backward pass negates the
    incoming gradient and scales it by ``alpha``.
    """

    @staticmethod
    def forward(ctx, x, alpha):
        # Remember the scale so backward() can apply it.
        ctx.alpha = alpha
        passthrough = x.view_as(x)
        return passthrough

    @staticmethod
    def backward(ctx, grad_output):
        # Flip the sign of the gradient, then scale it by alpha.
        reversed_grad = grad_output.neg().mul(ctx.alpha)
        # Second slot corresponds to `alpha`; no gradient is returned for it.
        return reversed_grad, None

# 2. GAT-Based Encoder (Niyet Yakalayıcı)
class DualIntentEncoder(nn.Module):
    """Two-layer heterogeneous GAT encoder over the user-item interaction graph.

    Each layer holds one ``GATConv`` per edge type (user -> item and the reverse
    item -> user) wrapped in a ``HeteroConv`` with mean aggregation.

    Args:
        hidden_channels: Per-head output size of the first layer.
        out_channels: Output embedding size of the second (single-head) layer.
        heads: Number of attention heads in the first layer.
    """

    def __init__(self, hidden_channels, out_channels, heads=4):
        super().__init__()
        # Layer 1: multi-head attention. in_channels=(-1, -1) asks PyG to infer the
        # source/destination feature sizes lazily on the first forward pass.
        self.conv1 = HeteroConv({
            ('user', 'interacts', 'item'): GATConv((-1, -1), hidden_channels, heads=heads, add_self_loops=False),
            ('item', 'rev_interacts', 'user'): GATConv((-1, -1), hidden_channels, heads=heads, add_self_loops=False),
        }, aggr='mean')

        # Layer 2: output layer. The input width is again inferred lazily, so the
        # sentinel must be exactly -1 (the documented lazy-init value). The previous
        # `-1 * heads` produced -4, which is not a documented in_channels value.
        self.conv2 = HeteroConv({
            ('user', 'interacts', 'item'): GATConv((-1, -1), out_channels, heads=1, add_self_loops=False),
            ('item', 'rev_interacts', 'user'): GATConv((-1, -1), out_channels, heads=1, add_self_loops=False),
        }, aggr='mean')

    def forward(self, x_dict, edge_index_dict):
        """Encode all node types.

        Args:
            x_dict: Mapping of node type to its feature tensor.
            edge_index_dict: Mapping of edge type to its edge index tensor.

        Returns:
            Mapping of node type to embeddings, i.e.
            ``{'user': [N, out_dim], 'item': [M, out_dim]}``.
        """
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.elu(x) for key, x in x_dict.items()}
        x_dict = self.conv2(x_dict, edge_index_dict)
        return x_dict

# 3. Domain Discriminator (Domain Bağımsız Özellik Öğrenme)
class DomainDiscriminator(nn.Module):
    """Small MLP that scores which domain an embedding comes from.

    The input first passes through the gradient reversal layer, so gradients that
    flow back to whatever produced the embeddings are negated and scaled by
    ``alpha``. The final sigmoid keeps the output in (0, 1).

    Args:
        emb_dim: Size of the incoming embeddings.
    """

    def __init__(self, emb_dim):
        super().__init__()
        stages = [
            nn.Linear(emb_dim, 64),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x, alpha):
        """Return one domain score per row of ``x``."""
        reversed_input = GRL.apply(x, alpha)
        return self.net(reversed_input)

# 4. Contrastive Alignment Loss (InfoNCE)
def info_nce_loss(z_source, z_target, shared_users, temperature=0.1):
    """Symmetric InfoNCE loss aligning the two domains through their shared users.

    Row ``k`` of ``shared_users`` pairs a source embedding with the target
    embedding of the same user; every other pair in the batch is a negative.

    Args:
        z_source: User embeddings of the source domain, indexed by row.
        z_target: User embeddings of the target domain, indexed by row.
        shared_users: Long tensor of shape ``[shared_count, 2]`` holding
            ``[source_idx, target_idx]`` pairs.
        temperature: Softmax temperature applied to the cosine similarities.

    Returns:
        Scalar tensor: mean of the source->target and target->source
        cross-entropy terms. With no shared users the result is a zero that is
        still attached to the autograd graph.
    """
    if shared_users.numel() == 0:
        # Cross-entropy over zero rows is NaN (mean of nothing), which would poison
        # the total loss. Return a zero that still supports .backward().
        return (z_source.sum() + z_target.sum()) * 0.0

    z_s = F.normalize(z_source[shared_users[:, 0]], dim=1)
    z_t = F.normalize(z_target[shared_users[:, 1]], dim=1)

    # Similarity matrix (N x N); the diagonal holds the positive pairs.
    logits = torch.matmul(z_s, z_t.T) / temperature
    labels = torch.arange(z_s.size(0), device=z_s.device)

    # Symmetric loss (S -> T and T -> S)
    loss_i2j = F.cross_entropy(logits, labels)
    loss_j2i = F.cross_entropy(logits.T, labels)

    return (loss_i2j + loss_j2i) / 2

# 5. Model Birleştirici (Main Wrapper)
class CDRModel(nn.Module):
    """Wrapper bundling the graph encoder, the domain discriminator and a rating head.

    The class defines no ``forward``; callers use ``encoder``, ``discriminator``
    and ``predict`` directly.

    Args:
        emb_dim: Size of the user/item embeddings produced by the encoder.
    """

    def __init__(self, emb_dim):
        super().__init__()
        # The encoder's hidden width is fixed at 32 here.
        self.encoder = DualIntentEncoder(32, emb_dim)
        self.discriminator = DomainDiscriminator(emb_dim)
        # Simple MLP for rating prediction from a concatenated (user, item) pair.
        self.predictor = nn.Sequential(
            nn.Linear(emb_dim * 2, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def predict(self, user_emb, item_emb, edges):
        """Predict one rating per (user, item) edge.

        Args:
            user_emb: User embedding matrix, indexed by ``edges[0]``.
            item_emb: Item embedding matrix, indexed by ``edges[1]``.
            edges: Index tensor whose first row holds user indices and whose
                second row holds item indices.

        Returns:
            1-D tensor with one prediction per edge.
        """
        u_idx, i_idx = edges[0], edges[1]
        cat_feat = torch.cat([user_emb[u_idx], item_emb[i_idx]], dim=1)
        # Drop only the trailing feature axis. A bare squeeze() would also drop the
        # batch axis for a single-edge batch and return a 0-d tensor that no longer
        # matches the shape of the labels.
        return self.predictor(cat_feat).squeeze(-1)

In [6]:
import pandas as pd

# Load the previously filtered interaction files back from disk.
print("Veriler yükleniyor...")
books_df = pd.read_csv("source_books_filtered.csv")
elec_df = pd.read_csv("target_electronics_filtered.csv")



Veriler yükleniyor...


In [7]:
import torch
import pandas as pd

def create_shared_user_mapping(source_df, target_df):
    """Index the users of both domains and pair up the ones present in both.

    Args:
        source_df: Source-domain interactions with a ``user_id`` column.
        target_df: Target-domain interactions with a ``user_id`` column.

    Returns:
        A tuple ``(shared, s_user2idx, t_user2idx)`` where

        * ``shared`` is a long tensor of shape ``[shared_count, 2]`` holding
          ``[source_idx, target_idx]`` pairs, ordered by source index;
        * ``s_user2idx`` / ``t_user2idx`` map each user id to its row index in
          the respective domain, in order of first appearance.
    """
    # Unique users of each domain, in order of first appearance.
    source_users = source_df['user_id'].unique()
    target_users = target_df['user_id'].unique()

    # user id -> contiguous index
    s_user2idx = {uid: idx for idx, uid in enumerate(source_users)}
    t_user2idx = {uid: idx for idx, uid in enumerate(target_users)}

    # Walk the source users in index order rather than iterating a set
    # intersection: set order over string ids changes between interpreter runs,
    # which made the pair order non-reproducible.
    shared_mapping = [
        [s_idx, t_user2idx[uid]]
        for uid, s_idx in s_user2idx.items()
        if uid in t_user2idx
    ]

    # reshape keeps the documented [N, 2] layout even when no user is shared
    # (torch.tensor([]) alone would have shape [0]).
    shared = torch.tensor(shared_mapping, dtype=torch.long).reshape(-1, 2)
    return shared, s_user2idx, t_user2idx

# Usage:
# Build the shared-user index pairs and the per-domain user-id -> index maps.
shared_idx_tensor, s_map, t_map = create_shared_user_mapping(books_df, elec_df)
print(f"Eşleşme tamamlandı. Ortak kullanıcı tensör boyutu: {shared_idx_tensor.shape}")

Eşleşme tamamlandı. Ortak kullanıcı tensör boyutu: torch.Size([7648, 2])


In [8]:
from torch_geometric.data import HeteroData

def build_graph(df, user_map, item_name='item'):
    """Build a bipartite user-item ``HeteroData`` graph from an interaction frame.

    Args:
        df: Interactions with ``user_id``, ``item_id`` and ``rating`` columns,
            one row per edge.
        user_map: Mapping of user id to user node index. Every ``user_id`` in
            ``df`` must be a key.
        item_name: Currently unused; kept so existing calls keep working.

    Returns:
        ``HeteroData`` with random 16-dimensional ``x`` features for ``'user'``
        and ``'item'`` nodes, ``('user', 'interacts', 'item')`` edges carrying the
        rating as ``edge_attr``, and the reversed
        ``('item', 'rev_interacts', 'user')`` edges.

    Raises:
        KeyError: If a ``user_id`` in ``df`` is missing from ``user_map``.
        ValueError: If ``item_id`` contains missing values.
    """
    data = HeteroData()

    # Vectorized id -> index lookup instead of one Python dict lookup per row.
    user_codes = df['user_id'].map(user_map)
    unmapped = user_codes.isna()
    if unmapped.any():
        raise KeyError(df.loc[unmapped, 'user_id'].iloc[0])

    # factorize numbers items in order of first appearance, which is the same
    # numbering as enumerate(df['item_id'].unique()).
    item_codes, items = pd.factorize(df['item_id'])
    if (item_codes < 0).any():
        raise ValueError("item_id contains missing values")

    # Build the edges: row 0 = user indices, row 1 = item indices.
    edge_index = torch.stack([
        torch.tensor(user_codes.to_numpy(dtype='int64'), dtype=torch.long),
        torch.tensor(item_codes, dtype=torch.long),
    ])
    edge_attr = torch.tensor(df['rating'].values, dtype=torch.float)

    # Placeholder node features: random and unseeded, so they differ on every call.
    data['user'].x = torch.randn(len(user_map), 16)
    data['item'].x = torch.randn(len(items), 16)

    data['user', 'interacts', 'item'].edge_index = edge_index
    data['user', 'interacts', 'item'].edge_attr = edge_attr

    # Reverse edges so messages can also flow from items back to users.
    data['item', 'rev_interacts', 'user'].edge_index = edge_index[[1, 0]]

    return data

# Build one user-item graph per domain, reusing the user index maps created above.
books_graph = build_graph(books_df, s_map)
elec_graph = build_graph(elec_df, t_map)

In [9]:
import torch
import torch.optim as optim

# 1. Data preparation.
# NOTE(review): this repeats the load / mapping / graph-building steps of the cells
# above. build_graph draws node features with torch.randn, so re-running this cell
# replaces the graphs with new random features.
print("Veriler CSV'den yükleniyor...")
books_df = pd.read_csv("source_books_filtered.csv")
elec_df = pd.read_csv("target_electronics_filtered.csv")

# Build the shared-user mapping
shared_idx_tensor, s_map, t_map = create_shared_user_mapping(books_df, elec_df)

# Build the graphs with the build_graph helper
books_graph = build_graph(books_df, s_map)
elec_graph = build_graph(elec_df, t_map)

# 2. LinkNeighborLoader: mini-batch sampler so the large graphs need not be processed whole.
# Both loaders use batch_size=2048 and num_neighbors=[10, 5], i.e. two sampling hops with
# up to 10 and then 5 neighbours. Per the PyG documentation the batch size counts labelled
# edges (entries of edge_label_index), not users -- TODO confirm against the installed version.
from torch_geometric.loader import LinkNeighborLoader

# Source domain (Books)
train_loader_s = LinkNeighborLoader(
    books_graph,
    num_neighbors=[10, 5], 
    batch_size=2048, # larger batches were chosen to improve GPU parallelism and shorten epochs
    edge_label_index=(('user', 'interacts', 'item'), books_graph['user', 'interacts', 'item'].edge_index),
    edge_label=books_graph['user', 'interacts', 'item'].edge_attr,
    shuffle=True,
    num_workers=4, # sample batches in 4 worker processes
    persistent_workers=True # keep the workers alive between epochs instead of restarting them
)
# Earlier variant kept for reference (smaller batch, no worker processes):
# train_loader_s = LinkNeighborLoader(
#     books_graph,
#     num_neighbors=[10, 5], # [10, 5] was preferred over [15, 10] for speed on this data size
#     batch_size=1024,
#     edge_label_index=(('user', 'interacts', 'item'), books_graph['user', 'interacts', 'item'].edge_index),
#     edge_label=books_graph['user', 'interacts', 'item'].edge_attr,
#     shuffle=True,
#     # Added to work around a 'neighbor_sampler' library error:
#     subgraph_type='induced' 
# )

# Target domain (Electronics)
# NOTE(review): despite the name, this loader is unshuffled; it is consumed both by
# train() (as the target-domain batches) and by evaluate_model().
train_loader_t = LinkNeighborLoader(
    elec_graph,
    num_neighbors=[10, 5],
    batch_size=2048, # same batch size as the source loader
    edge_label_index=(('user', 'interacts', 'item'), elec_graph['user', 'interacts', 'item'].edge_index),
    # Ratings are attached as edge labels so batches expose `edge_label`:
    edge_label=elec_graph['user', 'interacts', 'item'].edge_attr, 
    shuffle=False, # fixed batch order
    subgraph_type='induced'
)

Veriler CSV'den yükleniyor...


In [10]:
# Use the GPU when available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Fresh model with 64-dimensional embeddings.
# NOTE(review): the encoder's GATConv layers infer their input size lazily; PyG suggests
# one dummy forward pass before building the optimizer -- confirm whether that is needed here.
model = CDRModel(emb_dim=64).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

def train(alpha=0.1, adv_weight=0.1, log_every=100):
    """Run one training epoch over the source loader and return the mean batch loss.

    Uses the notebook-level ``model``, ``optimizer``, ``device``,
    ``train_loader_s`` and ``train_loader_t``. Each step combines

    * an MSE rating loss on the labelled source-domain edges, and
    * a domain-adversarial loss on the user embeddings of both domains
      (source labelled 0, target labelled 1), weighted by ``adv_weight``.

    The contrastive alignment loss is not part of this objective.

    Args:
        alpha: Gradient-reversal strength passed to the discriminator.
        adv_weight: Weight of the adversarial term in the total loss.
        log_every: Print the batch loss every ``log_every`` batches.

    Returns:
        Mean of the per-batch total losses.

    Raises:
        RuntimeError: If ``train_loader_s`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    loader_t_iter = iter(train_loader_t)

    for i, batch_s in enumerate(train_loader_s):
        # The epoch length is set by the source loader; restart the target loader
        # whenever it runs out first.
        try:
            batch_t = next(loader_t_iter)
        except StopIteration:
            loader_t_iter = iter(train_loader_t)
            batch_t = next(loader_t_iter)

        batch_s, batch_t = batch_s.to(device), batch_t.to(device)
        optimizer.zero_grad()

        # 1. Forward pass (shared encoder applied to both domains)
        z_s_dict = model.encoder(batch_s.x_dict, batch_s.edge_index_dict)
        z_t_dict = model.encoder(batch_t.x_dict, batch_t.edge_index_dict)

        # 2. Rating prediction loss (source domain only).
        # The edges selected by the loader for this batch are in `edge_label_index`.
        edge_index_s = batch_s['user', 'interacts', 'item'].edge_label_index
        labels_s = batch_s['user', 'interacts', 'item'].edge_label

        preds_s = model.predict(z_s_dict['user'], z_s_dict['item'], edge_index_s)
        loss_task = F.mse_loss(preds_s, labels_s)

        # 3. Adversarial loss (domain invariance)
        d_s = model.discriminator(z_s_dict['user'], alpha)
        d_t = model.discriminator(z_t_dict['user'], alpha)
        loss_adv = F.binary_cross_entropy(d_s, torch.zeros_like(d_s)) + \
                   F.binary_cross_entropy(d_t, torch.ones_like(d_t))

        # Total loss
        loss = loss_task + adv_weight * loss_adv
        loss.backward()
        optimizer.step()

        # Read the scalar once; each .item() call forces a device synchronisation.
        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1

        if i % log_every == 0:
            print(f"Batch {i} | Loss: {batch_loss:.4f}")

    if num_batches == 0:
        # Previously this path failed with an unbound loop variable.
        raise RuntimeError("train_loader_s yielded no batches; nothing to train on")
    return total_loss / num_batches
# Start training: 10 epochs, printing the mean loss after each one.
print(f"Eğitim {device} üzerinde başlıyor...")
for epoch in range(1, 11):
    loss = train()
    print(f"Epoch: {epoch:02d}, Loss: {loss:.4f}")

Eğitim cuda üzerinde başlıyor...
Batch 0 | Loss: 18.8296
Batch 100 | Loss: 1.3756
Batch 200 | Loss: 1.1455
Batch 300 | Loss: 1.0364
Batch 400 | Loss: 1.0725
Batch 500 | Loss: 1.0161
Batch 600 | Loss: 1.0210
Batch 700 | Loss: 0.9843
Batch 800 | Loss: 1.0222
Batch 900 | Loss: 1.0674
Batch 1000 | Loss: 0.9667
Batch 1100 | Loss: 1.0292
Batch 1200 | Loss: 0.9339
Batch 1300 | Loss: 0.9714
Batch 1400 | Loss: 1.0597
Batch 1500 | Loss: 1.0099
Batch 1600 | Loss: 1.0082
Batch 1700 | Loss: 1.0448
Batch 1800 | Loss: 1.1163
Batch 1900 | Loss: 0.9656
Batch 2000 | Loss: 1.0232
Epoch: 01, Loss: 1.2056
Batch 0 | Loss: 1.0502
Batch 100 | Loss: 1.0887
Batch 200 | Loss: 1.0484
Batch 300 | Loss: 1.0881
Batch 400 | Loss: 0.9572
Batch 500 | Loss: 1.0143
Batch 600 | Loss: 1.1152
Batch 700 | Loss: 1.0058
Batch 800 | Loss: 1.1202
Batch 900 | Loss: 1.0685
Batch 1000 | Loss: 1.0590
Batch 1100 | Loss: 1.0345
Batch 1200 | Loss: 1.0157
Batch 1300 | Loss: 1.0140
Batch 1400 | Loss: 0.9649
Batch 1500 | Loss: 0.9275
Batc

In [11]:
# NOTE(review): this cell repeats the previous training cell verbatim. Running it
# re-creates the model and optimizer, so previously trained weights are discarded.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CDRModel(emb_dim=64).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

def train(alpha=0.1, adv_weight=0.1, log_every=100):
    """Run one training epoch over the source loader and return the mean batch loss.

    Uses the notebook-level ``model``, ``optimizer``, ``device``,
    ``train_loader_s`` and ``train_loader_t``. Each step combines

    * an MSE rating loss on the labelled source-domain edges, and
    * a domain-adversarial loss on the user embeddings of both domains
      (source labelled 0, target labelled 1), weighted by ``adv_weight``.

    The contrastive alignment loss is not part of this objective.

    Args:
        alpha: Gradient-reversal strength passed to the discriminator.
        adv_weight: Weight of the adversarial term in the total loss.
        log_every: Print the batch loss every ``log_every`` batches.

    Returns:
        Mean of the per-batch total losses.

    Raises:
        RuntimeError: If ``train_loader_s`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    loader_t_iter = iter(train_loader_t)

    for i, batch_s in enumerate(train_loader_s):
        # The epoch length is set by the source loader; restart the target loader
        # whenever it runs out first.
        try:
            batch_t = next(loader_t_iter)
        except StopIteration:
            loader_t_iter = iter(train_loader_t)
            batch_t = next(loader_t_iter)

        batch_s, batch_t = batch_s.to(device), batch_t.to(device)
        optimizer.zero_grad()

        # 1. Forward pass (shared encoder applied to both domains)
        z_s_dict = model.encoder(batch_s.x_dict, batch_s.edge_index_dict)
        z_t_dict = model.encoder(batch_t.x_dict, batch_t.edge_index_dict)

        # 2. Rating prediction loss (source domain only).
        # The edges selected by the loader for this batch are in `edge_label_index`.
        edge_index_s = batch_s['user', 'interacts', 'item'].edge_label_index
        labels_s = batch_s['user', 'interacts', 'item'].edge_label

        preds_s = model.predict(z_s_dict['user'], z_s_dict['item'], edge_index_s)
        loss_task = F.mse_loss(preds_s, labels_s)

        # 3. Adversarial loss (domain invariance)
        d_s = model.discriminator(z_s_dict['user'], alpha)
        d_t = model.discriminator(z_t_dict['user'], alpha)
        loss_adv = F.binary_cross_entropy(d_s, torch.zeros_like(d_s)) + \
                   F.binary_cross_entropy(d_t, torch.ones_like(d_t))

        # Total loss
        loss = loss_task + adv_weight * loss_adv
        loss.backward()
        optimizer.step()

        # Read the scalar once; each .item() call forces a device synchronisation.
        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1

        if i % log_every == 0:
            print(f"Batch {i} | Loss: {batch_loss:.4f}")

    if num_batches == 0:
        # Previously this path failed with an unbound loop variable.
        raise RuntimeError("train_loader_s yielded no batches; nothing to train on")
    return total_loss / num_batches
# Start training: 10 epochs, printing the mean loss after each one.
print(f"Eğitim {device} üzerinde başlıyor...")
for epoch in range(1, 11):
    loss = train()
    print(f"Epoch: {epoch:02d}, Loss: {loss:.4f}")

Eğitim cuda üzerinde başlıyor...
Batch 0 | Loss: 20.1289
Batch 100 | Loss: 1.3885
Batch 200 | Loss: 1.1952
Batch 300 | Loss: 1.0148
Batch 400 | Loss: 1.0395
Batch 500 | Loss: 1.0120
Batch 600 | Loss: 1.0073
Batch 700 | Loss: 0.9665
Batch 800 | Loss: 1.0384
Batch 900 | Loss: 0.9626
Batch 1000 | Loss: 1.0171
Batch 1100 | Loss: 0.9523
Batch 1200 | Loss: 1.0086
Batch 1300 | Loss: 0.9308
Batch 1400 | Loss: 0.9593
Batch 1500 | Loss: 1.0512
Batch 1600 | Loss: 0.8925
Batch 1700 | Loss: 0.9173
Batch 1800 | Loss: 1.0056
Batch 1900 | Loss: 0.9695
Batch 2000 | Loss: 1.0445
Epoch: 01, Loss: 1.2079
Batch 0 | Loss: 1.0094
Batch 100 | Loss: 1.0289
Batch 200 | Loss: 1.0309
Batch 300 | Loss: 1.0079
Batch 400 | Loss: 1.0214
Batch 500 | Loss: 1.1097
Batch 600 | Loss: 0.9554
Batch 700 | Loss: 1.1759
Batch 800 | Loss: 1.0533
Batch 900 | Loss: 1.0367
Batch 1000 | Loss: 1.0112
Batch 1100 | Loss: 1.0153
Batch 1200 | Loss: 1.0820
Batch 1300 | Loss: 1.0622
Batch 1400 | Loss: 1.0280
Batch 1500 | Loss: 1.0956
Batc

In [12]:
def evaluate_model(model, loader, device, max_batches=500):
    """Compute the rating RMSE of ``model`` over (a prefix of) ``loader``.

    Args:
        model: Object exposing ``eval()``, ``encoder(x_dict, edge_index_dict)``
            and ``predict(user_emb, item_emb, edges)``.
        loader: Iterable of batches carrying ``edge_label_index`` and
            ``edge_label`` on the ``('user', 'interacts', 'item')`` edge type.
        device: Device each batch is moved to before the forward pass.
        max_batches: Stop after this many batches; ``None`` evaluates them all.

    Returns:
        RMSE over every evaluated edge, or ``nan`` when no edge was evaluated
        (empty loader or ``max_batches=0``).
    """
    model.eval()
    total_sq_err = 0.0
    total_samples = 0

    with torch.no_grad():
        for i, batch in enumerate(loader):
            # Cap the number of batches so evaluation stays fast.
            if max_batches is not None and i >= max_batches:
                break

            batch = batch.to(device)
            z_dict = model.encoder(batch.x_dict, batch.edge_index_dict)

            edge_index = batch['user', 'interacts', 'item'].edge_label_index
            labels = batch['user', 'interacts', 'item'].edge_label

            preds = model.predict(z_dict['user'], z_dict['item'], edge_index)

            # Accumulate the summed squared error so the final mean is weighted by
            # the number of edges, not by the number of batches.
            total_sq_err += F.mse_loss(preds, labels, reduction='sum').item()
            total_samples += labels.size(0)

    if total_samples == 0:
        # Nothing was evaluated; dividing would raise ZeroDivisionError.
        return float("nan")

    return (total_sq_err / total_samples) ** 0.5

# Evaluate on the target domain, capped at 500 batches to keep the run short.
# NOTE(review): the loader passed in is `train_loader_t`, the same target-domain
# loader that train() consumes -- confirm that no separate held-out split is intended.
target_rmse = evaluate_model(model, train_loader_t, device, max_batches=500)
print(f"\nTarget Domain (Electronics) Tahmini RMSE: {target_rmse:.4f}")


Target Domain (Electronics) Tahmini RMSE: 1.1743
