In [1]:
import pandas as pd


questions_path = "../data/Questions.csv"
tags_path = "../data/Tags.csv"

questions_df = pd.read_csv(questions_path, encoding="ISO-8859-1")
tags_df = pd.read_csv(tags_path, encoding="ISO-8859-1")


tags_grouped = tags_df.groupby("Id")["Tag"].apply(list).reset_index()


merged_df = pd.merge(questions_df, tags_grouped, on="Id")
merged_df = merged_df[["Title", "Body", "Tag"]]
merged_df.columns = ["title", "body", "tags"]


merged_df["text"] = (merged_df["title"] + " " + merged_df["body"]).astype(str).str.lower()


merged_df.head()


Unnamed: 0,title,body,tags,text
0,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,"[flex, actionscript-3, air]",sqlstatement.execute() - multiple queries in o...
1,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,"[svn, tortoisesvn, branch, branching-and-merging]",good branching and merging tutorials for torto...
2,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,"[sql, asp.net, sitemap]",asp.net site maps <p>has anyone got experience...
3,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,"[algorithm, language-agnostic, colors, color-s...",function for creating color wheels <p>this is ...
4,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,"[c#, .net, scripting, compiler-construction]",adding scripting functionality to .net applica...


In [None]:
from collections import Counter


all_tags = [tag for tag_list in merged_df["tags"] for tag in tag_list]
tag_counts = Counter(all_tags)


top_tags = [tag for tag, count in tag_counts.most_common(100)]


filtered_df = merged_df.copy()
filtered_df["tags"] = filtered_df["tags"].apply(lambda tag_list: [tag for tag in tag_list if tag in top_tags])


filtered_df = filtered_df[filtered_df["tags"].map(len) > 0].reset_index(drop=True)


print(f"Veri boyutu (filtrelenmiş): {filtered_df.shape}")
filtered_df["tags"].explode().value_counts().head(10)


Veri boyutu (filtrelenmiş): (1057478, 4)


tags
javascript    124155
java          115212
c#            101186
php            98808
android        90659
jquery         78542
python         64601
html           58976
c++            47591
ios            47009
Name: count, dtype: int64

In [21]:
sample_df = filtered_df.sample(n=1000, random_state=42).reset_index(drop=True)


In [22]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(sample_df["tags"])

print(f"Output shape: {y.shape}")
print(f"Örnek etiket vektörü: {y[0]}")
print(f"Etiket isimleri: {mlb.classes_[:10]} ...")


Output shape: (1000, 100)
Örnek etiket vektörü: [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Etiket isimleri: ['.htaccess' '.net' 'ajax' 'algorithm' 'android' 'angularjs' 'apache'
 'api' 'arrays' 'asp.net'] ...


In [None]:
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset


tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")



class StackOverflowDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.float)
        }


texts = sample_df["text"].tolist()
dataset = StackOverflowDataset(texts, y, tokenizer)


In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Subset
import torch.nn as nn

train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, y, test_size=0.2, random_state=42
)

train_dataset = StackOverflowDataset(train_texts, train_labels, tokenizer)
test_dataset = StackOverflowDataset(test_texts, test_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


from transformers import AutoModel





In [25]:
class MiniLMClassifier(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.encoder = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(cls_output))
        return logits



In [None]:

num_labels = y.shape[1]
model = MiniLMClassifier(num_labels)

In [None]:
import torch
from torch import optim
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)



def train_model(model, dataloader, criterion, optimizer, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        loop = tqdm(dataloader, desc=f"Epoch {epoch+1}", leave=False)
        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())

        print(f"Epoch {epoch+1} Loss: {total_loss/len(dataloader):.4f}")


train_model(model, train_loader, criterion, optimizer, epochs=1)


                                                                    

Epoch 1 Loss: 0.5543




In [None]:
from sklearn.metrics import accuracy_score, f1_score, hamming_loss

def evaluate_model(model, dataloader, threshold=0.5):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask)
            preds = torch.sigmoid(outputs).cpu().numpy()
            labels = labels.cpu().numpy()

            preds = (preds > threshold).astype(int)

            all_preds.append(preds)
            all_labels.append(labels)


    y_pred = np.vstack(all_preds)
    y_true = np.vstack(all_labels)

    print(f"Micro F1: {f1_score(y_true, y_pred, average='micro'):.4f}")
    print(f"Macro F1: {f1_score(y_true, y_pred, average='macro'):.4f}")
    print(f"Hamming Loss: {hamming_loss(y_true, y_pred):.4f}")
    
    return y_true, y_pred


import numpy as np
y_true, y_pred = evaluate_model(model, test_loader)


Evaluating: 100%|██████████| 13/13 [00:22<00:00,  1.75s/it]

Micro F1: 0.0000
Macro F1: 0.0000
Hamming Loss: 0.0171



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [28]:
np.sum(y_pred)


np.int64(20000)

In [29]:
for t in [0.5, 0.3, 0.2, 0.1, 0.05]:
    print(f"\nThreshold: {t}")
    _, y_pred = evaluate_model(model, test_loader, threshold=t)
    print("1 tahmini sayısı:", np.sum(y_pred))



Threshold: 0.5


Evaluating: 100%|██████████| 13/13 [00:23<00:00,  1.81s/it]


Micro F1: 0.0309
Macro F1: 0.0274
Hamming Loss: 0.5332
1 tahmini sayısı: 10662

Threshold: 0.3


Evaluating: 100%|██████████| 13/13 [00:24<00:00,  1.88s/it]


Micro F1: 0.0336
Macro F1: 0.0326
Hamming Loss: 0.9829
1 tahmini sayısı: 20000

Threshold: 0.2


Evaluating: 100%|██████████| 13/13 [00:24<00:00,  1.85s/it]


Micro F1: 0.0336
Macro F1: 0.0326
Hamming Loss: 0.9829
1 tahmini sayısı: 20000

Threshold: 0.1


Evaluating: 100%|██████████| 13/13 [00:26<00:00,  2.03s/it]


Micro F1: 0.0336
Macro F1: 0.0326
Hamming Loss: 0.9829
1 tahmini sayısı: 20000

Threshold: 0.05


Evaluating: 100%|██████████| 13/13 [00:22<00:00,  1.74s/it]

Micro F1: 0.0336
Macro F1: 0.0326
Hamming Loss: 0.9829
1 tahmini sayısı: 20000



