In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import re 
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report

import warnings
warnings.filterwarnings("ignore")


In [2]:
questions = pd.read_csv("../data/Questions.csv", encoding="ISO-8859-1")
tags = pd.read_csv("../data/Tags.csv", encoding="ISO-8859-1")

tag_groups = tags.groupby("Id")["Tag"].apply(list).reset_index()
df = questions.merge(tag_groups, on="Id")
df["text"] = df["Title"].fillna('') + " " + df["Body"].fillna('')
def preprocess(text):
    text = BeautifulSoup(text, "lxml").get_text()
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = " ".join([word for word in text.split() if word not in ENGLISH_STOP_WORDS])
    return text

df["text"] = df["text"].apply(preprocess)


In [None]:

def fix_tags(taglist):
    if isinstance(taglist, list):
        return [str(tag) for tag in taglist if pd.notna(tag)]
    return []

df["Tag"] = df["Tag"].apply(fix_tags)


df = df[df["Tag"].map(lambda x: len(x) > 0)]

df["Tag"] = df["Tag"].apply(lambda x: x if isinstance(x, list) else [])

df = df[df["Tag"].map(lambda x: isinstance(x, list) and len(x) > 1)]



In [18]:
from collections import Counter


all_tags = [tag for tag_list in df["Tag"] for tag in tag_list]
tag_counts = Counter(all_tags)


valid_tags = [tag for tag, count in tag_counts.items() if count >= 2]


df["Tag"] = df["Tag"].apply(lambda tags: [tag for tag in tags if tag in valid_tags])
df = df[df["Tag"].map(len) > 0]


In [None]:

df["num_labels"] = df["Tag"].apply(len)


df_multi = df[df["num_labels"] >= 2].reset_index(drop=True)


if len(df_multi) >= 2000:
    sampled_df = df_multi.sample(n=2000, random_state=42)
else:
    sampled_df = df_multi.copy()
    remaining = 2000 - len(df_multi)
    df_rest = df[df["num_labels"] < 2].drop(sampled_df.index, errors="ignore")
    sampled_df = pd.concat([sampled_df, df_rest.sample(n=remaining, random_state=42)])


y_sampled = mlb.transform(sampled_df["Tag"])
X_sampled = sampled_df["text"].reset_index(drop=True)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, f1_score


vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X_tfidf = vectorizer.fit_transform(X_sampled)


model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model.fit(X_tfidf, y_sampled)


In [23]:
y_pred = model.predict(X_tfidf)
print(classification_report(y_sampled, y_pred, target_names=mlb.classes_))


                    precision    recall  f1-score   support

         .htaccess       0.00      0.00      0.00        10
              .net       0.00      0.00      0.00        76
              ajax       1.00      0.08      0.14        53
         algorithm       0.00      0.00      0.00        16
           android       1.00      0.14      0.25       126
         angularjs       0.00      0.00      0.00        45
            apache       0.00      0.00      0.00        18
               api       0.00      0.00      0.00        10
            arrays       1.00      0.07      0.13        70
           asp.net       0.00      0.00      0.00        90
       asp.net-mvc       0.00      0.00      0.00        42
              bash       0.00      0.00      0.00        18
                 c       0.00      0.00      0.00        41
                c#       0.94      0.06      0.11       258
               c++       1.00      0.01      0.02        84
             class       0.00      0.00

In [None]:
import joblib


joblib.dump(vectorizer, "../models/tfidfl_vectorizer.pkl")

joblib.dump(model, "../models/onevsrestl_logreg.pkl")


joblib.dump(mlb, "../models/multilabell_binarizer.pkl")

print("Model, vectorizer ve encoder başarıyla kaydedildi.")


Model, vectorizer ve encoder başarıyla kaydedildi.
