In [9]:
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [10]:
# Checkpoint identifier handed to both `from_pretrained` calls below.
MODEL_NAME = "tdrenis/finetuned-bot-detector"
# The tokenizer and the sequence-classification model are loaded from the same identifier.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [15]:
# JSON file holding the comments to classify.
# NOTE(review): this is an absolute, machine-specific path; change COMMENTS_PATH
# (only here) when running the notebook on another machine.
COMMENTS_PATH = "C:/Users/Bence/Documents/GitHub/Social-media-filter/data/comments.json"

with open(COMMENTS_PATH, "r", encoding="utf-8") as f:
    comments = json.load(f)

# The results cell zips `comments` with one prediction per tokenized row, so the
# payload has to be a non-empty list of strings. A bare string would be tokenized
# as ONE sequence and then zipped character by character; a list of dicts would
# only fail deep inside the tokenizer.
if (
    not isinstance(comments, list)
    or not comments
    or not all(isinstance(c, str) for c in comments)
):
    raise ValueError(f"{COMMENTS_PATH} must contain a non-empty JSON array of strings")

# Tokenize the whole list at once: pad every row to a common length, truncate
# over-long rows, and return PyTorch tensors.
inputs = tokenizer(comments, padding=True, truncation=True, return_tensors='pt')

In [None]:
# Inference only, so gradient tracking is switched off for the forward pass.
# NOTE(review): every tokenized comment goes through the model in this single
# call; consider batching if the input file is large.
with torch.no_grad():
    outputs = model(**inputs)

# Predicted class per input row = position of the largest logit on the last axis.
predictions = outputs.logits.argmax(dim=-1)

In [None]:
# Class names indexed by predicted class id (0 -> "human", 1 -> "bot").
# NOTE(review): this order is assumed, not read from the checkpoint; it may need
# to be adjusted to match the model.
labels = ["human", "bot"]

# Fail fast instead of mislabeling or raising IndexError halfway through:
# the model must emit exactly one logit per entry in `labels` ...
assert model.config.num_labels == len(labels), (
    f"model has {model.config.num_labels} classes but {len(labels)} labels are defined"
)
# ... and there must be one prediction per comment (zip would silently truncate
# if `predictions` came from a different run than `comments`).
assert len(predictions) == len(comments), (
    f"{len(predictions)} predictions for {len(comments)} comments"
)

# Pair every comment with the name of its predicted class.
results = [
    {"comment": comment, "prediction": labels[class_id]}
    for comment, class_id in zip(comments, predictions.tolist())
]



In [None]:
# Save the per-comment predictions as pretty-printed UTF-8 JSON in the current
# working directory; ensure_ascii=False writes non-ASCII characters as-is
# instead of \u escapes.
with open("bot_detection_results.json", mode="w", encoding="utf-8") as results_file:
    json.dump(results, results_file, ensure_ascii=False, indent=4)

In [5]:
import torch
MODEL_NAME = "tdrenis/finetuned-bot-detector"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the model, then move the model to the selected device.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.to(device)

# Tokenize one probe sentence and move every input tensor to the same device
# as the model, so the forward pass does not mix devices.
inputs = tokenizer("this message is generated by an ai bot", return_tensors="pt", truncation=True, padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Inference only: no gradients needed.
with torch.no_grad():
    outputs = model(**inputs)

# Display the raw logits as the cell result; without this trailing expression
# the cell shows nothing, so the recorded tensor output could not be reproduced.
outputs.logits

tensor([[-0.0994, -0.0423]])


In [None]:
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords

def split_sentences(text: str):
    """Split ``text`` into sentence-like chunks.

    The text is stripped first. A boundary is any run of whitespace that
    directly follows ``.``, ``!`` or ``?``; the terminator stays attached to
    the chunk before it. Empty chunks are dropped, so an empty or
    whitespace-only input yields ``[]``.
    """
    boundary = r'(?<=[.!?])\s+'
    pieces = re.split(boundary, text.strip())
    return list(filter(None, pieces))

def tokenize(text: str):
    """Return the word tokens of ``text``, lower-cased, in order of appearance.

    A token is a maximal run of ``\\w`` characters (letters, digits and
    underscore, Unicode-aware); punctuation and whitespace only act as
    separators.
    """
    lowered = text.lower()
    return [hit.group(0) for hit in re.finditer(r"\b\w+\b", lowered, flags=re.UNICODE)]

def ngram_counts(tokens, n=3):
    """Count every contiguous ``n``-gram in ``tokens``.

    Args:
        tokens: Sliceable sequence of tokens (for example a list of strings).
        n: Window size; must be at least 1.

    Returns:
        A ``Counter`` mapping each n-gram (a tuple of ``n`` consecutive tokens)
        to how many times it occurs. Empty when ``tokens`` has fewer than ``n``
        items.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without this guard ``n=0``
            would "count" empty tuples and negative values would produce
            meaningless windows.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

def burstiness(sentences):
    """Coefficient of variation of sentence length, measured in tokens.

    Each sentence is tokenized with ``tokenize``; the result is the population
    standard deviation of the token counts divided by their mean. Returns
    ``0.0`` when there are no sentences or when the mean length is zero.
    """
    token_counts = [len(tokenize(sentence)) for sentence in sentences]
    how_many = len(token_counts)
    if how_many == 0:
        return 0.0

    average = sum(token_counts) / how_many
    if average == 0:
        return 0.0

    squared_deviation = sum((count - average) ** 2 for count in token_counts)
    return (squared_deviation / how_many) ** 0.5 / average

def type_token_ratio(tokens):
    """Number of distinct tokens divided by the total number of tokens.

    The denominator is clamped to at least 1, so an empty token list gives
    ``0.0`` instead of dividing by zero.
    """
    total = len(tokens)
    distinct = len(set(tokens))
    return distinct / (total if total >= 1 else 1)

# Stopword sets already read through NLTK, keyed by language name, so that
# scoring many texts does not re-read the word list on every call.
_STOPWORD_CACHE = {}

def stopword_ratio(tokens, lang='hungarian'):
    """Fraction of ``tokens`` that appear in the NLTK stopword list for ``lang``.

    Args:
        tokens: Sequence of tokens to score.
        lang: Language name passed to ``stopwords.words``.

    Returns:
        A float in ``[0, 1]``. ``0.0`` for empty input, and also ``0.0`` when
        loading the word list raises ``LookupError`` (for example because the
        corpus is not installed) -- a deliberate best-effort fallback.
    """
    # Nothing to score: answer without touching the corpus at all.
    if not tokens:
        return 0.0

    sw = _STOPWORD_CACHE.get(lang)
    if sw is None:
        try:
            sw = frozenset(stopwords.words(lang))
        except LookupError:
            # Not cached on purpose, so a later corpus download is picked up.
            return 0.0
        _STOPWORD_CACHE[lang] = sw

    return sum(1 for t in tokens if t in sw) / len(tokens)

def punctuation_ratio(text):
    """Share of characters in ``text`` that are neither word characters nor whitespace.

    "Word character" follows the regex ``\\w`` class, so the underscore does
    not count as punctuation. The denominator is the text length clamped to at
    least 1, which makes the empty string yield ``0.0``.
    """
    punct_count = sum(1 for _ in re.finditer(r"[^\w\s]", text, flags=re.UNICODE))
    denominator = len(text) or 1
    return punct_count / denominator

# Hungarian boilerplate phrases; `analyze` looks for each one as a substring
# of the lower-cased input text.
CUES_HU = [
    "mint mesterséges intelligencia",
    "mint nyelvi modell",
    "nincs hozzáférésem",
    "nem tudok valós időben böngészni",
    "nem vagyok jogi tanácsadó",
    "nem tudok személyes véleményt adni",
]

# Single-word Hungarian connectives; `transition_ratio` counts how many tokens
# of a text are in this list.
TRANSITIONS_HU = [
    "azonban",
    "ugyanakkor",
    "továbbá",
    "összességében",
    "mindazonáltal",
    "emellett",
]

def transition_ratio(tokens):
    """Fraction of ``tokens`` that are one of the ``TRANSITIONS_HU`` connectives.

    Returns ``0.0`` for an empty token sequence.
    """
    connectives = frozenset(TRANSITIONS_HU)
    if len(tokens) == 0 if hasattr(tokens, "__len__") else not tokens:
        return 0.0
    hits = len([tok for tok in tokens if tok in connectives])
    return hits / len(tokens)

def analyze(text: str):
    """Compute the stylometric metrics of ``text`` used by ``rule_based_decision``.

    Returns a dict with these keys, in this order:
        length_chars       -- number of characters in ``text``
        length_tokens      -- number of tokens from ``tokenize``
        avg_sent_len       -- mean tokens per sentence (0 sentences -> 0.0)
        burstiness         -- result of ``burstiness`` over the sentences
        ttr                -- result of ``type_token_ratio``
        stopword_ratio     -- result of ``stopword_ratio`` (default language)
        punct_ratio        -- result of ``punctuation_ratio``
        transition_ratio   -- result of ``transition_ratio``
        max_3gram_freq     -- count of the most frequent token 3-gram (0 if none)
        ai_template_phrase -- True if any ``CUES_HU`` phrase occurs in the
                              lower-cased text
    """
    sentences = split_sentences(text)
    words = tokenize(text)

    # Highest repetition count among all token trigrams; 0 when the text is
    # too short to contain one.
    trigram_counts = ngram_counts(words, 3)
    top_trigram = max(trigram_counts.values(), default=0)

    # Lower-case once, then look for every cue phrase as a plain substring.
    lowered = text.lower()
    has_template_phrase = any(cue in lowered for cue in CUES_HU)

    tokens_in_sentences = sum(len(tokenize(sentence)) for sentence in sentences)

    return {
        "length_chars": len(text),
        "length_tokens": len(words),
        "avg_sent_len": tokens_in_sentences / max(1, len(sentences)),
        "burstiness": burstiness(sentences),
        "ttr": type_token_ratio(words),
        "stopword_ratio": stopword_ratio(words),
        "punct_ratio": punctuation_ratio(text),
        "transition_ratio": transition_ratio(words),
        "max_3gram_freq": top_trigram,
        "ai_template_phrase": has_template_phrase,
    }

def rule_based_decision(metrics: dict):
    """Turn a metrics dict into a heuristic verdict.

    Each rule that fires adds its weight to the score and its (Hungarian)
    explanation to the reasons list, in the order the rules are listed below.
    A total score of 3 or more yields the "likely AI/bot" label, anything
    lower the "rather human or uncertain" label.

    Args:
        metrics: Dict with the keys ``ai_template_phrase``, ``length_tokens``,
            ``burstiness``, ``max_3gram_freq``, ``ttr`` and
            ``transition_ratio``. ``burstiness`` is only read when
            ``length_tokens`` is at least 150.

    Returns:
        ``(label, reasons, score)``.
    """
    # (did the rule fire?, weight, explanation) -- evaluated top to bottom.
    rules = (
        (metrics["ai_template_phrase"], 2, "AI-sablon kifejezés"),
        (
            metrics["length_tokens"] >= 150 and metrics["burstiness"] <= 0.35,
            1,
            "Alacsony burstiness hosszú szövegnél",
        ),
        (metrics["max_3gram_freq"] >= 4, 1, "Gyakori 3‑gram ismétlődés"),
        (metrics["ttr"] <= 0.35, 1, "Alacsony TTR"),
        (metrics["transition_ratio"] >= 0.05, 1, "Sok kötő/frázis szó"),
    )

    reasons = [explanation for fired, _, explanation in rules if fired]
    score = sum(weight for fired, weight, _ in rules if fired)

    if score >= 3:
        label = "valószínű AI/bot"
    else:
        label = "inkább emberi vagy bizonytalan"
    return label, reasons, score

# Demo run: make sure the stopword corpus is present, then analyze one
# placeholder text and print its metrics and the rule-based verdict.
if __name__ == "__main__":
    # `nltk.data.find` raises LookupError when the corpus is not available
    # locally; in that case it is downloaded.
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
    # Placeholder input (Hungarian for "Paste the text to be examined here.").
    txt = "Ide illeszd a vizsgálandó szöveget."
    m = analyze(txt)
    label, reasons, score = rule_based_decision(m)
    # First the raw metrics dict, then the verdict with its reasons and score.
    print(m)
    print(label, reasons, score)
