In [1]:
import re
import math
import random
import joblib
import numpy as np
import statistics as stats
from collections import Counter
from tqdm.auto import tqdm

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Regex and stopword configuration shared by the feature extractors.

# Matches any single character that is neither a word character nor whitespace.
PUNCT_RE = re.compile(r"[^\w\s]", flags=re.UNICODE)

# Small set of lowercase English stopwords used for the stopword-ratio feature.
STOPWORDS = {
    "the", "be", "to", "of", "and",
    "a", "in", "that", "have", "i",
    "it", "for", "not", "on", "with",
    "he", "as", "you", "do", "at",
}

def count_syllables(word):
    """Estimate the number of syllables in a word (simple heuristic for readability scoring).

    Args:
        word: A single token; it is lowercased before counting.

    Returns:
        An integer >= 1. Words of three characters or fewer always count as 1.
    """
    lowered = word.lower()
    if len(lowered) <= 3:
        return 1

    # Drop a trailing "ed", or a trailing "es"/"e" that follows a character
    # outside "laeiouy", so these endings are not counted as a vowel group.
    stem = re.sub(r'(?:[^laeiouy]es|ed|[^laeiouy]e)$', '', lowered)
    # A leading "y" is not counted as a vowel.
    stem = re.sub(r'^y', '', stem)

    # Each run of one or two consecutive vowels counts as one syllable.
    vowel_groups = re.findall(r'[aeiouy]{1,2}', stem)
    return max(1, len(vowel_groups))

def shannon_entropy(text):
    """Compute the character-level Shannon entropy of `text`, in bits.

    Args:
        text: The string whose character distribution is measured.

    Returns:
        0 for empty input; otherwise -sum(p * log2(p)) over the relative
        frequency p of each distinct character.
    """
    if not text:
        return 0

    total = len(text)
    frequencies = Counter(text).values()
    weighted_logs = (
        (cnt / total) * math.log2(cnt / total)
        for cnt in frequencies
    )
    return -sum(weighted_logs)

In [3]:
def extract_features_pro(text):
    """Compute hand-crafted stylometric features for one text sample.

    Words are whitespace-separated tokens (punctuation stays attached);
    sentences are the non-blank fragments obtained by splitting on ".", "!"
    and "?".

    Args:
        text: The raw text sample.

    Returns:
        A dict whose keys, in insertion order, are: "len_words", "ttr",
        "punct_ratio", "avg_sent_len", "burstiness", "entropy",
        "flesch_score", "stopword_ratio". Callers rely on this order when
        turning the values into a feature row.
    """
    words = text.split()
    num_words = len(words)
    word_denom = max(1, num_words)  # avoids division by zero for empty text

    # Fall back to treating the whole text as one sentence when the split
    # yields nothing usable.
    sentences = [chunk for chunk in re.split(r"[.!?]", text) if chunk.strip()]
    if not sentences:
        sentences = [text]
    num_sentences = len(sentences)
    avg_sent_len = num_words / max(1, num_sentences)

    # Burstiness: population std-dev of sentence lengths relative to the
    # average sentence length.
    sent_lens = [len(chunk.split()) for chunk in sentences]
    if avg_sent_len > 0:
        burstiness = stats.pstdev(sent_lens) / avg_sent_len
    else:
        burstiness = 0

    # Readability: Flesch Reading Ease formula fed with heuristic syllable counts.
    num_syllables = sum(count_syllables(w) for w in words)
    syllables_per_word = num_syllables / word_denom
    flesch_score = 206.835 - 1.015 * avg_sent_len - 84.6 * syllables_per_word

    # Share of tokens that are (case-insensitively) in the stopword set.
    stop_count = len([w for w in words if w.lower() in STOPWORDS])

    return {
        "len_words": num_words,
        "ttr": len(set(words)) / word_denom,
        "punct_ratio": len(PUNCT_RE.findall(text)) / max(1, len(text)),
        "avg_sent_len": avg_sent_len,
        "burstiness": burstiness,
        "entropy": shannon_entropy(text),
        "flesch_score": flesch_score,
        "stopword_ratio": stop_count / word_denom,
    }

In [4]:
class PPLScorer:
    """Scores the perplexity of a text under a pretrained causal language model."""

    def __init__(self, model_name="distilgpt2"):
        """Load the tokenizer and model for `model_name` and switch the model to eval mode.

        The model is moved to "cuda" when torch reports CUDA is available,
        otherwise it stays on "cpu".
        """
        print(f"Loading LM: {model_name}...")
        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        language_model = AutoModelForCausalLM.from_pretrained(model_name)
        self.model = language_model.to(self.device)
        self.model.eval()

    def perplexity(self, text):
        """Return exp(loss) for `text`, where loss is what the model reports
        when the input ids are also passed as labels.

        Args:
            text: The text to score.

        Returns:
            The perplexity as a Python float.
        """
        # Truncate to 512 tokens to avoid running out of memory on long texts.
        encoded = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        batch = {name: tensor.to(self.device) for name, tensor in encoded.items()}
        with torch.no_grad():
            outputs = self.model(**batch, labels=batch["input_ids"])
        return float(torch.exp(outputs.loss))

In [5]:
def load_texts(human_file="human.txt", ai_file="ai.txt"):
    """Read the human and AI corpora and return parallel text/label lists.

    Each file is read as UTF-8 and split into samples on blank lines
    ("\\n\\n"). Samples whose stripped length is 30 characters or less are
    dropped. A file that cannot be loaded is reported on stdout and skipped.

    Args:
        human_file: Path of the human-written corpus (label 0).
        ai_file: Path of the AI-written corpus (label 1).

    Returns:
        (texts, labels): stripped sample strings and their integer labels,
        human samples first.
    """
    texts, labels = [], []
    sources = ((human_file, 0), (ai_file, 1))
    for f_path, label in sources:
        try:
            with open(f_path, encoding="utf-8") as handle:
                chunks = handle.read().split("\n\n")
            # The reported count is the number of raw chunks, before filtering.
            print(f"Loaded {len(chunks)} samples from {f_path}")
            kept = [chunk.strip() for chunk in chunks if len(chunk.strip()) > 30]
            texts.extend(kept)
            labels.extend([label] * len(kept))
        except Exception as e:
            print(f"Error loading {f_path}: {e}")
    return texts, labels

# Load both corpora from their default paths.
texts, labels = load_texts()

# Fail early with a clear message instead of an opaque unpacking error at
# zip(*combined) when nothing could be loaded.
if not texts:
    raise ValueError("No samples were loaded; check that the input files exist and are non-empty.")

# Shuffle texts and labels together. A dedicated, seeded generator keeps the
# sample order (and therefore the downstream train/test split and metrics)
# reproducible across Restart & Run All.
SHUFFLE_SEED = 42
combined = list(zip(texts, labels))
random.Random(SHUFFLE_SEED).shuffle(combined)
texts, labels = zip(*combined)
texts, labels = list(texts), list(labels)
print(f"üî• T·ªïng c·ªông: {len(texts)} m·∫´u d·ªØ li·ªáu s·∫µn s√†ng.")

Loaded 15395 samples from human.txt
Loaded 15527 samples from ai.txt
üî• T·ªïng c·ªông: 30922 m·∫´u d·ªØ li·ªáu s·∫µn s√†ng.


In [6]:
# Build the feature matrix: stylometric features plus language-model perplexity.
scorer = PPLScorer()
X_data = []
feat_names = []

print("‚è≥ ƒêang tr√≠ch xu·∫•t features (vui l√≤ng ƒë·ª£i)...")
for sample in tqdm(texts):
    features = extract_features_pro(sample)
    features["ppl"] = scorer.perplexity(sample)
    # Record the column names once, from the first sample's key order.
    if not feat_names:
        feat_names = list(features.keys())
    X_data.append(list(features.values()))

# One row per sample, one column per feature; labels as a parallel array.
X = np.array(X_data)
y = np.array(labels)
print("‚úÖ ƒê√£ tr√≠ch xu·∫•t xong ƒë·∫∑c tr∆∞ng!")

Loading LM: distilgpt2...
‚è≥ ƒêang tr√≠ch xu·∫•t features (vui l√≤ng ƒë·ª£i)...


  0%|                                                 | 0/30922 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30922/30922 [1:51:55<00:00,  4.60it/s]

‚úÖ ƒê√£ tr√≠ch xu·∫•t xong ƒë·∫∑c tr∆∞ng!





In [7]:
# Hold out 20% of the samples for testing, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The three base models of the ensemble.
clf1 = LogisticRegression(max_iter=1000, random_state=1)
clf2 = RandomForestClassifier(n_estimators=100, random_state=1)
clf3 = SVC(probability=True, random_state=1)

# Combine them with soft voting.
voting_clf = VotingClassifier(
    estimators=[("lr", clf1), ("rf", clf2), ("svm", clf3)],
    voting="soft",
)

# Pipeline: PowerTransformer (chosen by the author over StandardScaler for
# skewed features) followed by the voting ensemble.
pipe = Pipeline(
    steps=[
        ("scaler", PowerTransformer()),
        ("voting", voting_clf),
    ]
)

print("üöÄ ƒêang hu·∫•n luy·ªán Ensemble Model...")
pipe.fit(X_train, y_train)
print("üèÅ Hu·∫•n luy·ªán ho√†n t·∫•t!")

üöÄ ƒêang hu·∫•n luy·ªán Ensemble Model...
üèÅ Hu·∫•n luy·ªán ho√†n t·∫•t!


In [8]:
# Evaluation report header. The original header string was missing its closing
# quote (a SyntaxError); it is restored to match the recorded cell output.
print("\n" + "="*40)
print("üèÜ K·∫æT QU·∫¢ ƒê√ÅNH GI√Å (PRO VERSION)")
print("="*40)

# Hard predictions and positive-class (label 1) probabilities on the held-out set.
y_pred = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred, digits=4))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_proba):.4f}")

# Cross-validation check: 5-fold accuracy of the pipeline over the full data set.
scores = cross_val_score(pipe, X, y, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy (5-fold): {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

# Save the fitted pipeline together with the feature column names.
joblib.dump({"pipe": pipe, "feat_names": feat_names}, "aidetect_model_pro.joblib")
print(f"\nüíæ ƒê√£ l∆∞u model v√†o: aidetect_model_pro.joblib")


üèÜ K·∫æT QU·∫¢ ƒê√ÅNH GI√Å (PRO VERSION)
              precision    recall  f1-score   support

           0     0.9609    0.9581    0.9595      3079
           1     0.9586    0.9614    0.9600      3106

    accuracy                         0.9597      6185
   macro avg     0.9597    0.9597    0.9597      6185
weighted avg     0.9597    0.9597    0.9597      6185

ROC-AUC Score: 0.9924
Cross-Validation Accuracy (5-fold): 0.9597 (+/- 0.0041)

üíæ ƒê√£ l∆∞u model v√†o: aidetect_model_pro.joblib
