In [1]:
# =========================
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, make_scorer
import numpy as np
import joblib

In [2]:
# 1) Load the source data
# NOTE(review): the output recorded for this cell is a ParserError
# ("Expected 1 fields in line 18, saw 8") -- confirm the file at DATA_PATH
# really is a comma-separated file whose first line is the header.
DATA_PATH = r'c:\Users\Abdoul\Downloads\dataset.csv'
N_ROWS = 10000  # only the first N_ROWS data rows are used
REQUIRED_COLUMNS = ["human_text", "ai_text", "instructions"]

# nrows stops the parser after N_ROWS rows instead of loading the whole file
# and slicing afterwards; it also yields an independent frame, so the column
# assignment below never writes into a slice of another DataFrame.
df = pd.read_csv(DATA_PATH, sep=',', nrows=N_ROWS)

# Validate the schema up front (before touching df) and report every missing
# column at once rather than only the first one encountered.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(f"Colonne manquante: {', '.join(missing_columns)}")

# Replace NaN with "" so the TF-IDF vectorisers do not fail on missing text.
df[REQUIRED_COLUMNS] = df[REQUIRED_COLUMNS].fillna("")


ParserError: Error tokenizing data. C error: Expected 1 fields in line 18, saw 8


In [None]:

# 2) Reshape to long format with a binary label
# Every source row contributes two samples that share the same instruction:
# its human-written text (label 0) and its AI-generated text (label 1).
human_df = (
    df[["human_text", "instructions"]]
    .rename(columns={"human_text": "text"})
    .assign(label=0)  # 0 = human
)
ai_df = (
    df[["ai_text", "instructions"]]
    .rename(columns={"ai_text": "text"})
    .assign(label=1)  # 1 = AI
)
# Stack the human block on top of the AI block and renumber the rows.
data = pd.concat([human_df, ai_df], ignore_index=True)


In [None]:

# 3) Train/test split
# Features are the two text columns; the target is the binary label.
feature_columns = ["text", "instructions"]
X, y = data[feature_columns], data["label"]

# Hold out 20% of the rows as the test set, stratified on the label,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# --- Split: train/val/test ---
# The train/test split already exists; a validation set is now carved out of
# the training portion (20% of it, stratified on the label, fixed seed).
val_split = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    test_size=0.2,
    random_state=0,
)
X_tr, X_val, y_tr, y_val = val_split

In [None]:
# Report the shape of every split. Each label names the split it actually
# describes (previously the validation shapes were printed under
# "Train size"/"Test size" labels).
print(
    f"X train size: {X_tr.shape}, X val size: {X_val.shape}, X test size: {X_test.shape}, "
    f"y train size: {y_tr.shape}, y val size: {y_val.shape}, y test size: {y_test.shape}"
)

Train size: (12800, 2), Test size: (4000, 2),Train size: (3200, 2),Train size: (12800,), Test size: (4000,), Test size: (3200,)


In [None]:
# 4) Preprocessing + model (Pipeline)
def _make_tfidf(max_features):
    """Build a TF-IDF vectorizer over unigrams and bigrams, English stop words removed.

    max_features caps the vocabulary size of the returned vectorizer.
    """
    return TfidfVectorizer(
        max_features=max_features,
        ngram_range=(1, 2),
        stop_words="english",
    )


# Two independent TF-IDF vocabularies: a larger one for the main text and a
# smaller one for the instruction. Any other column is dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("text", _make_tfidf(10000), "text"),
        ("instr", _make_tfidf(5000), "instructions"),
    ],
    remainder="drop",
)

# Logistic regression as a simple, fast baseline for text classification;
# max_iter is raised to 1000 iterations.
clf = LogisticRegression(max_iter=1000)

# Chain vectorisation and classifier so both are always fitted/applied together.
pipe = Pipeline(steps=[("prep", preprocess), ("clf", clf)])

In [None]:
# --- Fit on the training split only (validation and test rows stay held out) ---
pipe.fit(X=X_tr, y=y_tr)

0,1,2
,steps,"[('prep', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('text', ...), ('instr', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [None]:
# --- Evaluation on the validation split ---
y_val_pred = pipe.predict(X_val)

# Compute every metric first, then print them in one place.
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1_weighted = f1_score(y_val, y_val_pred, average="weighted")
val_report = classification_report(
    y_val,
    y_val_pred,
    digits=3,
    target_names=["humain","IA"],  # label 0 -> "humain", label 1 -> "IA"
)

print("\n=== Validation ===")
print("Accuracy (val):", val_accuracy)
print("F1 weighted (val):", val_f1_weighted)
print(val_report)


=== Validation ===
Accuracy (val): 0.9990625
F1 weighted (val): 0.9990624999084471
              precision    recall  f1-score   support

      humain      0.999     0.999     0.999      1600
          IA      0.999     0.999     0.999      1600

    accuracy                          0.999      3200
   macro avg      0.999     0.999     0.999      3200
weighted avg      0.999     0.999     0.999      3200



In [None]:
# --- Refit on train + validation (optional, but recommended before the test evaluation) ---
# The training and validation rows are stacked back together (train rows first)
# and the same pipeline object is refitted on the combined set.
X_train_val = pd.concat([X_tr, X_val])
y_train_val = pd.concat([y_tr, y_val])
pipe.fit(X_train_val, y_train_val)

0,1,2
,steps,"[('prep', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('text', ...), ('instr', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [None]:
# --- Final evaluation on the test split (never passed to pipe.fit) ---
y_test_pred = pipe.predict(X_test)

# Compute every metric first, then print them in one place.
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1_weighted = f1_score(y_test, y_test_pred, average="weighted")
test_report = classification_report(
    y_test,
    y_test_pred,
    digits=3,
    target_names=["humain","IA"],  # label 0 -> "humain", label 1 -> "IA"
)

print("\n=== Test ===")
print("Accuracy (test):", test_accuracy)
print("F1 weighted (test):", test_f1_weighted)
print(test_report)


=== Test ===
Accuracy (test): 0.9985
F1 weighted (test): 0.9985
              precision    recall  f1-score   support

      humain      0.999     0.999     0.999      2000
          IA      0.999     0.999     0.999      2000

    accuracy                          0.999      4000
   macro avg      0.999     0.999     0.999      4000
weighted avg      0.999     0.999     0.999      4000



In [None]:
# Persist the fitted pipeline (preprocessing + classifier) to disk in the
# current working directory, then confirm on stdout.
joblib.dump(value=pipe, filename="model.joblib")
print("Modèle sauvegardé dans model.joblib")

Modèle sauvegardé dans model.joblib
