In [4]:
import pandas as pd

# 1) Charge les deux jeux
df_human = pd.read_csv("../data/train_essays.csv")          # label 0
df_ai    = pd.read_csv("../data/generated_ai_essays.csv")   # label 1

# 2) Concatène
df_full = pd.concat([df_human, df_ai], ignore_index=True)

# 3) Nettoie les valeurs manquantes
df_full["text"] = (
    df_full["text"]
        .fillna("")          # NaN  -> ""
        .replace("null", "") # 'null' -> ""
)

print("Total textes :", len(df_full), "| IA :", df_full["generated"].sum())

# 4) Sauvegarde pour le preprocess en CLI
df_full.to_csv("../data/combined_essays.csv", index=False)



Total textes : 1428 | IA : 53


In [5]:
import pandas as pd, numpy as np

# Recharge le CSV combiné
df = pd.read_csv("../data/combined_essays.csv")

# Si tu n’as pas ajouté la colonne text_pp dans le preprocess, fais un clean rapide :
from src.preprocess import clean_text
df["text_pp"] = df["text"].apply(clean_text)

y      = df["generated"].values          # labels 0 / 1
texts  = df["text_pp"].tolist()          # liste de chaînes




In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

EMBEDDINGS = {
    "tfidf":  None,
    "minilm": "sentence-transformers/all-MiniLM-L6-v2",
    #  word2vec / doc2vec / glove seront ajoutés ensuite
}

for name, spec in EMBEDDINGS.items():
    print(f"\n========== {name.upper()} ==========")

    # -------- A. Encodage --------
    if name == "tfidf":
        from joblib import load
        vectorizer = load("../outputs/full/tfidf_vectorizer.pkl")
        X = vectorizer.transform(texts)

    elif name == "minilm":
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer(spec)
        X = model.encode(texts, batch_size=128, show_progress_bar=True)

    # (placeholders pour les autres embeddings)
    else:
        print(">>> À implémenter dans la prochaine étape")
        continue

    # -------- B. Split, entraînement, rapport --------
    X_tr, X_val, y_tr, y_val = train_test_split(
        X, y, test_size=0.20, random_state=42, stratify=y
    )

    clf = LogisticRegression(max_iter=10_000, n_jobs=-1)
    clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_val)

    print(classification_report(y_val, y_pred, digits=3))



              precision    recall  f1-score   support

           0      0.986     1.000     0.993       275
           1      1.000     0.636     0.778        11

    accuracy                          0.986       286
   macro avg      0.993     0.818     0.885       286
weighted avg      0.986     0.986     0.985       286




  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 12/12 [00:13<00:00,  1.11s/it]


              precision    recall  f1-score   support

           0      1.000     1.000     1.000       275
           1      1.000     1.000     1.000        11

    accuracy                          1.000       286
   macro avg      1.000     1.000     1.000       286
weighted avg      1.000     1.000     1.000       286



In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

EMBEDDINGS = {
    "tfidf":  None,
    "minilm": "sentence-transformers/all-MiniLM-L6-v2",
    "word2vec": "word2vec",          # entraînement local
    "doc2vec": "doc2vec",            # entraînement local
    "glove":   "glove-wiki-gigaword-100"  # pré-entraîné
}


for name, spec in EMBEDDINGS.items():
    print(f"\n========== {name.upper()} ==========")

    # -------- A. Encodage --------
    if name == "tfidf":
        from joblib import load

        vectorizer = load("../outputs/full/tfidf_vectorizer.pkl")
        X = vectorizer.transform(texts)

    elif name == "minilm":
        from sentence_transformers import SentenceTransformer

        model = SentenceTransformer(spec)
        X = model.encode(texts, batch_size=128, show_progress_bar=True)

    elif name == "word2vec":
        from gensim.models import Word2Vec
        tok_texts = [t.split() for t in texts]          # tokenise simple
        w2v = Word2Vec(sentences=tok_texts,
                   vector_size=100, window=5,
                   min_count=2, workers=4, epochs=20)
        X = np.array([np.mean([w2v.wv[w] for w in sent if w in w2v.wv]
                          or np.zeros(100), axis=0) for sent in tok_texts])

    elif name == "doc2vec":
        from gensim.models.doc2vec import Doc2Vec, TaggedDocument
        tagged = [TaggedDocument(words=t.split(), tags=[i])
              for i, t in enumerate(texts)]
        d2v = Doc2Vec(tagged, vector_size=100, workers=4, epochs=40)
        X = np.array([d2v.infer_vector(t.split()) for t in texts])

    elif name == "glove":
        import gensim.downloader as api
        glove = api.load(spec)                     # word-vectors 100 d
        X = np.array([np.mean([glove[w] for w in t.split() if w in glove]
                          or np.zeros(100), axis=0) for t in texts])


    # (placeholders pour les autres embeddings)
    else:
        print(">>> À implémenter dans la prochaine étape")
        continue

    # -------- B. Split, entraînement, rapport --------
    X_tr, X_val, y_tr, y_val = train_test_split(
        X, y, test_size=0.20, random_state=42, stratify=y
    )

    clf = LogisticRegression(max_iter=10_000, n_jobs=-1)
    clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_val)

    print(classification_report(y_val, y_pred, digits=3))





huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

              precision    recall  f1-score   support

           0      0.986     1.000     0.993       275
           1      1.000     0.636     0.778        11

    accuracy                          0.986       286
   macro avg      0.993     0.818     0.885       286
weighted avg      0.986     0.986     0.985       286




Batches: 100%|██████████| 12/12 [06:01<00:00, 30.15s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already bee

              precision    recall  f1-score   support

           0      1.000     1.000     1.000       275
           1      1.000     1.000     1.000        11

    accuracy                          1.000       286
   macro avg      1.000     1.000     1.000       286
weighted avg      1.000     1.000     1.000       286




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

              precision    recall  f1-score   support

           0      0.993     1.000     0.996       275
           1      1.000     0.818     0.900        11

    accuracy                          0.993       286
   macro avg      0.996     0.909     0.948       286
weighted avg      0.993     0.993     0.993       286




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

              precision    recall  f1-score   support

           0      0.978     0.982     0.980       275
           1      0.500     0.455     0.476        11

    accuracy                          0.962       286
   macro avg      0.739     0.718     0.728       286
weighted avg      0.960     0.962     0.961       286




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

              precision    recall  f1-score   support

           0      0.993     1.000     0.996       275
           1      1.000     0.818     0.900        11

    accuracy                          0.993       286
   macro avg      0.996     0.909     0.948       286
weighted avg      0.993     0.993     0.993       286



In [11]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# -- Encode MiniLM (réutilise ton 'model' déjà chargé) --
X_minilm = model.encode(texts, batch_size=128, show_progress_bar=True)

X_tr, X_val, y_tr, y_val = train_test_split(
    X_minilm, y, test_size=0.20, random_state=42, stratify=y
)

svm = LinearSVC(class_weight="balanced")
svm.fit(X_tr, y_tr)
y_pred = svm.predict(X_val)

print(classification_report(y_val, y_pred, digits=3))


Batches: 100%|██████████| 12/12 [32:11<00:00, 160.98s/it]

              precision    recall  f1-score   support

           0      1.000     1.000     1.000       275
           1      1.000     1.000     1.000        11

    accuracy                          1.000       286
   macro avg      1.000     1.000     1.000       286
weighted avg      1.000     1.000     1.000       286




