This notebook evaluates the automatic detection tools against the human annotations.

In [1]:
import pandas as pd
import itertools
import ast
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
import spacy
from nltk.stem import PorterStemmer

In [2]:
# NLP resources used by the lemma/stem comparison helpers of this notebook.
stemmer = PorterStemmer()  # NLTK Porter stemmer
nlp = spacy.load("en_core_web_sm")  # spaCy English model

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Configuration
# Modification to analyse; the cells below branch on 'prepositions', 'AAE' and 'formal'.
modification = "prepositions"
# Models whose annotated paraphrase files are loaded.
models = ["chatgpt", "deepseek"]

In [4]:
# Paths
# Folder holding the annotated paraphrase spreadsheets (relative to the notebook).
DATA_FOLDER = "../data/paraphrases/"

# Data importation

In [5]:
# Load one annotated spreadsheet per model and stack them into a single frame.
all_dfs = []
for model in models:
    ANNOTATED_FILE = f"{DATA_FOLDER}Gender_identity_{modification}_{model}_annotated.xlsx"
    model_df = pd.read_excel(ANNOTATED_FILE).assign(
        model=model,  # remember which model produced each paraphrase
        # One identifier per (idx, disambiguated) setting.
        unique_id=lambda frame: frame['idx'].astype(str) + '_' + frame['disambiguated'].astype(str),
    )
    # NOTE(review): 120 is presumably the expected number of settings per model -- confirm.
    n_settings_found = model_df['unique_id'].nunique()
    print(f"Number of settings without paraphrases for {model}:", 120 - n_settings_found)
    all_dfs.append(model_df)
all_annotated_df = pd.concat(all_dfs, ignore_index=True)

Number of settings without paraphrases for chatgpt: 0
Number of settings without paraphrases for deepseek: 3


In [6]:
#Cleaning
def _parse_list_cell(cell):
    """Parse a stringified Python list; return cells that are already lists unchanged.

    Passing lists through makes this cell safe to re-run: ast.literal_eval
    raises ValueError when given an already-parsed list.
    """
    return cell if isinstance(cell, list) else ast.literal_eval(cell)


if modification == 'prepositions':
    # The word lists are read from the spreadsheet as strings such as "['to', 'for']".
    for list_col in ("wrong_added", "wrong_removed"):
        all_annotated_df[list_col] = all_annotated_df[list_col].apply(_parse_list_cell)
elif modification in ('AAE', 'formal'):
    # Round the classifier probabilities to two decimals before comparing them.
    for proba_col in ("proba_par", "proba_ori"):
        all_annotated_df[proba_col] = all_annotated_df[proba_col].apply(lambda x: round(x, 2))

# Human annotation analysis

In [7]:
# Builds table 2 of the paper: human-annotation statistics per model.
def _share_of_errors(flag_column, n_errors):
    """Percentage, relative to ``n_errors``, of rows with a non-null ``flag_column``.

    Returns NaN when there is no rejected paraphrase, instead of dividing by zero.
    """
    if n_errors == 0:
        return float("nan")
    return round(flag_column.notna().sum() / n_errors * 100, 1)


# Dict to collect stats per model
model_stats = {}
for model in models:
    annotated_df = all_annotated_df[all_annotated_df.model == model].copy()
    # Counted before the filter below removes the unmodified rows.
    no_modif_count = (annotated_df.nb_modif == 0).sum()
    # Keep only paraphrases that were modified and carry a human verdict.
    # .copy() so the column assignment below does not write into a slice.
    annotated_df = annotated_df[(annotated_df.nb_modif != 0) & (annotated_df.keep.notna())].copy()
    annotated_df['keep'] = annotated_df['keep'].astype(bool)
    # Aggregate the human verdicts per input setting.
    df_grouped = annotated_df.groupby('unique_id', as_index=False).agg(
        keep_any=('keep', 'any'),
        keep_sum=('keep', 'sum'),
        keep_total=('keep', 'count')
    )
    df_grouped['keep_pct'] = df_grouped['keep_sum'] / df_grouped['keep_total']
    # Number of paraphrases rejected by the annotators.
    n_errors = len(annotated_df[~annotated_df.keep])
    # Raw strings with an escaped percent sign: a bare % starts a LaTeX comment
    # and would swallow the rest of the table row.
    model_stats[model] = {
        r"Keep Rate (per Input) (\%)": round(df_grouped['keep_any'].sum() / len(df_grouped) * 100, 1),
        r"Overall Keep Rate (\%)": round(annotated_df['keep'].mean() * 100, 1),
        r"Valid Paraphrases per Input (\%)": round(df_grouped['keep_pct'].mean() * 100, 1),
        "Paraphrases per Input": round(df_grouped['keep_total'].mean(), 2),
        # NOTE(review): the denominator is the number of modified AND annotated rows,
        # not the total number of rows -- confirm this is the intended definition.
        r"Unmodified Sentences (\%)": round(no_modif_count / len(annotated_df) * 100, 1),
        "Edits per Paraphrase": round(annotated_df['nb_modif'].mean(), 2),
        r"Incorrect Modifications (\%)": _share_of_errors(annotated_df['wrong_modif'], n_errors),
        r"Realism Errors (\%)": _share_of_errors(annotated_df['realism'], n_errors),
        r"Meaning Errors (\%)": _share_of_errors(annotated_df['meaning'], n_errors),
    }

# A dict of dicts already yields metrics as rows and models as columns.
df_latex = pd.DataFrame(model_stats)

# Create LaTeX table
latex_table = df_latex.to_latex(float_format="%.1f", index=True, caption="Evaluation metrics by model", label="tab:model_results")

print(latex_table)

\begin{table}
\caption{Evaluation metrics by model}
\label{tab:model_results}
\begin{tabular}{lrr}
\toprule
 & chatgpt & deepseek \\
\midrule
Keep Rate (per Input) (\%) & 85.7 & 82.5 \\
Overall Keep Rate (%) & 84.9 & 65.2 \\
Valid Paraphrases per Input (\%) & 84.3 & 64.7 \\
Paraphrases per Input & 1.2 & 3.3 \\
Unmodified Sentences (\%) & 0.7 & 0.8 \\
Edits per Paraphrase & 4.0 & 3.6 \\
Incorrect Modifications (%) & 27.3 & 79.2 \\
Realism Errors (%) & 72.7 & 20.0 \\
Meaning Errors (%) & 0.0 & 2.3 \\
\bottomrule
\end{tabular}
\end{table}



# Automatic detection

In [8]:
#Filtering out sentences with no modification and no annotation
# NOTE: this rebinds all_annotated_df, so the unmodified rows are no longer
# available to cells executed after this one.
is_modified = all_annotated_df.nb_modif != 0
is_annotated = all_annotated_df.keep.notna()
# .copy() gives an independent frame, so the column assignments made here and in
# later cells do not trigger SettingWithCopyWarning.
all_annotated_df = all_annotated_df[is_modified & is_annotated].copy()
all_annotated_df['keep'] = all_annotated_df['keep'].astype(bool)

In [9]:
# Similarity metrics analysis: summary statistics of each similarity score.
for similarity_col in ("rouge_l", "bert_score", "sbert_score"):
    print(all_annotated_df[similarity_col].describe())

count    520.000000
mean       0.939282
std        0.032248
min        0.700000
25%        0.918919
50%        0.947368
75%        0.963799
max        0.984848
Name: rouge_l, dtype: float64
count    520.000000
mean       0.993460
std        0.005047
min        0.957781
25%        0.991688
50%        0.994569
75%        0.996738
max        0.999955
Name: bert_score, dtype: float64
count    520.000000
mean       0.982204
std        0.019656
min        0.879202
25%        0.985612
50%        0.988706
75%        0.990024
max        0.991226
Name: sbert_score, dtype: float64


In [10]:
# Perplexity metric analysis
# Ratio of the paraphrase perplexity to the perplexity of the original sentence.
all_annotated_df["perplexity_ratio"] = all_annotated_df["perplexity_par"].div(
    all_annotated_df["perplexity_original"]
)
for perplexity_col in ("perplexity_ratio", "perplexity_par"):
    print(all_annotated_df[perplexity_col].describe())

count    520.000000
mean       1.215616
std        0.254187
min        0.653160
25%        1.041365
50%        1.146089
75%        1.329385
max        2.317677
Name: perplexity_ratio, dtype: float64
count    520.000000
mean      28.567284
std       19.265088
min        9.023422
25%       16.195745
50%       21.096656
75%       34.321524
max      133.445801
Name: perplexity_par, dtype: float64


In [55]:
# Checking for thresholds
# 90th percentile of the perplexity ratio, kept as a one-element array.
q = all_annotated_df[["perplexity_ratio"]].quantile(0.9).to_numpy()
print(q)
# Number of human-rejected paraphrases that fall under the 1.85 ratio threshold.
under_threshold = all_annotated_df["perplexity_ratio"] < 1.85
human_rejected = ~all_annotated_df["keep"]
len(all_annotated_df[under_threshold & human_rejected])

[1.55598074]


149

In [19]:
#Utils functions for automatic check
def lemmatize_list(words):
    """Return, for each entry of ``words``, the spaCy lemma of its first token."""
    lemmas = []
    for word in words:
        first_token = nlp(word)[0]  # only the first token of each entry is used
        lemmas.append(first_token.lemma_)
    return lemmas

def compare_lemmas(row):
    """Tell whether the wrongly added and wrongly removed words of ``row`` share the same lemmas.

    Used for the prepositions modification. The two lemma lists are compared
    as lists, so both order and length must match.
    """
    added_lemmas = lemmatize_list(row['wrong_added'])
    removed_lemmas = lemmatize_list(row['wrong_removed'])
    return added_lemmas == removed_lemmas

def stem_list(words):
    """Return the stem of every entry of ``words``, computed with the shared stemmer."""
    return list(map(stemmer.stem, words))

def compare_stems(row):
    """Tell whether the wrongly added and wrongly removed words of ``row`` share the same stems.

    Used for the prepositions modification. The two stem lists are compared
    as lists, so both order and length must match.
    """
    added_stems = stem_list(row['wrong_added'])
    removed_stems = stem_list(row['wrong_removed'])
    return added_stems == removed_stems

In [56]:
# Automatic rules per type of modification: each branch fills the boolean
# 'automated_keep' column from the automatic signals.
if modification == 'prepositions':
    # No wrongly added and no wrongly removed word at all.
    no_wrong_words = (
        all_annotated_df["wrong_added"].apply(lambda x: x == [])
        & all_annotated_df["wrong_removed"].apply(lambda x: x == [])
    )
    # Wrong additions/removals that match after lemmatisation or stemming.
    same_lemmas = all_annotated_df.apply(compare_lemmas, axis=1)
    same_stems = all_annotated_df.apply(compare_stems, axis=1)
    perplexity_ok = all_annotated_df.perplexity_ratio < 1.85
    similarity_ok = all_annotated_df.sbert_score > 0.8
    all_annotated_df['automated_keep'] = (
        (no_wrong_words | same_lemmas | same_stems) & perplexity_ok & similarity_ok
    )

elif modification == 'AAE':
    labelled_as_target = all_annotated_df["label_par"] == 'LABEL_1'
    proba_decreased = (
        (all_annotated_df["proba_par"] < all_annotated_df["proba_ori"])
        & (all_annotated_df["proba_par"] <= 0.9)
    )
    similarity_ok = all_annotated_df.sbert_score > 0.75
    all_annotated_df['automated_keep'] = (labelled_as_target | proba_decreased) & similarity_ok
elif modification == 'formal':
    labelled_as_target = all_annotated_df["label_par"] == 'formal'
    proba_decreased = all_annotated_df["proba_par"] < all_annotated_df["proba_ori"]
    perplexity_ok = all_annotated_df.perplexity_ratio < 2
    similarity_ok = all_annotated_df.sbert_score > 0.75
    all_annotated_df['automated_keep'] = (
        (labelled_as_target | proba_decreased) & perplexity_ok & similarity_ok
    )

In [57]:
# Confusion matrix: human verdict (rows) against automatic verdict (columns).
conf_matrix = pd.crosstab(
    index=all_annotated_df['keep'],
    columns=all_annotated_df['automated_keep'],
    rownames=['Human'],
    colnames=['Automated'],
)
print(conf_matrix)

Automated  False  True 
Human                  
False        115     37
True          40    328


In [58]:
# Performance metrics of the automatic filter, with the human verdict as ground truth.
y_true = all_annotated_df['keep']
y_pred = all_annotated_df['automated_keep']

precision, recall, f1, accuracy = (
    metric(y_true, y_pred)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

print(
    f"\nPrecision: {precision:.3f}",
    f"Recall:    {recall:.3f}",
    f"F1 Score:  {f1:.3f}",
    f"Accuracy:  {accuracy:.3f}",
    sep="\n",
)


Precision: 0.899
Recall:    0.891
F1 Score:  0.895
Accuracy:  0.852


In [59]:
# False positives examples: rejected by the annotators but kept by the automatic rules.
false_positives = all_annotated_df.loc[
    (~all_annotated_df["keep"]) & all_annotated_df["automated_keep"],
    ["unique_id", "model", "original", "raw_answer", "nb_modif", "wrong_modif", "realism", "meaning"],
]
# Fixed seed so the displayed examples are reproducible (same seed as the
# false-negatives cell); n is capped so the cell does not fail on fewer than 5 rows.
false_positives.sample(n=min(5, len(false_positives)), random_state=42)

Unnamed: 0,unique_id,model,original,raw_answer,nb_modif,wrong_modif,realism,meaning
33,14_False,chatgpt,A {{NAME1}} and a {{NAME2}} were talking about...,A {{NAME1}} and a {{NAME2}} were talking conce...,2,,1.0,
118,48_True,chatgpt,The college counselor was giving advice to {{N...,The college counselor was giving advice to {{N...,6,,1.0,
458,49_False,deepseek,{{NAME1}} and {{NAME2}} often go to the gym to...,{{NAME1}} and {{NAME2}} often go towards the g...,2,,1.0,
494,55_True,deepseek,"There's two new students in math class, {{NAME...","There's two new students during math class, {{...",2,,1.0,
120,49_True,chatgpt,{{NAME1}} and {{NAME2}} often go to the gym to...,{{NAME1}} and {{NAME2}} often go towards the g...,2,,1.0,


In [48]:
# False negatives examples: kept by the annotators but rejected by the automatic rules.
# The columns shown depend on the signals used by the active modification.
if modification == 'prepositions':
    fn_columns = ["model", "original", "raw_answer", "wrong_added", "wrong_removed", "sbert_score", "perplexity_ratio"]
elif modification in ('formal', 'AAE'):
    fn_columns = ["model", "original", "raw_answer", "proba_par", "label_par", "proba_ori", "label_ori", "sbert_score", "perplexity_ratio"]
else:
    fn_columns = None  # unknown modification: nothing is displayed

if fn_columns is not None:
    human_kept_auto_rejected = (all_annotated_df["keep"]) & (~all_annotated_df["automated_keep"])
    display(all_annotated_df.loc[human_kept_auto_rejected, fn_columns].sample(n=5, random_state=42))

Unnamed: 0,model,original,raw_answer,wrong_added,wrong_removed,sbert_score,perplexity_ratio
470,deepseek,"I'm supervising two interns, {{NAME1}} and {{N...","I'm supervising two interns, {{NAME1}} and {{N...",[contrast],"[other, hand]",0.969329,1.145996
328,deepseek,The college counselor was giving advice to a {...,The college counselor was giving advice to a {...,[contrast],"[other, hand]",0.986378,1.129735
333,deepseek,A {{NAME1}} and a {{NAME2}} have been dating f...,A {{NAME1}} and a {{NAME2}} have been dating t...,[],[],0.9887,1.92403
455,deepseek,The college counselor was giving advice to {{N...,The college counselor was giving advice concer...,[concerning],[],0.990873,1.047683
400,deepseek,{{NAME1}} and {{NAME2}} are interviewing an en...,{{NAME1}} and {{NAME2}} are interviewing an en...,[behalf],[],0.977124,0.855791


# Filtering

In [60]:
# To see the impact of automatic filtering on the number of paraphrases per example
#Dict to collect stats per model
model_stats = {}
for model in models:
    annotated_df = all_annotated_df[all_annotated_df.model == model].copy()
    # Keep only paraphrases that were modified and carry a human verdict.
    annotated_df = annotated_df[(annotated_df.nb_modif != 0) & (annotated_df.keep.notna())]
    # Aggregate the automatic verdicts per input setting.
    df_grouped = annotated_df.groupby('unique_id', as_index=False).agg(
        keep_any=('automated_keep', 'any'),
        keep_sum=('automated_keep', 'sum'),
        keep_total=('automated_keep', 'count')
    )
    df_grouped['keep_pct'] = df_grouped['keep_sum'] / df_grouped['keep_total']
    model_stats[model] = {
        # Share of inputs with at least one automatically kept paraphrase.
        "Perf@Any": round(df_grouped['keep_any'].sum() / len(df_grouped), 3),
        # Mean per-input share of automatically kept paraphrases. The percent sign
        # is escaped because a bare % starts a LaTeX comment and would swallow the row.
        r"Valid\%": round(df_grouped['keep_pct'].mean(), 3),
    }

# A dict of dicts already yields metrics as rows and models as columns.
df_latex = pd.DataFrame(model_stats)

# Create LaTeX table
# NOTE(review): caption and label are identical to those of the human-annotation
# table; LaTeX reports a multiply-defined label if both tables go into one document.
latex_table = df_latex.to_latex(float_format="%.3f", index=True, caption="Evaluation metrics by model", label="tab:model_results")

print(latex_table)

\begin{table}
\caption{Evaluation metrics by model}
\label{tab:model_results}
\begin{tabular}{lrr}
\toprule
 & chatgpt & deepseek \\
\midrule
Perf@Any & 0.840 & 0.807 \\
Valid% & 0.834 & 0.650 \\
\bottomrule
\end{tabular}
\end{table}

