This notebook evaluates the automatic detection tools against the human annotations.

In [42]:
import pandas as pd
import itertools
import ast
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
import spacy
from nltk.stem import PorterStemmer

In [43]:
# Load spaCy English model
nlp = spacy.load("en_core_web_sm")
stemmer = PorterStemmer()

In [59]:
# Config
# Modification type to analyze; the cells below branch on
# 'prepositions', 'AAE' and 'formal'. Change it here.
modification = 'formal'
# Models whose annotated paraphrase files are loaded.
models = [
    'chatgpt',
    'deepseek',
]

In [60]:
# Paths
# Folder holding the annotated paraphrase files for the selected modification
# (relative to the notebook location).
DATA_FOLDER = '../data/paraphrases/{}/'.format(modification)

# Data import

In [61]:
# Read one annotated Excel file per model, tag its rows, and stack everything
# into a single frame.
all_dfs = []
for model in models:
    ANNOTATED_FILE = f"{DATA_FOLDER}Gender_identity_{modification}_{model}_annotated.xlsx"
    model_df = pd.read_excel(ANNOTATED_FILE)
    # Remember which model produced each row.
    model_df['model'] = model
    # A setting is identified by its idx combined with its disambiguated flag.
    idx_as_text = model_df['idx'].astype(str)
    disambiguated_as_text = model_df['disambiguated'].astype(str)
    model_df['unique_id'] = idx_as_text + '_' + disambiguated_as_text
    # 120 is the number of settings this check is written against.
    n_settings_found = len(model_df['unique_id'].unique())
    print(f"Number of settings without paraphrases for {model}:", 120 - n_settings_found)
    all_dfs.append(model_df)
all_annotated_df = pd.concat(all_dfs, ignore_index=True)

Number of settings without paraphrases for chatgpt: 0
Number of settings without paraphrases for deepseek: 0


In [62]:
# Cleaning
if modification == 'prepositions':
    # These columns are stored as text; parse them back into Python literals.
    for col in ("wrong_added", "wrong_removed"):
        all_annotated_df[col] = all_annotated_df[col].apply(ast.literal_eval)
elif modification in ('AAE', 'formal'):
    # Round the probabilities to two decimals before they are compared later on.
    for col in ("proba_par", "proba_ori"):
        all_annotated_df[col] = all_annotated_df[col].apply(lambda value: round(value, 2))

# Human annotation analysis

In [63]:
#To do the table 2 in the paper
# Dict to collect stats per model
model_stats = {}
for model in models:
    model_rows = all_annotated_df[all_annotated_df.model == model]
    # Both counts are taken BEFORE filtering, so that the share of paraphrases
    # without edits is computed over all generated rows (the previous
    # denominator was the row count after those very rows had been removed).
    n_generated = len(model_rows)
    no_modif_count = (model_rows.nb_modif == 0).sum()

    # Keep only paraphrases that were edited and have a human annotation.
    # .copy() comes after the filter so the column assignments below act on an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    annotated_df = model_rows[(model_rows.nb_modif != 0) & (model_rows.keep.notna())].copy()
    annotated_df['keep'] = annotated_df['keep'].astype(bool)
    # Input length in whitespace-separated tokens, used to normalise edit counts.
    annotated_df['input_len'] = annotated_df['original'].str.split().str.len()
    annotated_df['edits_per_word'] = annotated_df['nb_modif'] / annotated_df['input_len']

    # One row per input setting: any kept / number kept / number annotated.
    df_grouped = annotated_df.groupby('unique_id', as_index=False).agg(
        keep_any=('keep', 'any'),
        keep_sum=('keep', 'sum'),
        keep_total=('keep', 'count')
    )
    df_grouped['keep_pct'] = df_grouped['keep_sum'] / df_grouped['keep_total']
    # Number of paraphrases rejected by the annotators.
    n_errors = len(annotated_df[~annotated_df.keep])
    # Raw strings: "\%" must reach LaTeX verbatim, and "\%" in a normal string
    # literal is an invalid escape sequence that Python warns about.
    model_stats[model] = {
        "Generated Paraphrases per Input": round(df_grouped['keep_total'].mean(), 2),
        "Mean Number of Edits per Word": round(annotated_df['edits_per_word'].mean(), 2),
        r"Inputs Without Edits (\%)": round(no_modif_count / n_generated * 100, 1),

        r"Inputs with At Least One Kept Paraphrase (\%)": round(df_grouped['keep_any'].sum() / len(df_grouped) * 100, 1),
        r"Global Proportion of Valid Paraphrases (\%) ": round(annotated_df['keep'].mean() * 100, 1),
        r"Average Valid Paraphrase Ratio per Input (\%)": round(df_grouped['keep_pct'].mean() * 100, 1),

        # Error-type shares are relative to the number of rejected paraphrases.
        r"Correctness Errors (\%)": round(annotated_df['wrong_modif'].notna().sum() / n_errors * 100, 1),
        r"Realism Errors (\%)": round(annotated_df['realism'].notna().sum() / n_errors * 100, 1),
        r"Meaning Errors (\%)": round(annotated_df['meaning'].notna().sum() / n_errors * 100, 1),
    }

# Metrics as rows, models as columns (the dict-of-dicts constructor already
# yields this layout, so no transpose is needed).
df_latex = pd.DataFrame(model_stats)

# Create LaTeX table
latex_table = df_latex.to_latex(float_format="%.1f", index=True, caption=f"Evaluation metrics by model for modification {modification}", label="tab:model_results")

print(latex_table)

\begin{table}
\caption{Evaluation metrics by model for modification formal}
\label{tab:model_results}
\begin{tabular}{lrr}
\toprule
 & chatgpt & deepseek \\
\midrule
Generated Paraphrases per Input & 4.5 & 4.7 \\
Mean Number of Edits per Word & 0.6 & 0.7 \\
Inputs Without Edits (\%) & 0.0 & 0.5 \\
Inputs with At Least One Kept Paraphrase (\%) & 100.0 & 98.3 \\
Global Proportion of Valid Paraphrases (\%)  & 91.9 & 88.7 \\
Average Valid Paraphrase Ratio per Input (\%) & 92.7 & 88.0 \\
Correctness Errors (\%) & 84.1 & 49.2 \\
Realism Errors (\%) & 9.1 & 7.9 \\
Meaning Errors (\%) & 11.4 & 34.9 \\
\bottomrule
\end{tabular}
\end{table}



# Automatic detection

In [64]:
#Filtering out sentences with no modification and no annotation
# Filter once and take an explicit copy, so that the column assignments made
# here and in later cells act on an independent frame rather than on a slice
# of the loaded data (avoids SettingWithCopyWarning).
all_annotated_df = all_annotated_df[
    (all_annotated_df.nb_modif != 0) & all_annotated_df.keep.notna()
].copy()
# The human decision becomes a real boolean so it can be used as a mask.
all_annotated_df['keep'] = all_annotated_df['keep'].astype(bool)

In [65]:
# Similarity metrics analysis
# Summary statistics for each similarity score column, in turn.
for similarity_column in ("rouge_l", "bert_score", "sbert_score"):
    print(all_annotated_df[similarity_column].describe())

count    1102.000000
mean        0.691907
std         0.131939
min         0.342857
25%         0.596834
50%         0.701754
75%         0.787879
max         1.000000
Name: rouge_l, dtype: float64
count    1102.000000
mean        0.971726
std         0.013439
min         0.929204
25%         0.963672
50%         0.973598
75%         0.981413
max         0.998459
Name: bert_score, dtype: float64
count    1102.000000
mean        0.931833
std         0.070703
min         0.563168
25%         0.902019
50%         0.960688
75%         0.984177
max         0.991305
Name: sbert_score, dtype: float64


In [66]:
# Perplexity metric analysis
# Ratio of paraphrase perplexity to original perplexity, stored as a new column.
all_annotated_df["perplexity_ratio"] = (
    all_annotated_df["perplexity_par"] / all_annotated_df["perplexity_original"]
)
for perplexity_column in ("perplexity_ratio", "perplexity_par"):
    print(all_annotated_df[perplexity_column].describe())

count    1102.000000
mean        1.248203
std         0.368289
min         0.323196
25%         0.983341
50%         1.190847
75%         1.449893
max         3.037339
Name: perplexity_ratio, dtype: float64
count    1102.000000
mean       27.153876
std        13.799774
min         8.270179
25%        17.032090
50%        23.053087
75%        34.754127
max        93.105911
Name: perplexity_par, dtype: float64


In [96]:
# Checking for thresholds
# 95th percentile of the perplexity ratio, as a one-element array.
q = all_annotated_df[["perplexity_ratio"]].quantile(0.95).values
print(q)
# Number of human-kept paraphrases whose ratio exceeds the 1.85 candidate cut-off.
above_cutoff_and_kept = (all_annotated_df["perplexity_ratio"] > 1.85) & (all_annotated_df["keep"])
len(all_annotated_df[above_cutoff_and_kept])

[1.96477269]


65

In [97]:
# 5th percentile of the SBERT score, as a one-element array.
q = all_annotated_df[["sbert_score"]].quantile(0.05).values
print(q)
# Number of human-rejected paraphrases whose score falls under the 0.8 candidate cut-off.
below_cutoff_and_rejected = (all_annotated_df["sbert_score"] < 0.8) & (~all_annotated_df["keep"])
len(all_annotated_df[below_cutoff_and_rejected])

[0.77499473]


9

In [98]:
#Utils functions for automatic check
def lemmatize_list(words):
    '''Return, for each entry of `words`, the lemma of its first token.

    Each entry is run through the module-level `nlp` pipeline on its own and
    only token 0 of the result is used.
    '''
    lemmas = []
    for word in words:
        first_token = nlp(word)[0]
        lemmas.append(first_token.lemma_)
    return lemmas

def compare_lemmas(row):
    '''For the prepositions modification: True when the lemmas of the row's
    `wrong_added` words equal, in the same order, those of its `wrong_removed` words.'''
    added_lemmas = lemmatize_list(row['wrong_added'])
    removed_lemmas = lemmatize_list(row['wrong_removed'])
    return added_lemmas == removed_lemmas

def stem_list(words):
    '''Return the stem of every entry of `words`, using the module-level `stemmer`.'''
    return list(map(stemmer.stem, words))

def compare_stems(row):
    '''For the prepositions modification: True when the stems of the row's
    `wrong_added` words equal, in the same order, those of its `wrong_removed` words.'''
    added_stems = stem_list(row['wrong_added'])
    removed_stems = stem_list(row['wrong_removed'])
    return added_stems == removed_stems

In [101]:
# Automatic rules per type of modification
# Each branch combines a modification-specific check with score thresholds and
# stores the resulting boolean decision in the `automated_keep` column.
if modification == 'prepositions':
    # No wrongly added and no wrongly removed word at all ...
    no_wrong_edit = (
        all_annotated_df["wrong_added"].apply(lambda words: words == [])
        & all_annotated_df["wrong_removed"].apply(lambda words: words == [])
    )
    # ... or the wrong additions/removals match once lemmatized or stemmed.
    same_lemmas = all_annotated_df.apply(compare_lemmas, axis=1)
    same_stems = all_annotated_df.apply(compare_stems, axis=1)
    ratio_ok = all_annotated_df["perplexity_ratio"] < 1.85
    sbert_ok = all_annotated_df["sbert_score"] > 0.8
    all_annotated_df['automated_keep'] = (no_wrong_edit | same_lemmas | same_stems) & ratio_ok & sbert_ok

elif modification == 'AAE':
    # Classifier label is LABEL_1, or the paraphrase probability dropped
    # relative to the original and is at most 0.9.
    label_ok = all_annotated_df["label_par"] == 'LABEL_1'
    proba_ok = (
        (all_annotated_df["proba_par"] < all_annotated_df["proba_ori"])
        & (all_annotated_df["proba_par"] <= 0.9)
    )
    sbert_ok = all_annotated_df["sbert_score"] > 0.75
    all_annotated_df['automated_keep'] = (label_ok | proba_ok) & sbert_ok
elif modification == 'formal':
    # Classifier label is 'formal', or the paraphrase probability dropped
    # relative to the original.
    label_ok = all_annotated_df["label_par"] == 'formal'
    proba_ok = all_annotated_df["proba_par"] < all_annotated_df["proba_ori"]
    ratio_ok = all_annotated_df["perplexity_ratio"] < 2
    sbert_ok = all_annotated_df["sbert_score"] > 0.75
    all_annotated_df['automated_keep'] = (label_ok | proba_ok) & ratio_ok & sbert_ok

In [102]:
# Confusion matrix
# Rows: human decision; columns: decision of the automatic rules.
human_decision = all_annotated_df['keep']
automated_decision = all_annotated_df['automated_keep']
conf_matrix = pd.crosstab(
    index=human_decision,
    columns=automated_decision,
    rownames=['Human'],
    colnames=['Automated'],
)
print(conf_matrix)

Automated  False  True 
Human                  
False         38     69
True         129    866


In [103]:
# Performance Metrics
# The human annotation is the reference; the automatic decision is the prediction.
y_true, y_pred = all_annotated_df['keep'], all_annotated_df['automated_keep']

precision, recall, f1, accuracy = (
    scorer(y_true, y_pred)
    for scorer in (precision_score, recall_score, f1_score, accuracy_score)
)

print(
    f"\nPrecision: {precision:.3f}",
    f"Recall:    {recall:.3f}",
    f"F1 Score:  {f1:.3f}",
    f"Accuracy:  {accuracy:.3f}",
    sep="\n",
)


Precision: 0.926
Recall:    0.870
F1 Score:  0.897
Accuracy:  0.820


In [93]:
#False positives examples
# Rows rejected by the human annotators but accepted by the automatic rules.
false_positives = all_annotated_df.loc[
    (~all_annotated_df["keep"]) & all_annotated_df["automated_keep"],
    ["unique_id", "model", "original", "raw_answer", "nb_modif", "wrong_modif", "realism", "meaning"],
]
# Fixed seed (same value as the false-negatives cell) so the displayed examples
# are identical on every run; n is capped so that fewer than 5 matching rows
# does not raise.
false_positives.sample(n=min(5, len(false_positives)), random_state=42)

Unnamed: 0,unique_id,model,original,raw_answer,nb_modif,wrong_modif,realism,meaning
641,10_False,deepseek,The family with young kids needed a new babysi...,The family with young children needed a new ba...,4,1.0,,
728,20_False,deepseek,A {{NAME1}} and a {{NAME2}} had both missed a ...,A {{NAME1}} and a {{NAME2}} had both missed a ...,5,1.0,,
787,26_False,deepseek,"A married couple, a {{NAME1}} and a {{NAME2}},...","A married couple, a {{NAME1}} and a {{NAME2}},...",10,1.0,,
278,29_False,chatgpt,The college counselor was giving advice to a {...,The college counselor was advising a {{NAME1}}...,4,1.0,,
754,22_True,deepseek,A {{NAME1}} and a {{NAME2}} are both trying ou...,A {{NAME1}} and a {{NAME2}} are both auditioni...,29,1.0,,


In [48]:
# False negatives examples
# Pick the columns worth showing for the current modification; nothing is
# displayed for any other modification value.
if modification == 'prepositions':
    fn_columns = ["model", "original", "raw_answer", "wrong_added", "wrong_removed", "sbert_score", "perplexity_ratio"]
elif modification in ('formal', 'AAE'):
    fn_columns = ["model", "original", "raw_answer", "proba_par", "label_par", "proba_ori", "label_ori", "sbert_score", "perplexity_ratio"]
else:
    fn_columns = None

if fn_columns is not None:
    # Kept by the human annotators but rejected by the automatic rules.
    kept_but_rejected = (all_annotated_df["keep"]) & (~all_annotated_df["automated_keep"])
    display(all_annotated_df.loc[kept_but_rejected, fn_columns].sample(n=5, random_state=42))

Unnamed: 0,model,original,raw_answer,wrong_added,wrong_removed,sbert_score,perplexity_ratio
470,deepseek,"I'm supervising two interns, {{NAME1}} and {{N...","I'm supervising two interns, {{NAME1}} and {{N...",[contrast],"[other, hand]",0.969329,1.145996
328,deepseek,The college counselor was giving advice to a {...,The college counselor was giving advice to a {...,[contrast],"[other, hand]",0.986378,1.129735
333,deepseek,A {{NAME1}} and a {{NAME2}} have been dating f...,A {{NAME1}} and a {{NAME2}} have been dating t...,[],[],0.9887,1.92403
455,deepseek,The college counselor was giving advice to {{N...,The college counselor was giving advice concer...,[concerning],[],0.990873,1.047683
400,deepseek,{{NAME1}} and {{NAME2}} are interviewing an en...,{{NAME1}} and {{NAME2}} are interviewing an en...,[behalf],[],0.977124,0.855791


# Filtering

In [60]:
# To see the impact of automatic filtering on the number of paraphrases per example
#Dict to collect stats per model
model_stats = {}
for model in models:
    annotated_df = all_annotated_df[all_annotated_df.model == model].copy()
    # Same row selection as the human-annotation table: edited and annotated rows only.
    annotated_df = annotated_df[(annotated_df.nb_modif != 0) & (annotated_df.keep.notna())]
    # One row per input setting: any paraphrase kept by the automatic rules /
    # number kept / number considered.
    df_grouped = annotated_df.groupby('unique_id', as_index=False).agg(
        keep_any=('automated_keep', 'any'),
        keep_sum=('automated_keep', 'sum'),
        keep_total=('automated_keep', 'count')
    )
    df_grouped['keep_pct'] = df_grouped['keep_sum'] / df_grouped['keep_total']
    model_stats[model] = {
        "Perf@Any": round(df_grouped['keep_any'].sum() / len(df_grouped), 3),
        # The percent sign is escaped: a bare "%" in the row label starts a
        # LaTeX comment and swallows the rest of the table row.
        r"Valid\%": round(df_grouped['keep_pct'].mean(), 3),
    }

# Metrics as rows, models as columns (the dict-of-dicts constructor already
# yields this layout, so no transpose is needed).
df_latex = pd.DataFrame(model_stats)

# Create LaTeX table
# NOTE(review): this label is also used by the human-annotation table cell;
# give one of them a distinct label if both tables end up in the same document.
latex_table = df_latex.to_latex(float_format="%.3f", index=True, caption="Evaluation metrics by model", label="tab:model_results")

print(latex_table)

\begin{table}
\caption{Evaluation metrics by model}
\label{tab:model_results}
\begin{tabular}{lrr}
\toprule
 & chatgpt & deepseek \\
\midrule
Perf@Any & 0.840 & 0.807 \\
Valid% & 0.834 & 0.650 \\
\bottomrule
\end{tabular}
\end{table}

