In [1]:
from src.estimation import estimate_text_distribution
from src.MLE import MLE
import pandas as pd 

In [2]:
# Estimate the human-written vs. AI-generated text distribution for every subject.
# Each call is given the subject's two training parquet files plus
# distribution/<subject>.parquet, the same path the MLE cells below load
# (presumably the file this call writes - confirm in src.estimation).
for name in ("CS", "EESS", "Math", "Phys", "Stat"):
    training_dir = f"data/training_data/{name}"
    estimate_text_distribution(
        f"{training_dir}/human_data.parquet",
        f"{training_dir}/ai_data.parquet",
        f"distribution/{name}.parquet",
    )

In [3]:
# Preview the CS human-written training set. The bare last expression is shown
# with pandas' rich (truncated) display; nothing is assigned from the frame.
cs_human_training_path = "data/training_data/CS/human_data.parquet"
pd.read_parquet(cs_human_training_path)

Unnamed: 0,human_sentence,sentence
0,"[particularly the proposed, the proposed sprea...",particularly the proposed spreading curve view...
1,"[this mixed approach, mixed approach has, appr...",this mixed approach has rarely been applied in...
2,"[firstly observing the, observing the limited,...",firstly observing the limited rotation invaria...
3,"[however existing methods, existing methods ei...",however existing methods either require expens...
4,"[in this task, this task a, task a fused, a fu...",in this task a fused image containing both the...
...,...,...
37855,"[active learning shows, learning shows promise...",active learning shows promise to decrease test...
37856,"[finally simulations using, simulations using ...",finally simulations using flashflow for load b...
37857,"[in this paper, this paper we, paper we model,...",in this paper we model this intention as a lat...
37858,"[these results help, results help to, help to ...",these results help to identify the most critic...


In [4]:
# Preview the CS validation set whose file name marks a ground-truth alpha of 0.1.
# The bare last expression is shown with pandas' rich (truncated) display.
cs_validation_path = "data/validation_data/CS/ground_truth_alpha_0.1.parquet"
pd.read_parquet(cs_validation_path)

Unnamed: 0,inference_sentence,sentence
0,"[however empirical studies, empirical studies ...",however empirical studies show that linearizin...
1,"[inspired by our, by our work, our work on, wo...",inspired by our work on deep multimodal learni...
2,"[however if attacked, if attacked it, attacked...",however if attacked it could still be compromi...
3,"[existing methods for, methods for speaker, fo...",existing methods for speaker age estimation us...
4,"[estimation of the, of the heart, the heart ra...",estimation of the heart rate from facial video...
...,...,...
29995,"[while achieving over, achieving over times, o...",while achieving over times inference speedup
29996,"[moreover the research, the research provides,...",moreover the research provides theoretical evi...
29997,"[one approach to, approach to enhance, to enha...",one approach to enhance secrecy is by studying...
29998,"[exploiting internal spatial, internal spatial...",exploiting internal spatial geometric constrai...


In [5]:
# For every subject, run MLE inference on each mixed validation set and report the
# absolute error between the estimated alpha and the ground-truth alpha.

# The header text never changes, so build it once; it is still printed before every row.
header_row = ",".join(
    f"{column:>10}" for column in ("Ground Truth", "Prediction", "CI", "Error")
)
separator = "====================================="

for name in ("CS", "EESS", "Math", "Phys", "Stat"):
    print(f"{name} Results:")
    # Load the framework from this subject's estimated distribution file.
    model = MLE(f"distribution/{name}.parquet")
    # Keep the alphas as literals: str(alpha) is interpolated into the file name,
    # so computed floats (e.g. 3 * 0.025) could produce a non-matching name.
    for alpha in [0, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2, 0.225, 0.25]:
        validation_path = f"data/validation_data/{name}/ground_truth_alpha_{alpha}.parquet"
        estimated, ci = model.inference(validation_path)
        error = abs(estimated - alpha)
        print(header_row)
        print(",".join(f"{value:10.3f}" for value in (alpha, estimated, ci, error)))
    print(separator)

CS Results:
Ground Truth,Prediction,        CI,     Error
     0.000,     0.049,     0.005,     0.049
Ground Truth,Prediction,        CI,     Error
     0.025,     0.075,     0.005,     0.050
Ground Truth,Prediction,        CI,     Error
     0.050,     0.099,     0.006,     0.049
Ground Truth,Prediction,        CI,     Error
     0.075,     0.126,     0.006,     0.051
Ground Truth,Prediction,        CI,     Error
     0.100,     0.152,     0.006,     0.052
Ground Truth,Prediction,        CI,     Error
     0.125,     0.175,     0.007,     0.050
Ground Truth,Prediction,        CI,     Error
     0.150,     0.197,     0.007,     0.047
Ground Truth,Prediction,        CI,     Error
     0.175,     0.217,     0.007,     0.042
Ground Truth,Prediction,        CI,     Error
     0.200,     0.245,     0.008,     0.045
Ground Truth,Prediction,        CI,     Error
     0.225,     0.274,     0.007,     0.049
Ground Truth,Prediction,        CI,     Error
     0.250,     0.296,     0.008,     0.04

In [3]:
# For every part of speech, repeat the per-subject alpha validation with an MLE
# constructed with pos=<part of speech> (per the report title, a vocabulary with
# just that part of speech). Every line is echoed to stdout and also written to
# results_<part_of_speech>.txt.

# The header text never changes, so build it once; it is still emitted before every row.
line_header = ",".join(
    f"{column:>10}" for column in ("Ground Truth", "Prediction", "CI", "Error")
)
separator = "====================================="

for part_of_speech in ("adj", "verb", "adv"):
    # Mode "w": an existing report for this part of speech is overwritten.
    with open(f"results_{part_of_speech}.txt", "w") as report_file:
        report_title = f"Results for vocabulary with just {part_of_speech}\n\n"
        # print() appends its own newline, so stdout gets one more blank line than the file.
        print(report_title)
        report_file.write(report_title)
        for name in ("CS", "EESS", "Math", "Phys", "Stat"):
            subject_heading = f"{name} Results:"
            print(subject_heading)
            report_file.write(subject_heading + "\n")
            # Load the framework for this subject and part of speech.
            model = MLE(f"distribution/{name}.parquet", pos=part_of_speech)
            # Keep the alphas as literals: str(alpha) is interpolated into the file name.
            for alpha in [0, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2, 0.225, 0.25]:
                validation_path = f"data/validation_data/{name}/ground_truth_alpha_{alpha}.parquet"
                estimated, ci = model.inference(validation_path)
                error = abs(estimated - alpha)
                line_data = ",".join(
                    f"{value:10.3f}" for value in (alpha, estimated, ci, error)
                )
                print(line_header, line_data, sep="\n")
                report_file.write(f"{line_header}\n{line_data}\n")
            print(separator)
            report_file.write(separator + "\n")

Results for vocabulary with just adj


CS Results:
Ground Truth,Prediction,        CI,     Error
     0.000,     0.019,     0.008,     0.019
Ground Truth,Prediction,        CI,     Error
     0.025,     0.045,     0.010,     0.020
Ground Truth,Prediction,        CI,     Error
     0.050,     0.057,     0.011,     0.007
Ground Truth,Prediction,        CI,     Error
     0.075,     0.078,     0.011,     0.003
Ground Truth,Prediction,        CI,     Error
     0.100,     0.115,     0.012,     0.015
Ground Truth,Prediction,        CI,     Error
     0.125,     0.122,     0.013,     0.003
Ground Truth,Prediction,        CI,     Error
     0.150,     0.159,     0.015,     0.009
Ground Truth,Prediction,        CI,     Error
     0.175,     0.173,     0.015,     0.002
Ground Truth,Prediction,        CI,     Error
     0.200,     0.181,     0.015,     0.019
Ground Truth,Prediction,        CI,     Error
     0.225,     0.227,     0.017,     0.002
Ground Truth,Prediction,        CI,     Error
   