In [1]:
from src.estimation import estimate_text_distribution
from src.MLE import MLE
import pandas as pd 

In [2]:
# Estimate, subject by subject, the distribution of human-written text and
# AI-generated text from that subject's training data.
for name in ["CS","EESS","Math","Phys","Stat"]:
    human_path = f"data/training_data/{name}/human_data.parquet"
    ai_path = f"data/training_data/{name}/ai_data.parquet"
    # Presumably the location the estimated distribution is written to -- this is
    # the same path the MLE model is constructed from in the evaluation cells.
    distribution_path = f"distribution/{name}.parquet"
    estimate_text_distribution(human_path, ai_path, distribution_path)

In [2]:
# Peek at the CS human-written training set; the trailing bare expression keeps
# the notebook's rich DataFrame rendering.
cs_human_training = pd.read_parquet("data/training_data/CS/human_data.parquet")
cs_human_training

Unnamed: 0,human_sentence,sentence
0,"[particularly the, the proposed, proposed spre...",particularly the proposed spreading curve view...
1,"[this mixed, mixed approach, approach has, has...",this mixed approach has rarely been applied in...
2,"[firstly observing, observing the, the limited...",firstly observing the limited rotation invaria...
3,"[however existing, existing methods, methods e...",however existing methods either require expens...
4,"[in this, this task, task a, a fused, fused im...",in this task a fused image containing both the...
...,...,...
37855,"[active learning, learning shows, shows promis...",active learning shows promise to decrease test...
37856,"[finally simulations, simulations using, using...",finally simulations using flashflow for load b...
37857,"[in this, this paper, paper we, we model, mode...",in this paper we model this intention as a lat...
37858,"[these results, results help, help to, to iden...",these results help to identify the most critic...


In [4]:
# Peek at the CS validation set for ground-truth alpha = 0.1; the trailing bare
# expression keeps the notebook's rich DataFrame rendering.
cs_validation_alpha_0_1 = pd.read_parquet("data/validation_data/CS/ground_truth_alpha_0.1.parquet")
cs_validation_alpha_0_1

Unnamed: 0,inference_sentence,sentence
0,"[however empirical, empirical studies, studies...",however empirical studies show that linearizin...
1,"[inspired by, by our, our work, work on, on de...",inspired by our work on deep multimodal learni...
2,"[however if, if attacked, attacked it, it coul...",however if attacked it could still be compromi...
3,"[existing methods, methods for, for speaker, s...",existing methods for speaker age estimation us...
4,"[estimation of, of the, the heart, heart rate,...",estimation of the heart rate from facial video...
...,...,...
29995,"[while achieving, achieving over, over times, ...",while achieving over times inference speedup
29996,"[moreover the, the research, research provides...",moreover the research provides theoretical evi...
29997,"[one approach, approach to, to enhance, enhanc...",one approach to enhance secrecy is by studying...
29998,"[exploiting internal, internal spatial, spatia...",exploiting internal spatial geometric constrai...


In [6]:
# For every subject, estimate the alpha value of each mixed-text validation set
# and report the absolute error against the ground-truth alpha.
subjects = ["CS","EESS","Math","Phys","Stat"]
alpha_grid = [0,0.025,0.05,0.075,0.1,0.125,0.15,0.175,0.2,0.225,0.25]
# Column titles; printed before every result row, exactly as rows are produced.
header_row = f"{'Ground Truth':>10},{'Prediction':>10},{'CI':>10},{'Error':>10}"
separator = "====================================="

for name in subjects:
    print(name + " Results:")
    # Load the framework from this subject's distribution file.
    model = MLE(f"distribution/{name}.parquet")
    for alpha in alpha_grid:
        # The validation file name embeds alpha as Python formats it (0, 0.025, ...).
        validation_path = f"data/validation_data/{name}/ground_truth_alpha_{alpha}.parquet"
        estimated, ci = model.inference(validation_path)
        error = abs(estimated - alpha)
        print(header_row)
        print(f"{alpha:10.3f},{estimated:10.3f},{ci:10.3f},{error:10.3f}")
    print(separator)

CS Results:
Ground Truth,Prediction,        CI,     Error
     0.000,     0.037,     0.003,     0.037
Ground Truth,Prediction,        CI,     Error
     0.025,     0.066,     0.003,     0.041
Ground Truth,Prediction,        CI,     Error
     0.050,     0.091,     0.004,     0.041
Ground Truth,Prediction,        CI,     Error
     0.075,     0.113,     0.004,     0.038
Ground Truth,Prediction,        CI,     Error
     0.100,     0.140,     0.005,     0.040
Ground Truth,Prediction,        CI,     Error
     0.125,     0.162,     0.005,     0.037
Ground Truth,Prediction,        CI,     Error
     0.150,     0.182,     0.005,     0.032
Ground Truth,Prediction,        CI,     Error
     0.175,     0.206,     0.005,     0.031
Ground Truth,Prediction,        CI,     Error
     0.200,     0.231,     0.005,     0.031
Ground Truth,Prediction,        CI,     Error
     0.225,     0.254,     0.006,     0.029
Ground Truth,Prediction,        CI,     Error
     0.250,     0.278,     0.006,     0.02

In [3]:
# For each subject, estimate the alpha value of mixed text and calculate the error
# using only a specific part-of-speech vocabulary ('adj', 'verb' or 'adv').
# Every line is echoed to stdout and also saved to results_<part_of_speech>.txt.

subjects = ["CS", "EESS", "Math", "Phys", "Stat"]
alpha_grid = [0, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2, 0.225, 0.25]
# Column titles; emitted before every result row.
line_header = f"{'Ground Truth':>10},{'Prediction':>10},{'CI':>10},{'Error':>10}"
separator = "====================================="

for part_of_speech in ['adj', 'verb', 'adv']:
    # One results file per part of speech, opened in write mode (replaced on each run).
    with open(f"results_{part_of_speech}.txt", "w") as file:
        banner = f"Results for vocabulary with just {part_of_speech}\n\n"
        print(banner)
        file.write(banner)
        for name in subjects:
            subject_title = f"{name} Results:"
            print(subject_title)
            file.write(subject_title + "\n")
            # Load the framework with the part-of-speech filter passed through as `pos`.
            model = MLE(f"distribution/{name}.parquet", pos=part_of_speech)
            for alpha in alpha_grid:
                validation_path = f"data/validation_data/{name}/ground_truth_alpha_{alpha}.parquet"
                estimated, ci = model.inference(validation_path)
                error = abs(estimated - alpha)
                line_data = f"{alpha:10.3f},{estimated:10.3f},{ci:10.3f},{error:10.3f}"
                print(line_header)
                print(line_data)
                file.write(f"{line_header}\n{line_data}\n")
            print(separator)
            file.write(separator + "\n")

Results for vocabulary with just adj


CS Results:
Ground Truth,Prediction,        CI,     Error
     0.000,     0.016,     0.006,     0.016
Ground Truth,Prediction,        CI,     Error
     0.025,     0.044,     0.007,     0.019
Ground Truth,Prediction,        CI,     Error
     0.050,     0.062,     0.008,     0.012
Ground Truth,Prediction,        CI,     Error
     0.075,     0.082,     0.008,     0.007
Ground Truth,Prediction,        CI,     Error
     0.100,     0.112,     0.009,     0.012
Ground Truth,Prediction,        CI,     Error
     0.125,     0.128,     0.010,     0.003
Ground Truth,Prediction,        CI,     Error
     0.150,     0.149,     0.010,     0.001
Ground Truth,Prediction,        CI,     Error
     0.175,     0.174,     0.011,     0.001
Ground Truth,Prediction,        CI,     Error
     0.200,     0.194,     0.011,     0.006
Ground Truth,Prediction,        CI,     Error
     0.225,     0.222,     0.011,     0.003
Ground Truth,Prediction,        CI,     Error
   