In [1]:
from src.estimation import estimate_text_distribution
from src.MLE import MLE
import pandas as pd 

In [2]:
# Estimate the human-written vs. AI-generated text distribution for every subject.
# The third argument is the same path that the evaluation cells later hand to MLE(...),
# so it is presumably where the fitted distribution is saved -- confirm in src.estimation.
for name in ("CS", "EESS", "Math", "Phys", "Stat"):
    training_dir = f"data/training_data/{name}"
    estimate_text_distribution(
        f"{training_dir}/human_data.parquet",
        f"{training_dir}/ai_data.parquet",
        f"distribution/{name}.parquet",
    )

In [3]:
# Preview the CS human-written training data; the frame is the cell's displayed output.
cs_human_training_path = "data/training_data/CS/human_data.parquet"
pd.read_parquet(cs_human_training_path)

Unnamed: 0,human_sentence,sentence
0,"[particularly, the, proposed, spreading, curve...",particularly the proposed spreading curve view...
1,"[this, mixed, approach, has, rarely, been, app...",this mixed approach has rarely been applied in...
2,"[firstly, observing, the, limited, rotation, i...",firstly observing the limited rotation invaria...
3,"[however, existing, methods, either, require, ...",however existing methods either require expens...
4,"[in, this, task, a, fused, image, containing, ...",in this task a fused image containing both the...
...,...,...
37855,"[active, learning, shows, promise, to, decreas...",active learning shows promise to decrease test...
37856,"[finally, simulations, using, flashflow, for, ...",finally simulations using flashflow for load b...
37857,"[in, this, paper, we, model, this, intention, ...",in this paper we model this intention as a lat...
37858,"[these, results, help, to, identify, the, most...",these results help to identify the most critic...


In [4]:
# Preview the CS validation data built for ground-truth alpha = 0.1;
# the frame is the cell's displayed output.
cs_validation_path = "data/validation_data/CS/ground_truth_alpha_0.1.parquet"
pd.read_parquet(cs_validation_path)

Unnamed: 0,inference_sentence,sentence
0,"[however, empirical, studies, show, that, line...",however empirical studies show that linearizin...
1,"[inspired, by, our, work, on, deep, multimodal...",inspired by our work on deep multimodal learni...
2,"[however, if, attacked, it, could, still, be, ...",however if attacked it could still be compromi...
3,"[existing, methods, for, speaker, age, estimat...",existing methods for speaker age estimation us...
4,"[estimation, of, the, heart, rate, from, facia...",estimation of the heart rate from facial video...
...,...,...
29995,"[while, achieving, over, times, inference, spe...",while achieving over times inference speedup
29996,"[moreover, the, research, provides, theoretica...",moreover the research provides theoretical evi...
29997,"[one, approach, to, enhance, secrecy, is, by, ...",one approach to enhance secrecy is by studying...
29998,"[exploiting, internal, spatial, geometric, con...",exploiting internal spatial geometric constrai...


In [5]:
# For each subject, estimate alpha on every validation mixture and report the
# absolute error. Every line is echoed to stdout and mirrored into results.txt.
with open("results.txt", "w") as file:
    # The column titles are the same for every row, so format them once.
    line_header = ",".join(
        f"{title:>10}" for title in ("Ground Truth", "Prediction", "CI", "Error")
    )
    separator = "====================================="

    for name in ("CS", "EESS", "Math", "Phys", "Stat"):
        subject_title = f"{name} Results:"
        print(subject_title)
        file.write(subject_title + "\n")

        # Load the framework from this subject's estimated distribution
        model = MLE(f"distribution/{name}.parquet")

        for alpha in (0, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2, 0.225, 0.25):
            validation_path = f"data/validation_data/{name}/ground_truth_alpha_{alpha}.parquet"
            estimated, ci = model.inference(validation_path)
            error = abs(estimated - alpha)
            line_data = ",".join(
                f"{value:10.3f}" for value in (alpha, estimated, ci, error)
            )

            # Same order as before: both lines to stdout first, then both to the file.
            report_lines = (line_header, line_data)
            for report_line in report_lines:
                print(report_line)
            for report_line in report_lines:
                file.write(report_line + "\n")

        print(separator)
        file.write(separator + "\n")

CS Results:
Ground Truth,Prediction,        CI,     Error
     0.000,     0.046,     0.003,     0.046
Ground Truth,Prediction,        CI,     Error
     0.025,     0.074,     0.003,     0.049
Ground Truth,Prediction,        CI,     Error
     0.050,     0.097,     0.004,     0.047
Ground Truth,Prediction,        CI,     Error
     0.075,     0.117,     0.004,     0.042
Ground Truth,Prediction,        CI,     Error
     0.100,     0.141,     0.004,     0.041
Ground Truth,Prediction,        CI,     Error
     0.125,     0.164,     0.004,     0.039
Ground Truth,Prediction,        CI,     Error
     0.150,     0.185,     0.004,     0.035
Ground Truth,Prediction,        CI,     Error
     0.175,     0.205,     0.005,     0.030
Ground Truth,Prediction,        CI,     Error
     0.200,     0.231,     0.005,     0.031
Ground Truth,Prediction,        CI,     Error
     0.225,     0.251,     0.005,     0.026
Ground Truth,Prediction,        CI,     Error
     0.250,     0.275,     0.005,     0.02

In [5]:
# For each subject, estimate alpha on every validation mixture and report the
# absolute error, this time passing pos=<part of speech> to MLE so that only a
# specific part-of-speech vocabulary is used.
# One results file is opened in write mode per part of speech.
for part_of_speech in ("adj", "verb", "adv"):
    with open(f"results_{part_of_speech}.txt", "w") as file:
        banner = f"Results for vocabulary with just {part_of_speech}\n\n"
        print(banner)
        file.write(banner)

        # The column titles are the same for every row, so format them once.
        line_header = ",".join(
            f"{title:>10}" for title in ("Ground Truth", "Prediction", "CI", "Error")
        )
        separator = "====================================="

        for name in ("CS", "EESS", "Math", "Phys", "Stat"):
            subject_title = f"{name} Results:"
            print(subject_title)
            file.write(subject_title + "\n")

            # Load the framework from this subject's estimated distribution
            model = MLE(f"distribution/{name}.parquet", pos=part_of_speech)

            for alpha in (0, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2, 0.225, 0.25):
                validation_path = f"data/validation_data/{name}/ground_truth_alpha_{alpha}.parquet"
                estimated, ci = model.inference(validation_path)
                error = abs(estimated - alpha)
                line_data = ",".join(
                    f"{value:10.3f}" for value in (alpha, estimated, ci, error)
                )

                # Same order as before: both lines to stdout first, then both to the file.
                report_lines = (line_header, line_data)
                for report_line in report_lines:
                    print(report_line)
                for report_line in report_lines:
                    file.write(report_line + "\n")

            print(separator)
            file.write(separator + "\n")

Results for vocabulary with just adj


CS Results:
Ground Truth,Prediction,        CI,     Error
     0.000,     0.014,     0.005,     0.014
Ground Truth,Prediction,        CI,     Error
     0.025,     0.031,     0.007,     0.006
Ground Truth,Prediction,        CI,     Error
     0.050,     0.039,     0.007,     0.011
Ground Truth,Prediction,        CI,     Error
     0.075,     0.053,     0.009,     0.022
Ground Truth,Prediction,        CI,     Error
     0.100,     0.073,     0.009,     0.027
Ground Truth,Prediction,        CI,     Error
     0.125,     0.084,     0.009,     0.041
Ground Truth,Prediction,        CI,     Error
     0.150,     0.106,     0.010,     0.044
Ground Truth,Prediction,        CI,     Error
     0.175,     0.122,     0.010,     0.053
Ground Truth,Prediction,        CI,     Error
     0.200,     0.135,     0.011,     0.065
Ground Truth,Prediction,        CI,     Error
     0.225,     0.148,     0.012,     0.077
Ground Truth,Prediction,        CI,     Error
   