In [1]:
# Install required libraries
!pip install --upgrade docx2txt
!pip install transformers==4.23.0

Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docx2txt
  Building wheel for docx2txt (setup.py) ... [?25l[?25hdone
  Created wheel for docx2txt: filename=docx2txt-0.8-py3-none-any.whl size=3959 sha256=ba86cbf52923d57680dd6f77e5baa749cbb49aae66716321e45e8d15724f4010
  Stored in directory: /root/.cache/pip/wheels/22/58/cf/093d0a6c3ecfdfc5f6ddd5524043b88e59a9a199cb02352966
Successfully built docx2txt
Installing collected packages: docx2txt
Successfully installed docx2txt-0.8
Collecting transformers==4.23.0
  Downloading transformers-4.23.0-py3-none-any.whl.metadata (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.7/88.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.23.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloadi

In [2]:
import docx2txt
import numpy as np
import torch
import pandas as pd
from google.colab import files
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [3]:
# Step 2: Function to read a Word (.docx) file
def upload_and_read_word(uploaded_file, newline_replacement=''):
    """Extract the text of a Word document as one string.

    Despite its name, this function uploads nothing: it only reads a file
    that is already available.

    Args:
        uploaded_file: The document to read. It is passed unchanged to
            ``docx2txt.process``; the caller in this notebook passes a file
            name.
        newline_replacement: String substituted for every newline character
            in the extracted text. The default ``''`` deletes line breaks,
            which joins the last word before a break to the first word after
            it. Pass ``' '`` to keep those words separate. Changing this
            changes the text, and therefore any metric computed from it.

    Returns:
        The extracted text with every newline character replaced.
    """
    text = docx2txt.process(uploaded_file)
    return text.replace('\n', newline_replacement)

# Step 3: Calculate Perplexity
# Cache of (tokenizer, model) pairs keyed by checkpoint name, so the weights
# are loaded once per session instead of once per document.
_GPT2_CACHE = {}


def _load_gpt2(name='gpt2'):
    """Return the (tokenizer, model) pair for ``name``, loading it on first use."""
    if name not in _GPT2_CACHE:
        _GPT2_CACHE[name] = (
            GPT2Tokenizer.from_pretrained(name),
            GPT2LMHeadModel.from_pretrained(name),
        )
    return _GPT2_CACHE[name]


def calculate_perplexity(text):
    """Compute the perplexity of ``text`` under the pretrained 'gpt2' model.

    The token sequence is scored in windows. Each window ends ``stride``
    tokens after the previous one and reaches back at most ``max_length``
    tokens. Only the final ``trg_len`` tokens of a window are used as
    targets; the earlier ones are set to -100 in the labels so that they
    serve as context only.

    Args:
        text: The text to score. Must not be empty.

    Returns:
        The perplexity as a Python float: the exponential of the summed
        per-window losses (each multiplied by its target length) divided by
        the total number of tokens.

    Raises:
        ValueError: If ``text`` is empty or produces no tokens.
    """
    if not text:
        raise ValueError("calculate_perplexity requires non-empty text")

    tokenizer, model = _load_gpt2()

    tokens = tokenizer.encode(text, return_tensors='pt')
    n_tokens = tokens.size(1)
    if n_tokens == 0:
        raise ValueError("text produced no tokens; perplexity is undefined")

    max_length = 1024
    stride = 512
    nlls = []

    for i in range(0, n_tokens, stride):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, n_tokens)
        trg_len = end_loc - i  # may be different from stride on last loop
        input_ids = tokens[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        # Mask everything except the last trg_len positions of the window.
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            # outputs[0] is the loss returned when labels are supplied;
            # scaling by trg_len weights each window by its target count.
            nll = outputs[0] * trg_len

        nlls.append(nll)

    # On the last window end_loc equals n_tokens, so this is the same
    # normalisation as dividing by the final end_loc.
    perplexity = torch.exp(torch.stack(nlls).sum() / n_tokens)
    return perplexity.item()

# Step 4: Analyze Burstiness
def calculate_burstiness(text):
    """Return the coefficient of variation of the sentence lengths in ``text``.

    Sentences are the pieces obtained by splitting ``text`` on '.', with
    empty pieces discarded. The length of a sentence is its number of
    whitespace-separated words. A piece that contains only whitespace is
    kept and counts as a sentence of length 0.

    Args:
        text: The text to analyse.

    Returns:
        Standard deviation of the sentence lengths divided by their mean,
        as a float. Returns 0.0 when there are no sentences or when the mean
        length is 0, instead of propagating NaN.
    """
    sentence_lengths = [
        len(sentence.split()) for sentence in text.split('.') if sentence
    ]

    # np.mean([]) is NaN, and NaN is truthy, so the empty case needs its own
    # guard; otherwise NaN would be returned.
    if not sentence_lengths:
        return 0.0

    mean_length = np.mean(sentence_lengths)
    if not mean_length:
        return 0.0

    return float(np.std(sentence_lengths) / mean_length)

In [4]:
# Step 1: Upload the Word (.docx) files to analyse
uploaded = files.upload()
filenames = uploaded.keys()
# Materialise the names in a list: they are iterated over and reused as a
# DataFrame column later in the notebook.
lista_texts = [*filenames]
lista_texts

Saving discussion_ai_1.docx to discussion_ai_1.docx
Saving discussion_ai_2.docx to discussion_ai_2.docx
Saving discussion_ai_3.docx to discussion_ai_3.docx
Saving discussion_ai_4.docx to discussion_ai_4.docx
Saving discussion_ai_5.docx to discussion_ai_5.docx
Saving discussion_ai_6.docx to discussion_ai_6.docx
Saving discussion_ai_7.docx to discussion_ai_7.docx
Saving discussion_ai_8.docx to discussion_ai_8.docx
Saving discussion_ai_9.docx to discussion_ai_9.docx
Saving discussion_ai_10.docx to discussion_ai_10.docx
Saving discussion_ai_humanized_1.docx to discussion_ai_humanized_1.docx
Saving discussion_ai_humanized_2.docx to discussion_ai_humanized_2.docx
Saving discussion_ai_humanized_3.docx to discussion_ai_humanized_3.docx
Saving discussion_ai_humanized_4.docx to discussion_ai_humanized_4.docx
Saving discussion_ai_humanized_5.docx to discussion_ai_humanized_5.docx
Saving discussion_ai_humanized_6.docx to discussion_ai_humanized_6.docx
Saving discussion_ai_humanized_7.docx to discu

['discussion_ai_1.docx',
 'discussion_ai_2.docx',
 'discussion_ai_3.docx',
 'discussion_ai_4.docx',
 'discussion_ai_5.docx',
 'discussion_ai_6.docx',
 'discussion_ai_7.docx',
 'discussion_ai_8.docx',
 'discussion_ai_9.docx',
 'discussion_ai_10.docx',
 'discussion_ai_humanized_1.docx',
 'discussion_ai_humanized_2.docx',
 'discussion_ai_humanized_3.docx',
 'discussion_ai_humanized_4.docx',
 'discussion_ai_humanized_5.docx',
 'discussion_ai_humanized_6.docx',
 'discussion_ai_humanized_7.docx',
 'discussion_ai_humanized_8.docx',
 'discussion_ai_humanized_9.docx',
 'discussion_ai_humanized_10.docx',
 'introduction_ai_1.docx',
 'introduction_ai_2.docx',
 'introduction_ai_3.docx',
 'introduction_ai_4.docx',
 'introduction_ai_5.docx',
 'introduction_ai_6.docx',
 'introduction_ai_7.docx',
 'introduction_ai_8.docx',
 'introduction_ai_9.docx',
 'introduction_ai_10.docx',
 'introduction_ai_humanized_1.docx',
 'introduction_ai_humanized_2.docx',
 'introduction_ai_humanized_3.docx',
 'introduction_a

In [5]:
# Main execution: compute the features of every uploaded document.
lista_perplexity = []
lista_burstiness = []
lista_token_length = []

# Load the tokenizer once; it is the same for every document.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# The loop variable is deliberately not called `files`: that name refers to
# the google.colab.files module, and rebinding it to a string would break any
# later call to files.upload().
for filename in lista_texts:
    text = upload_and_read_word(filename)
    perplexity = calculate_perplexity(text)
    lista_perplexity.append(perplexity)
    burstiness = calculate_burstiness(text)
    lista_burstiness.append(burstiness)
    token_length = len(tokenizer.encode(text))
    lista_token_length.append(token_length)


# One row per document. 'AI Metric' is filled with the constant 0 here.
df_texts = pd.DataFrame({'text': lista_texts, 'Token': lista_token_length,
                         'Perplexity': lista_perplexity, 'Burstiness': lista_burstiness, 'AI Metric': 0})
df_texts

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Unnamed: 0,text,Token,Perplexity,Burstiness,AI Metric
0,discussion_ai_1.docx,430,12.736505,0.181379,0
1,discussion_ai_2.docx,436,9.090506,0.230476,0
2,discussion_ai_3.docx,394,10.90615,0.184105,0
3,discussion_ai_4.docx,399,11.778743,0.25403,0
4,discussion_ai_5.docx,470,10.470787,0.223905,0
5,discussion_ai_6.docx,442,11.411176,0.234732,0
6,discussion_ai_7.docx,599,12.674426,0.132911,0
7,discussion_ai_8.docx,695,18.993662,0.18513,0
8,discussion_ai_9.docx,822,18.604256,0.294872,0
9,discussion_ai_10.docx,398,12.894266,0.191273,0


In [6]:
# Step 5: Upload the model file
# `files` is imported again here so that this cell works even if the name was
# rebound to something else earlier in the session.
from google.colab import files
# NOTE: this reassigns `uploaded`, replacing the mapping returned by the
# earlier document upload. The file is later opened by the name
# "rf_model_resample.pkl", so it has to be uploaded under that name.
uploaded = files.upload()

Saving rf_model_resample.pkl to rf_model_resample.pkl


In [9]:
# Load the trained classifier from the uploaded pickle file.
import pickle

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load model files that come from a trusted source.
with open("rf_model_resample.pkl", "rb") as f:
    best_rf_resample = pickle.load(f)

# Fail early, with a clear message, if the wrong file was uploaded.
if not hasattr(best_rf_resample, "predict"):
    raise TypeError(
        "rf_model_resample.pkl does not contain an object with a predict() method"
    )

In [10]:
# Feature matrix for the classifier: every column of df_texts except the file
# name ('text') and the 'AI Metric' column.
# NOTE(review): the remaining columns and their order presumably have to match
# what the model was trained with — confirm.
X = df_texts.drop(['text', 'AI Metric'], axis='columns')

In [11]:
# Step 6: Predict if the text is AI-generated or human-written
Text_predicted = best_rf_resample.predict(X)

# One row per document: the file name next to the label the model predicted.
# NOTE(review): presumably 1 means AI-generated — confirm against the labels
# used in training.
prediction_columns = {
    'text': lista_texts,
    'AI Predicted': Text_predicted,
}
result = pd.DataFrame(prediction_columns)
result

Unnamed: 0,text,AI Predicted
0,discussion_ai_1.docx,1
1,discussion_ai_2.docx,1
2,discussion_ai_3.docx,1
3,discussion_ai_4.docx,1
4,discussion_ai_5.docx,1
5,discussion_ai_6.docx,1
6,discussion_ai_7.docx,1
7,discussion_ai_8.docx,1
8,discussion_ai_9.docx,1
9,discussion_ai_10.docx,1
