In [1]:
# Install required libraries for classifying real texts
!pip install --upgrade docx2txt
!pip install transformers==4.23.0

Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docx2txt
  Building wheel for docx2txt (setup.py) ... [?25l[?25hdone
  Created wheel for docx2txt: filename=docx2txt-0.8-py3-none-any.whl size=3959 sha256=27239cbca846571543f359bfe489b872da1db12216b5a3c2647154bd9ea301b8
  Stored in directory: /root/.cache/pip/wheels/22/58/cf/093d0a6c3ecfdfc5f6ddd5524043b88e59a9a199cb02352966
Successfully built docx2txt
Installing collected packages: docx2txt
Successfully installed docx2txt-0.8
Collecting transformers==4.23.0
  Downloading transformers-4.23.0-py3-none-any.whl.metadata (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.7/88.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.23.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloadi

In [10]:
import pandas as pd
import numpy as np
import io
from IPython.display import display, clear_output
import ipywidgets as widgets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    matthews_corrcoef, cohen_kappa_score, confusion_matrix, ConfusionMatrixDisplay
)
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import dendrogram, linkage
import seaborn as sns
import docx2txt
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from tqdm.notebook import tqdm

# Upload control for the text characterization spreadsheet (a single .xlsx file)
upload_char_file = widgets.FileUpload(
    description="Upload Text Characterization File",
    accept='.xlsx',
    multiple=False,
)

# Upload control for the documents to classify (any number of .docx files)
upload_docx_files = widgets.FileUpload(
    description="Upload DOCX Files",
    accept='.docx',
    multiple=True,
)

# Output area used to capture everything process_files prints and plots
output = widgets.Output()

# Render both upload controls
display(upload_char_file, upload_docx_files)

def _iter_uploads(upload_value):
    """Yield ``(file_name, content)`` pairs from a ``FileUpload.value``.

    ipywidgets 7 exposes the value as a dict keyed by file name, whereas
    ipywidgets 8 exposes a tuple of per-file records carrying a ``name`` key.
    Both layouts store the raw bytes under ``'content'``.
    """
    if isinstance(upload_value, dict):
        for file_name, record in upload_value.items():
            yield file_name, record['content']
    else:
        for record in upload_value:
            yield record['name'], record['content']


def _score_predictions(y_true, y_pred):
    """Return the evaluation metrics and confusion matrix for one set of predictions."""
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'mcc': matthews_corrcoef(y_true, y_pred),
        'kappa': cohen_kappa_score(y_true, y_pred),
        'confusion_matrix': confusion_matrix(y_true, y_pred)
    }


def _gpt2_perplexity(tokens, model, max_length=1024, stride=512):
    """Compute sliding-window perplexity of ``tokens`` under ``model``.

    ``tokens`` is the ``(1, n)`` tensor returned by ``tokenizer.encode(...,
    return_tensors='pt')`` and must contain at least one token. Each window
    scores only the ``trg_len`` tokens not covered by the previous window;
    earlier positions are kept as context but masked out of the loss with -100.
    """
    total_tokens = tokens.size(1)
    window_nlls = []

    for i in range(0, total_tokens, stride):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, total_tokens)
        trg_len = end_loc - i
        input_ids = tokens[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

        # outputs[0] is the mean loss of the window; rescale it to a sum
        window_nlls.append(outputs[0] * trg_len)

    return torch.exp(torch.stack(window_nlls).sum() / total_tokens).item()


def _burstiness(text):
    """Return std/mean of the per-sentence word counts of ``text`` (0.0 if undefined)."""
    sentence_lengths = [len(sentence.split()) for sentence in text.split('.') if sentence]
    if not sentence_lengths:
        return 0.0
    mean_length = np.mean(sentence_lengths)
    if mean_length == 0:
        # Only whitespace between the periods: avoid a 0/0 NaN feature
        return 0.0
    return np.std(sentence_lengths) / mean_length


@output.capture(clear_output=True)
def process_files(change):
    """Train classifiers on the uploaded spreadsheet and classify the uploaded DOCX files.

    Registered as an observer on both upload widgets; it does nothing until
    both have a file. It then:

    1. reads the characterization spreadsheet, using every column except
       'index', 'text' and 'AI Metric' as features and 'AI Metric' as label;
    2. trains ExtraTrees, RandomForest and GaussianNB on an 80/20 stratified
       split and displays train/test metrics and confusion matrices;
    3. computes token count, GPT-2 perplexity and burstiness for each DOCX file;
    4. displays the best model's prediction and class-1 probability per file.

    Args:
        change: the traitlets change notification passed by ``observe``
            (unused; the widget values are read directly).
    """
    if not (upload_char_file.value and upload_docx_files.value):
        return

    # Process Text Characterization File
    _, char_content = next(_iter_uploads(upload_char_file.value))
    df = pd.read_excel(io.BytesIO(char_content))

    # Prepare features and labels
    X = df.drop(columns=['index', 'text', 'AI Metric'])
    y = df['AI Metric']

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Train classifiers
    classifiers = {
        'ExtraTrees': ExtraTreesClassifier(random_state=42),
        'RandomForest': RandomForestClassifier(random_state=42),
        'GaussianNB': GaussianNB()
    }
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'mcc', 'kappa']
    results = {}

    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        results[name] = {
            'train': _score_predictions(y_train, clf.predict(X_train)),
            'test': _score_predictions(y_test, clf.predict(X_test))
        }

    # Select best model
    # NOTE(review): the selection uses *training* scores, which tree ensembles
    # fit almost perfectly; consider selecting on held-out scores instead.
    train_scores = {
        name: [results[name]['train'][metric] for metric in metrics]
        for name in classifiers
    }
    best_model = max(train_scores, key=lambda x: np.mean(train_scores[x]))
    best_clf = classifiers[best_model]

    # Display training metrics
    df_train_metrics = pd.DataFrame({model: results[model]['train'] for model in results}).T
    df_train_metrics.index.name = 'Model'
    print("Training results:")
    print(f"Best Model: {best_model} based on training scores")
    display(df_train_metrics)

    # Show confusion matrix for training
    cm_train = results[best_model]['train']['confusion_matrix']
    ConfusionMatrixDisplay(confusion_matrix=cm_train).plot()
    plt.title("Training Confusion Matrix")
    plt.show()

    # Display testing metrics
    df_test_metrics = pd.DataFrame({model: results[model]['test'] for model in results}).T
    df_test_metrics.index.name = 'Model'
    print("Testing results:")
    display(df_test_metrics)

    # Show confusion matrix for testing (the held-out matrix, not the training one)
    cm_test = results[best_model]['test']['confusion_matrix']
    ConfusionMatrixDisplay(confusion_matrix=cm_test).plot()
    plt.title("Testing Confusion Matrix")
    plt.show()

    # Process DOCX Files
    docx_files = list(_iter_uploads(upload_docx_files.value))

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    records = []
    for name, content in tqdm(docx_files, desc="Processing DOCX Files"):
        text = docx2txt.process(io.BytesIO(content)).replace('\n', '')

        # Tokenize once; the same tensor provides the token count and the
        # input for the perplexity computation.
        tokens = tokenizer.encode(text, return_tensors='pt')
        token_length = tokens.size(1)
        if token_length == 0:
            # Perplexity is undefined for an empty document
            print(f"Skipping '{name}': no text could be extracted.")
            continue

        records.append({
            'text': name,
            'Token': token_length,
            'Perplexity': _gpt2_perplexity(tokens, model),
            'Burstiness': _burstiness(text)
        })

    if not records:
        print("No uploaded DOCX file contained text to classify.")
        return

    df_texts = pd.DataFrame(records, columns=['text', 'Token', 'Perplexity', 'Burstiness'])

    # Predict on DOCX data
    # The classifiers above were fitted on the unscaled spreadsheet features,
    # so the document features are passed unscaled as well. Standardizing them
    # with a scaler fitted on the uploaded batch would put them on a different
    # scale than the training data (and zero every feature for a single file).
    # NOTE(review): if the spreadsheet stores pre-standardized features, the
    # scaler used to produce them must be applied here instead - confirm.
    feature_columns = ['Token', 'Perplexity', 'Burstiness']
    if set(X.columns) == set(feature_columns):
        # Match the column order the model saw during fit
        feature_columns = list(X.columns)
    X_texts = df_texts[feature_columns]
    y_pred_texts = best_clf.predict(X_texts)
    # Column 1 is the second class in best_clf.classes_ (presumably "AI" - verify labels)
    y_proba_texts = best_clf.predict_proba(X_texts)[:, 1]

    # Display results
    texts_results = pd.DataFrame({"Prediction": y_pred_texts, "Probability of being AI": y_proba_texts})
    print("Prediction results:")
    # Put the per-document features and the predictions side by side
    concatenated_df = pd.concat([df_texts, texts_results], ignore_index=False, axis=1)

    # Display the concatenated dataframe
    display(concatenated_df)

# Re-run the pipeline whenever the value of either upload widget changes;
# process_files itself waits until both widgets hold a file.
for uploader in (upload_char_file, upload_docx_files):
    uploader.observe(process_files, names='value')

# Show the area that receives the captured output of process_files
display(output)

FileUpload(value={}, accept='.xlsx', description='Upload Text Characterization File')

FileUpload(value={}, accept='.docx', description='Upload DOCX Files', multiple=True)

Output()