<a href="https://colab.research.google.com/github/buhlerja/APS360_Team32/blob/main/AI_Text_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torchvision import datasets
from torch.utils.data.sampler import SubsetRandomSampler
import torchvision.transforms as transforms
import pandas as pd
import torchtext.vocab as vocab

# Load the pretrained GloVe '6B' word vectors with 100 dimensions per word.
# As the progress output below shows, this downloads glove.6B.zip into
# .vector_cache/ and then reads the vectors, so the cell can take minutes.
glove_embeddings = vocab.GloVe(name='6B', dim=100)

.vector_cache/glove.6B.zip: 862MB [02:39, 5.40MB/s]                           
100%|█████████▉| 399999/400000 [00:26<00:00, 14952.54it/s]


### Helper Functions

In [None]:
# Lowercase conjunctions that remove_conjunctions() can strip from an essay.
conjunctions = [
    'and', 'but', 'or', 'nor', 'for', 'yet', 'so',
    'although', 'because', 'since', 'while',
    'after', 'before', 'when', 'if',
    'unless', 'until', 'whether',
]

def remove_conjunctions(text, conjunctions):
  '''
  string, list -> string

  Return the essay `text` with every word that matches an entry of
  `conjunctions` removed.

  Matching is case-insensitive on both sides: each word of the essay and each
  entry of `conjunctions` is lowercased before comparison, so 'And' in the
  essay matches 'and' in the list and vice versa.

  Words are the whitespace-separated tokens of `text`. Punctuation attached to
  a word is part of the token, so 'and,' does not match 'and'. The surviving
  words are re-joined with single spaces, which collapses any run of
  whitespace in the input.
  '''
  # Normalise the conjunctions once into a set: O(1) lookups per word, and the
  # entries are compared in the same (lowercase) form as the essay words.
  blocked = {conjunction.lower() for conjunction in conjunctions}
  kept_words = [word for word in text.split() if word.lower() not in blocked]
  return ' '.join(kept_words)

import string
# Strip punctuation characters out of a string
def remove_punctuation(text):
    '''
    string -> string

    Return `text` with every character that appears in string.punctuation
    dropped. All other characters, including whitespace, keep their order.
    '''
    kept_chars = []
    for character in text:
        if character in string.punctuation:
            continue
        kept_chars.append(character)
    return ''.join(kept_chars)



### Model Definition

In [None]:
# Model code
class AI_Classifier(nn.Module):
    """GRU-based text classifier on top of pretrained GloVe embeddings.

    Token indices are looked up in an embedding table built from the
    module-level ``glove_embeddings`` vectors, passed through a multi-layer
    GRU, and the GRU output at the last time step is projected to
    ``output_size`` class scores, returned as log-probabilities.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        """Build the embedding, GRU and output layers.

        Args:
            input_size: feature size of each GRU input step; it has to equal
                the dimension of the GloVe vectors used for the embedding.
            hidden_size: number of features in the GRU hidden state.
            num_layers: number of stacked GRU layers.
            output_size: number of output classes.
        """
        super(AI_Classifier, self).__init__()
        self.name = "AI_Classifier"
        # Embedding table initialised from the GloVe vectors
        # (nn.Embedding.from_pretrained keeps the weights frozen by default).
        self.emb = nn.Embedding.from_pretrained(glove_embeddings.vectors)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size) # Output size should = 2

    def forward(self, x):
        """Classify a batch of index-encoded texts.

        Args:
            x: integer tensor of shape (batch_size, seq_length) holding
                embedding indices. Indices outside
                ``[0, self.emb.num_embeddings)`` (for example -1 for unknown
                words) are accepted and treated as index 0.

        Returns:
            Tensor of shape (batch_size, output_size) with log-probabilities
            (``log_softmax`` over the class dimension).
        """
        # Replace out-of-range indices by 0 so the embedding lookup cannot
        # fail; such tokens all share the vector stored in row 0.
        mask = (x >= 0) & (x < self.emb.num_embeddings)
        x = torch.where(mask, x, torch.zeros_like(x))
        # Look up embeddings -> (batch_size, seq_length, input_size)
        x = self.emb(x)
        # Initial hidden state. Created from x so that it lives on the same
        # device and has the same dtype as the embedded input; a plain
        # torch.zeros(...) would break once the model is moved to a GPU or
        # cast to another dtype.
        h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)
        # Forward propagate; the GRU is batch_first, so it expects a 3D tensor
        # of (batch_size, seq_length, input_size)
        out, _ = self.rnn(x, h0)
        # Fully connected layer on the output of the last time step
        out = self.fc(out[:, -1, :])
        # Return log-probabilities over the classes (argmax gives the label,
        # exp gives the probabilities)
        return F.log_softmax(out, dim=1)


### Set up Model

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')


# Location of the trained weights on the mounted Google Drive
modelPath = '/content/gdrive/My Drive/APS360/Project/model.pth'

# Define the model. These values have to match the ones the checkpoint was
# trained with, otherwise the saved tensors will not fit the layers.
input_size = 100  # size of word embeddings
hidden_size = 64
num_layers = 2
output_size = 2  # binary classification (human or AI generated)
seq_length = 20  # length of input sequence

# Initialize the GRU classifier
Test_Model = AI_Classifier(input_size, hidden_size, num_layers, output_size)

# Load the saved state dictionary. map_location moves tensors that were saved
# from a GPU session onto the CPU, so the checkpoint also loads on a CPU-only
# runtime.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
state_dict = torch.load(modelPath, map_location=torch.device('cpu'))

# The model is only used for inference in this notebook
Test_Model.eval()

# Apply the state dictionary to the model (kept as the last expression so the
# cell displays the key-matching report)
Test_Model.load_state_dict(state_dict)

Mounted at /content/gdrive


<All keys matched successfully>

### Run Model

In [None]:
################ INPUT GOES HERE ################
# Text to classify, given as one Python string. The code below lowercases it
# and splits it on whitespace before mapping the words to GloVe indices.

essay = '"The Chrysalids" by John Wyndham immerses readers into a dystopian world fraught with societal paranoia and genetic discrimination. Set in the aftermath of a devastating nuclear holocaust, the story unfolds in the insulated community of Waknuk, where the rigid principles of purity and conformity dominate every aspect of life. The protagonist, David Strorm, grapples with the burden of his telepathic abilities, deemed blasphemous in a society that deifies genetic uniformity. As David navigates the oppressive atmosphere of Waknuk, he forms a clandestine bond with other telepathic individuals, including his cousin Rosalind. Together, they embark on a perilous journey to evade persecution and seek sanctuary in the elusive haven of Sealand. Along the way, they confront the harrowing reality of their worlds relentless pursuit of genetic perfection, as well as the moral ambiguity of their own actions in the face of adversity. Through Wyndhams masterful storytelling, "The Chrysalids" delves deep into themes of identity, prejudice, and the innate human desire for acceptance and belonging. As readers follow Davids quest for freedom and self-discovery, they are compelled to reflect on the implications of unchecked societal dogma and the enduring power of empathy and resilience in the face of adversity. With its thought-provoking narrative and richly drawn characters, "The Chrysalids" stands as a timeless testament to the enduring struggle for individuality and the human spirits capacity to transcend the confines of prejudice and fear.'
#################################################

# Tokenize: lowercase the essay and split it on whitespace. Punctuation stays
# attached to its word (e.g. 'discrimination.'), so such a token is only found
# in GloVe if that exact string is a vocabulary entry.
# NOTE(review): remove_punctuation / remove_conjunctions are defined above but
# are not applied here - confirm this matches the preprocessing used when the
# model was trained before changing it.
words = essay.lower().split()
if not words:
    # The GRU cannot process a zero-length sequence; fail with a clear message
    raise ValueError("essay must contain at least one word")

# Build a dictionary holding each distinct in-vocabulary word and its GloVe
# index (dict.fromkeys keeps the order of first appearance)
word_to_glove_index = {
    word: glove_embeddings.stoi[word]
    for word in dict.fromkeys(words)
    if word in glove_embeddings.stoi
}

# Convert the essay to a sequence of GloVe indices. Words without a GloVe
# entry become -1, which AI_Classifier.forward remaps to index 0.
essay_as_indices = [word_to_glove_index.get(word, -1) for word in words]
indexed_essay = essay_as_indices

# Convert to a PyTorch tensor of shape (1, seq_length): a batch of one essay
essays_tensor = torch.tensor(indexed_essay, dtype=torch.long)
essay_tensor = essays_tensor.reshape(1, len(essays_tensor))

# Inference only: evaluation mode, and no autograd graph is recorded
Test_Model.eval()
with torch.no_grad():
    log_probabilities = Test_Model(essay_tensor)

# The model returns log-probabilities; exponentiate to get probabilities
probabilities = torch.exp(log_probabilities)
result = torch.argmax(log_probabilities, dim=1)

# Class 0 = human-written, class 1 = AI-generated
class_labels = ("Human-Generated", "AI-Generated")
predicted_class = result.item()
print(class_labels[predicted_class])
print("Certainty: ", probabilities[0, predicted_class].item())

AI-Generated
Certainty:  0.9999693632125854
