In [None]:
# Cell 1
# Import NLTK library for text processing
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter, defaultdict
import math
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# Download the 'punkt' model data for NLTK
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load the datasets
human_essays = pd.read_csv("/content/train_essays.csv")
llm_essays = pd.read_csv("/content/chatGPT_essays.csv")

# Combine datasets
combined_essays = pd.concat([human_essays, llm_essays], ignore_index=True)
combined_essays

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0
...,...,...,...,...
1658,358e658e,0,A man is driving in his car recklessly in a hu...,1
1659,35cb12f4,0,Their are so many things you can do to help ke...,1
1660,35cdfc71,1,"""Do voters really control whom their elector p...",1
1661,35db0aa4,1,"Dear, I believe that the way of voting for a p...",1


In [None]:
training_set, development_set = train_test_split(combined_essays, test_size=0.1, random_state=42)

In [None]:
def create_lexicon(data, min_frequency=5):
    word_counts = Counter()
    for essay in data['text']:
        words = word_tokenize(essay.lower())
        word_counts.update(word for word in words if word.isalpha())

    lexicon = [word for word, freq in word_counts.items() if freq >= min_frequency]
    reverse_dictionary = {word: idx for idx, word in enumerate(lexicon)}

    return lexicon, reverse_dictionary

lexicon, reverse_dictionary = create_lexicon(combined_essays)

In [None]:
def calculate_probabilities_with_smoothing(data, lexicon, alpha=1.0):
    class_word_counts = {class_label: Counter() for class_label in ['Human', 'LLM']}
    total_word_counts = Counter()
    for _, row in data.iterrows():
        class_label = 'Human' if row['generated'] == 0 else 'LLM'
        words = set(word_tokenize(row['text'].lower()))
        total_word_counts.update(words)
        for word in words:
            if word in lexicon:
                class_word_counts[class_label][word] += 1

    total_docs = len(data)
    total_docs_per_class = {'Human': len(data[data['generated'] == 0]), 'LLM': len(data[data['generated'] == 1])}

    probabilities = {word: {class_label: (class_word_counts[class_label][word] + alpha) / (total_docs_per_class[class_label] + alpha * len(lexicon))
                            for class_label in ['Human', 'LLM']} for word in lexicon}

    word_occurrence_probabilities = {word: total_word_counts[word] / total_docs for word in lexicon}
    return probabilities, word_occurrence_probabilities


In [None]:
# Using the modified function with smoothing
alpha_value = 1.0
word_probabilities, _ = calculate_probabilities_with_smoothing(training_set, lexicon, alpha_value)
# Print the first 5 entries of word_probabilities
print("First 5 entries of word_probabilities:")
for word in list(word_probabilities.keys())[:5]:
    print(f"{word}: {word_probabilities[word]}")


First 5 entries of word_probabilities:
cars: {'Human': 0.08933129147524248, 'LLM': 0.04652079637295486}
have: {'Human': 0.16862344733707674, 'LLM': 0.08850778631973191}
been: {'Human': 0.0852475752935171, 'LLM': 0.04592943031736645}
around: {'Human': 0.05768249106687085, 'LLM': 0.03429923122412774}
since: {'Human': 0.05121660711247235, 'LLM': 0.02877981470530258}


In [None]:
# Print the first 5 entries of word_occurrence_probabilities
# Ensure this line is present in your code after the modified function call
_, word_occurrence_probabilities = calculate_probabilities_with_smoothing(training_set, lexicon, alpha_value)
print("\nFirst 5 entries of word_occurrence_probabilities:")
for word in list(word_occurrence_probabilities.keys())[:5]:
    print(f"{word}: {word_occurrence_probabilities[word]}")


First 5 entries of word_occurrence_probabilities:
cars: 0.5073529411764706
have: 0.9612299465240641
been: 0.4893048128342246
around: 0.34157754010695185
since: 0.29745989304812837


In [None]:
def essay_classifier(essay, lexicon, word_probabilities):
    tokens = word_tokenize(essay.lower())
    scores = {'Human': 0, 'LLM': 0}
    for token in tokens:
        if token in lexicon:
            scores['Human'] += math.log(word_probabilities[token]['Human'])
            scores['LLM'] += math.log(word_probabilities[token]['LLM'])
    return 'Human' if scores['Human'] > scores['LLM'] else 'LLM'

def evaluate_classifier(data, classifier, lexicon, word_probabilities):
    correct_predictions = 0
    for _, row in data.iterrows():
        prediction = classifier(row['text'], lexicon, word_probabilities)
        actual = 'Human' if row['generated'] == 0 else 'LLM'
        if prediction == actual:
            correct_predictions += 1
    return correct_predictions / len(data)

classifier_accuracy = evaluate_classifier(development_set, essay_classifier, lexicon, word_probabilities)
print(f"Classifier Accuracy: {classifier_accuracy:.2f}")


Classifier Accuracy: 0.71


In [None]:
def smoothing_analysis(dataset, lexicon, alpha_values):
    for alpha in alpha_values:
        probabilities, _ = calculate_probabilities_with_smoothing(dataset, lexicon, alpha)
        accuracy = evaluate_classifier(dataset, essay_classifier, lexicon, probabilities)
        print(f"Accuracy with alpha {alpha}: {accuracy:.2f}")
# Perform smoothing analysis
smoothing_analysis(development_set, lexicon, [0.5, 1, 1.5, 2])

Accuracy with alpha 0.5: 0.71
Accuracy with alpha 1: 0.71
Accuracy with alpha 1.5: 0.71
Accuracy with alpha 2: 0.71


In [None]:
def top_predictive_words(word_probabilities, top_n=10):
    word_importance_human = {word: word_probabilities[word]['Human'] / (word_probabilities[word]['LLM'] + 1e-6) for word in word_probabilities}
    word_importance_llm = {word: word_probabilities[word]['LLM'] / (word_probabilities[word]['Human'] + 1e-6) for word in word_probabilities}

    top_words_human = sorted(word_importance_human.items(), key=lambda x: x[1], reverse=True)[:top_n]
    top_words_llm = sorted(word_importance_llm.items(), key=lambda x: x[1], reverse=True)[:top_n]

    return top_words_human, top_words_llm

top_words_human, top_words_llm = top_predictive_words(word_probabilities)

print("Top Predictive Words for Human Essays:", [word for word, _ in top_words_human])
print("Top Predictive Words for LLM Essays:", [word for word, _ in top_words_llm])


Top Predictive Words for Human Essays: ['carolina', 'nobody', 'ideal', 'chooses', 'governments', 'broken', 'guarantee', 'mental', 'promoted', 'developing']
Top Predictive Words for LLM Essays: ['progressive', 'campain', 'ancient', 'efficiant', 'longstanding', 'heavier', 'arm', 'greetings', 'reson', 'oversee']
