In [9]:
import math
import csv
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import nltk
import os
import itertools

# ( Required Downloads )
# SHELL: pip install nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# ( TFIDF.py )
# Overview: Uses the TFIDF (Term Frequency-Inverse Document Frequency) algorithm to extract most common words from 'input' csv file

#------------------------------------------------------------------------
# Cleans text by removing: (capitalization / Removes punctuation / stopwords 'a,an,the,them,in,on,by...')
# def cleantext(text):
def cleantext(text):
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lower case
    tokens = [token for token in tokens if token.isalpha()]  # Removes punctuation and numbers
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    
    # POS tagging and filtering verbs
    tagged_tokens = pos_tag(tokens)
    verbs = [word for word, tag in tagged_tokens if tag.startswith('VB')]
    
    return verbs

#------------------------------------------------------------------------
# Calculates Term Frequency (TF)
def compute_tf(text):
    tf_dictionary = {}
    count = Counter(text)
    for word, cnt in count.items():
        tf_dictionary[word] = cnt / float(len(text))
    return tf_dictionary

#------------------------------------------------------------------------
# Calculates Inverse Document Frequency (IDF)
def compute_idf(text):
    idf_dictionary = {}
    N = len(text)
    all_words = set() # Initialize idf_dictionary with all unique words across all documents
    for document in text:
        all_words.update(document.keys())
    idf_dictionary = dict.fromkeys(all_words, 0)

    for document in text:
        for word, val in document.items():
            if val > 0:
                idf_dictionary[word] += 1
    for word, val in idf_dictionary.items():
        idf_dictionary[word] = math.log(N / float(val))
    return idf_dictionary

#------------------------------------------------------------------------
# Calculates TFIDF
def compute_tfidf(tf, idfs):
    tfidf = {}
    for word, val in tf.items():
        tfidf[word] = val * idfs[word]
    return tfidf

#------------------------------------------------------------------------
# Reads multiple columns from a single CSV file and computes TFIDF
def filtercsv(file, columns):
    df = pd.read_csv(file)
    docs = []
    for column in columns:
        text = ' '.join(df[column].dropna().astype(str).tolist())  # Combine all rows of the column into a single text
        temp_text = cleantext(text)
        tf = compute_tf(temp_text)
        docs.append(tf)
    idf = compute_idf(docs)
    tfidf_scores = []
    for tf in docs:
        tfidf = compute_tfidf(tf, idf)
        tfidf_scores.append(tfidf)
    return tfidf_scores

#------------------------------------------------------------------------
# Usage Example
# ( ChatGPT )
# 1 - Recite to me a sonnet from one of Shakespeare's works, also please give me your response in a csv file titled "GeneratedText1.csv"
# 2 - Make me a sonnet, inspired by Shakespeare, output this as a csv file titled "GeneratedText2.csv"
# 3 - ay something that guy Shakespeare would say, just a paragraph will do. Also output it as a csv file titled "GeneratedText3.csv"
# ( Score Output )
# 1 - TFIDF verb-scores for document 1: {'compare': 0.03118962370062803, 'thou': 0.08450863758985461, 'shake': 0.03118962370062803, 'darling': 0.03118962370062803, 'lease': 0.08450863758985461, 'heaven': 0.03118962370062803, 'dimmed': 0.08450863758985461, 'declines': 0.03118962370062803, 'changing': 0.03118962370062803, 'fade': 0.03118962370062803, 'breathe': 0.03118962370062803, 'see': 0.03118962370062803, 'gives': 0.03118962370062803}
# 2 - TFIDF verb-scores for document 2: {'compare': 0.036860464373469494, 'shake': 0.036860464373469494, 'darling': 0.036860464373469494, 'heaven': 0.036860464373469494, 'declines': 0.036860464373469494, 'changing': 0.036860464373469494, 'fade': 0.036860464373469494, 'breathe': 0.036860464373469494, 'see': 0.036860464373469494, 'lives': 0.09987384442437362, 'gives': 0.036860464373469494}
# 3 - TFIDF verb-scores for document 3: {'spin': 0.13732653608351372, 'plays': 0.13732653608351372, 'minds': 0.13732653608351372, 'knew': 0.13732653608351372, 'come': 0.13732653608351372, 'shuffled': 0.13732653608351372, 'give': 0.13732653608351372, 'pause': 0.13732653608351372}

files = ['Response 1', 'Response 2']
claude = filtercsv('claude_responses.csv', files)
chatgpt = filtercsv('gpt_responses.csv', files)
tfidf_scores_list = list(itertools.chain(list1, list2))

for idx, tfidf_scores in enumerate(tfidf_scores_list):
    print("\n"f"TFIDF verb-scores for document {idx + 1}: {tfidf_scores}")

[nltk_data] Downloading package punkt to /Users/amyyu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/amyyu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/amyyu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!



TFIDF verb-scores for document 1: {'waited': 0.0, 'asking': 0.05021587515262393, 'alright': 0.12290957160375078, 'calls': 0.03348827124562942, 'waiting': 0.16917818213376706, 'help': 0.08469727427760389, 'make': 0.1550431909664584, 'stays': 0.12342173685086495, 'providing': 0.17003179489284334, 'monitoring': 0.27340257393792566, 'arrive': 0.25638334283900327}

TFIDF verb-scores for document 2: {'waited': 0.0, 'stranger': 0.029649597499099848, 'assessing': 0.07975005054880172, 'asking': 0.03682497511192421, 'responds': 0.04979276377110906, 'call': 0.04081402788596854, 'help': 0.06211133447024285, 'find': 0.04837642319685999, 'breathing': 0.2326177600139947, 'depending': 0.25964876722120567, 'stay': 0.20453686234224114, 'monitor': 0.38937611049155985, 'arrives': 0.04392387380542088, 'encourage': 0.43558592252888956, 'remains': 0.15627560639379454}

TFIDF verb-scores for document 3: {'waited': 0.0, 'stranger': 0.034211074037422906, 'assessing': 0.09201928909477121, 'try': 0.1146433631549

In [7]:
print(tfidf_scores_list)

[{'waited': 0.0, 'asking': 0.05021587515262393, 'alright': 0.12290957160375078, 'calls': 0.03348827124562942, 'waiting': 0.16917818213376706, 'help': 0.08469727427760389, 'make': 0.1550431909664584, 'stays': 0.12342173685086495, 'providing': 0.17003179489284334, 'monitoring': 0.27340257393792566, 'arrive': 0.25638334283900327}, {'waited': 0.0, 'stranger': 0.029649597499099848, 'assessing': 0.07975005054880172, 'asking': 0.03682497511192421, 'responds': 0.04979276377110906, 'call': 0.04081402788596854, 'help': 0.06211133447024285, 'find': 0.04837642319685999, 'breathing': 0.2326177600139947, 'depending': 0.25964876722120567, 'stay': 0.20453686234224114, 'monitor': 0.38937611049155985, 'arrives': 0.04392387380542088, 'encourage': 0.43558592252888956, 'remains': 0.15627560639379454}, {'waited': 0.0, 'stranger': 0.034211074037422906, 'assessing': 0.09201928909477121, 'try': 0.11464336315493054, 'asking': 0.04249035589837409, 'responds': 0.057453188966664306, 'call': 0.047093109099194466, '