## TFIDF Scores & CSV Setup

In [42]:
import math
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import nltk
import os
import itertools

# ( Required Downloads )
# SHELL: pip install nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# ( TFIDF.py )
# Overview: Uses the TFIDF (Term Frequency-Inverse Document Frequency) algorithm to extract most common words from 'input' csv file

#------------------------------------------------------------------------
# Cleans text by removing: (capitalization / Removes punctuation / stopwords 'a,an,the,them,in,on,by...')
# def cleantext(text):
def cleantext(text):
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lower case
    tokens = [token for token in tokens if token.isalpha()]  # Removes punctuation and numbers
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    
    # POS tagging and filtering verbs
    tagged_tokens = pos_tag(tokens)
    verbs = [word for word, tag in tagged_tokens if tag.startswith('VB')]
    
    return verbs

#------------------------------------------------------------------------
# Calculates Term Frequency (TF)
def compute_tf(text):
    tf_dictionary = {}
    count = Counter(text)
    for word, cnt in count.items():
        tf_dictionary[word] = cnt / float(len(text))
    return tf_dictionary

#------------------------------------------------------------------------
# Calculates Inverse Document Frequency (IDF)
def compute_idf(text):
    idf_dictionary = {}
    N = len(text)
    all_words = set() # Initialize idf_dictionary with all unique words across all documents
    for document in text:
        all_words.update(document.keys())
    idf_dictionary = dict.fromkeys(all_words, 0)

    for document in text:
        for word, val in document.items():
            if val > 0:
                idf_dictionary[word] += 1
    for word, val in idf_dictionary.items():
        idf_dictionary[word] = math.log(N / float(val))
    return idf_dictionary

#------------------------------------------------------------------------
# Calculates TFIDF
def compute_tfidf(tf, idfs):
    tfidf = {}
    for word, val in tf.items():
        tfidf[word] = val * idfs[word]
    return tfidf

#------------------------------------------------------------------------
# Reads multiple columns from a single CSV file and computes TFIDF
def filtercsv(file, columns):
    df = pd.read_csv(file)
    docs = []
    for index, row in df.iterrows():
        for column in columns:
            temp_text = cleantext(row[column])
            tf = compute_tf(temp_text)
            docs.append(tf)
    idf = compute_idf(docs)
    tfidf_scores = []
    for tf in docs:
        tfidf = compute_tfidf(tf, idf)
        tfidf_scores.append(tfidf)
    return tfidf_scores

#------------------------------------------------------------------------
# Usage Example
# ( ChatGPT )
# 1 - Recite to me a sonnet from one of Shakespeare's works, also please give me your response in a csv file titled "GeneratedText1.csv"
# 2 - Make me a sonnet, inspired by Shakespeare, output this as a csv file titled "GeneratedText2.csv"
# 3 - ay something that guy Shakespeare would say, just a paragraph will do. Also output it as a csv file titled "GeneratedText3.csv"
# ( Score Output )
# 1 - TFIDF verb-scores for document 1: {'compare': 0.03118962370062803, 'thou': 0.08450863758985461, 'shake': 0.03118962370062803, 'darling': 0.03118962370062803, 'lease': 0.08450863758985461, 'heaven': 0.03118962370062803, 'dimmed': 0.08450863758985461, 'declines': 0.03118962370062803, 'changing': 0.03118962370062803, 'fade': 0.03118962370062803, 'breathe': 0.03118962370062803, 'see': 0.03118962370062803, 'gives': 0.03118962370062803}
# 2 - TFIDF verb-scores for document 2: {'compare': 0.036860464373469494, 'shake': 0.036860464373469494, 'darling': 0.036860464373469494, 'heaven': 0.036860464373469494, 'declines': 0.036860464373469494, 'changing': 0.036860464373469494, 'fade': 0.036860464373469494, 'breathe': 0.036860464373469494, 'see': 0.036860464373469494, 'lives': 0.09987384442437362, 'gives': 0.036860464373469494}
# 3 - TFIDF verb-scores for document 3: {'spin': 0.13732653608351372, 'plays': 0.13732653608351372, 'minds': 0.13732653608351372, 'knew': 0.13732653608351372, 'come': 0.13732653608351372, 'shuffled': 0.13732653608351372, 'give': 0.13732653608351372, 'pause': 0.13732653608351372}

cols = ['Response 1', 'Response 2']
claude_scores = filtercsv('claude_responses.csv', cols)
chatgpt_scores = filtercsv('gpt_responses.csv', cols)


for idx, score in enumerate(claude_scores):
    print("\n"f"TFIDF verb-scores for document {idx + 1}: {score}")

for idx, score in enumerate(chatgpt_scores):
    print("\n"f"TFIDF verb-scores for document {idx + 1}: {score}")

[nltk_data] Downloading package punkt to /Users/amyyu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/amyyu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/amyyu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!



TFIDF verb-scores for document 1: {'waited': 0.0, 'asking': 0.06461299476821136, 'alright': 0.12419543591158391, 'calls': 0.03178544972689641, 'waiting': 0.197335473922271, 'help': 0.07885110452917378, 'make': 0.1626233372336449, 'stays': 0.12113938038440349, 'providing': 0.19675827140602928, 'monitoring': 0.3129769440522753, 'arrive': 0.27207541035418453}

TFIDF verb-scores for document 2: {'waited': 0.0, 'asking': 0.0710742942450325, 'alright': 0.13661497950274232, 'calls': 0.03496399469958605, 'help': 0.17347242996418233, 'offering': 0.13297821508565486, 'arrives': 0.06435267963345243, 'find': 0.08184072274337907, 'need': 0.12059126691437626}

TFIDF verb-scores for document 3: {'waited': 0.0, 'stranger': 0.03156653999303394, 'assessing': 0.09724100151330112, 'asking': 0.04738286283002167, 'responds': 0.05144916367687779, 'call': 0.04679375737665512, 'help': 0.0578241433213941, 'find': 0.05456048182891938, 'breathing': 0.22655297480027958, 'depending': 0.2969564864169005, 'stay': 0.

In [55]:
import pandas as pd

def array2csv(array):
    # Initialize an empty list to store DataFrame objects
    dfs = []
    
    # Iterate over each dictionary in the array
    for score in array:
        # Convert the dictionary to a DataFrame with a single row
        df = pd.DataFrame([score])
        # Replace NaN values with 0
        df = df.fillna(0)
        # Append the DataFrame to the list
        dfs.append(df)

    # Concatenate all DataFrames in the list into a single DataFrame
    result_df = pd.concat(dfs, ignore_index=True)
    result_df.fillna(0)

    # # Generate IDs
    ids = []
    for i in range(688): 
        for j in range(2):
            ids.append(i+1)
        
    # # Print the lengths for debugging
    # print("Length of data:", len(array))
    # print("Length of IDs:", len(ids))

    # # Add IDs to DataFrame
    result_df.insert(0, 'ID', ids)

    
    print(result_df)
    return result_df

claude_scores_df = array2csv(claude_scores)
claude_scores_df.to_csv('claude_tfidf_scores.csv', index=False)

chatgpt_scores_df = array2csv(chatgpt_scores)
chatgpt_scores_df.to_csv('claude_tfidf_scores.csv', index=False)

       ID  waited    asking   alright     calls   waiting      help      make  \
0       1     0.0  0.064613  0.124195  0.031785  0.197335  0.078851  0.162623   
1       1     0.0  0.071074  0.136615  0.034964       NaN  0.173472       NaN   
2       2     0.0  0.047383       NaN       NaN       NaN  0.057824       NaN   
3       2     0.0  0.064613       NaN       NaN  0.197335       NaN       NaN   
4       3     0.0  0.054673       NaN  0.026895       NaN  0.066720       NaN   
...   ...     ...       ...       ...       ...       ...       ...       ...   
1371  686     0.0  0.059229       NaN  0.029137       NaN       NaN       NaN   
1372  687     0.0  0.044421       NaN  0.021852       NaN  0.054210       NaN   
1373  687     0.0       NaN  0.091077  0.023309       NaN       NaN       NaN   
1374  688     0.0  0.050767       NaN  0.024974       NaN       NaN       NaN   
1375  688     0.0  0.059229  0.113846       NaN       NaN  0.072280       NaN   

         stays  providing  