In [29]:
import math
from collections import Counter
import pandas as pd

In [30]:

# Load tokenized documents from the file
def load_tokenized_documents(file_path, num_documents=1000):
    """Read up to ``num_documents`` tokenized documents from a text file.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line is expected to look like
    ``<document_id>,<token>,<token>,...``: the text before the first comma
    is the document id and the remainder is split on commas into tokens.

    Args:
        file_path: Path of the file to read.
        num_documents: Maximum number of documents to return. ``None`` reads
            the whole file; zero or a negative number returns an empty list.

    Returns:
        A list of ``(document_id, tokens)`` tuples in file order, where
        ``tokens`` is a list of strings.

    Raises:
        ValueError: If a non-blank line contains no comma.
    """
    documents = []
    # Check the limit up front so that a limit of 0 really returns nothing.
    if num_documents is not None and num_documents <= 0:
        return documents
    # Decode explicitly as UTF-8 so the tokens do not depend on the
    # platform's default text encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        next(file, None)  # skip the header line; tolerate a completely empty file
        for line in file:
            record = line.strip()
            if not record:
                # Blank lines (e.g. a trailing newline) carry no document.
                continue
            document_id, tokens = record.split(',', 1)
            documents.append((document_id, tokens.split(',')))
            if num_documents is not None and len(documents) >= num_documents:
                break
    return documents

In [31]:

# Compute Term Frequency (TF) for each term in a document
def compute_tf(document):
    """Return the relative frequency of every distinct token in ``document``.

    Args:
        document: Sized collection of hashable tokens (e.g. a list of strings).

    Returns:
        A dict mapping each distinct token to its number of occurrences
        divided by ``len(document)``. An empty document yields an empty dict.
    """
    length = len(document)
    frequencies = {}
    for term, occurrences in Counter(document).items():
        frequencies[term] = occurrences / length
    return frequencies

In [32]:

# Compute Inverse Document Frequency (IDF) for each term in all documents
def compute_idf(documents):
    """Compute an inverse document frequency weight for every term.

    Args:
        documents: Sized collection of ``(document_id, tokens)`` pairs; only
            the token lists are used.

    Returns:
        A dict mapping each distinct term to
        ``log(total_documents / (document_count + 1))``, where
        ``document_count`` is the number of documents containing the term.
        Because of the ``+ 1`` in the denominator, a term that occurs in
        every document gets a negative weight and a term that occurs in all
        but one document gets exactly ``0.0``.
    """
    total_documents = len(documents)
    # Count document frequency in a single pass over the corpus instead of
    # re-scanning every document for every vocabulary word.
    document_frequency = Counter()
    for _, document in documents:
        # set() so a term repeated inside one document is counted once.
        document_frequency.update(set(document))
    return {
        word: math.log(total_documents / (document_count + 1))
        for word, document_count in document_frequency.items()
    }

In [33]:

# Compute TF-IDF weights for each term in all documents
def compute_tfidf(documents, idf):
    """Build one TF-IDF record per document.

    Args:
        documents: Iterable of ``(document_id, tokens)`` pairs.
        idf: Dict mapping every term that occurs in ``documents`` to its IDF
            weight; a term missing from it raises ``KeyError``.

    Returns:
        A list with one dict per document, in input order. Each dict maps the
        document's terms to ``tf * idf`` and additionally stores the id under
        the ``'document_id'`` key (overwriting the weight of a token that is
        literally named ``'document_id'``).
    """
    rows = []
    for document_id, document in documents:
        weights = {}
        for term, frequency in compute_tf(document).items():
            weights[term] = frequency * idf[term]
        # Added last so the id follows the term weights in the record.
        weights['document_id'] = document_id
        rows.append(weights)
    return rows

In [34]:
# Load the first NUM_DOCUMENTS tokenized documents from the corpus file.
# NOTE(review): file_path is an absolute path on the author's machine; point
# it at your local copy of corpus_tokens.txt before running.
NUM_DOCUMENTS = 1000
file_path = "/Users/mathiaskroismoller/DIS/Projects/corpus_tokens.txt"
documents = load_tokenized_documents(file_path, num_documents=NUM_DOCUMENTS)

# Compute IDF values over the loaded documents
idf = compute_idf(documents)

# Compute the TF-IDF records (one dict per document)
tfidf_matrix = compute_tfidf(documents, idf)

# Create a DataFrame from the TF-IDF records: one row per document, one
# column per key; terms absent from a document show up as NaN in its row.
df = pd.DataFrame(tfidf_matrix)

In [35]:
# Preview only the first two documents; echoing every record floods the notebook output.
tfidf_matrix[:2]

[{'"after': 0.23006087157759358,
  'invent': 0.23006087157759358,
  'cotton': 0.9202434863103743,
  'gin': 0.2589420041009246,
  'becam': 0.21316649207308674,
  'americaâ\x80\x99': 0.24204762459641782,
  'lead': 0.15717754429387448,
  'crop': 0.24204762459641782,
  'king': 0.2067435470802843,
  'in': 0.11863801118482156,
  '1790': 0.2589420041009246,
  'america': 0.18791083359099026,
  'produc': 0.1590297010676592,
  '1500': 0.24204762459641782,
  'pound': 0.21316649207308674,
  'by': 0.21316649207308674,
  '1800': 0.2589420041009246,
  'product': 0.16739098004524883,
  'increas': 0.1475191437081526,
  '35000': 0.2589420041009246,
  'pound"': 0.2589420041009246,
  'document_id': '1867825'},
 {'"timer': 0.1883214575279452,
  'separ': 0.15035894333111585,
  'night': 0.4510768299933476,
  'day': 0.09265477809915389,
  'outlet': 0.1883214575279452,
  'nice': 0.15035894333111585,
  'time': 0.07616079164574843,
  'set': 0.09909603391684095,
  'rotat': 0.155030176053154,
  '24': 0.14274335459

In [36]:
# Destination of the exported matrix (relative to the working directory)
output_csv_file = "tfidf_matrix.csv"

# Move 'document_id' to the front and keep every other column in its current order
df = df[['document_id', *(col for col in df.columns if col != 'document_id')]]

# Write the frame without the row index and confirm where it went
df.to_csv(path_or_buf=output_csv_file, index=False)
print(f"TF-IDF matrix has been exported to {output_csv_file}")


TF-IDF matrix has been exported to tfidf_matrix.csv
