In [1]:
import math
from collections import Counter
import pandas as pd
import os

In [2]:
def parse(row):
        return (row['_id'], str(row['tokens']).split(","))

# Load tokenized documents from the file
def load_tokenized_documents():
    files_paths=os.listdir("Data/")
    corpus_tokens_paths= [f"Data/{path}" for path in files_paths if 'corpus_tokens' in path]
    corpus_tokens_paths.sort()
    dfs = [pd.read_csv(path) for path in corpus_tokens_paths]
    corpus_tokens = pd.concat(dfs, ignore_index=True)


    return corpus_tokens.apply(parse, axis=1).to_numpy()

In [3]:

s = 0.2 ## bias for normalization

# Compute Term Frequency (TF) for each term in a document and normalize it using the pivoted unique query normalization
def compute_tf(document, average_number_words):
    word_counts = Counter(document)
    unique_words_count = len(set(document))
    tf = {word: (count / max(word_counts.values())) / ((1.0-s)*average_number_words + s*unique_words_count) for word, count in word_counts.items()}
    return tf

In [4]:
import math
from collections import defaultdict

# Compute Inverse Document Frequency (IDF) for each term in all documents
def compute_idf(documents):
    total_documents = len(documents)
    word_document_count = defaultdict(int)


    average_number_words = 0 #global variable to compute average number f words per document

    for _, document in documents:
        unique_words = set(document)
        average_number_words += len(unique_words)
        for word in unique_words:
            word_document_count[word] += 1
    average_number_words = average_number_words / total_documents

    idf = {}
    for word, count in word_document_count.items():
        idf[word] = math.log(total_documents / (count))

    return average_number_words, idf

In [5]:

# Compute TF-IDF weights for each term in all documents
def compute_tfidf(documents, idf, average_number_words):
    tfidf_matrix = []
    for document_id, document in documents:
        tfidf = {}
        tf = compute_tf(document, average_number_words)
        tfidf['_id'] = document_id
        tfidf.update({word: tf[word] * idf[word] for word in tf.keys()})
        tfidf_matrix.append(tfidf)
    return tfidf_matrix

In [3]:
documents = load_tokenized_documents()
# Compute IDF values
average_number_words, idf = compute_idf(documents)

# Compute TF-IDF matrix
tfidf_matrix = compute_tfidf(documents, idf, average_number_words)

In [7]:
tfidf_matrix[:10] #test, print words weights of first 10 documents

[{'_id': 1867825,
  'after': 0.045143936544719286,
  'invent': 0.057796887331617336,
  'cotton': 0.2577714393170704,
  'gin': 0.08227795507061093,
  'becam': 0.04862125333963856,
  'americaâ\x80\x99': 0.07378541271548032,
  'lead': 0.040920434541537744,
  'crop': 0.06173035925043257,
  'king': 0.051467023463162336,
  'in': 0.02382279342620056,
  '1790': 0.08341432873009207,
  'america': 0.044722088583476545,
  'produc': 0.03676162434783516,
  '1500': 0.05940642318746022,
  'pound': 0.09673792591930626,
  'by': 0.04468773609115413,
  '1800': 0.06500439969774575,
  'product': 0.03442392774520331,
  'increas': 0.037255966197012644,
  '35000': 0.07538886242997951},
 {'_id': 419610,
  'timer': 0.07643529357101413,
  'separ': 0.044711526377267434,
  'night': 0.13565654896419382,
  'day': 0.027602145212891428,
  'outlet': 0.1290334543902686,
  'nice': 0.057509240126138235,
  'time': 0.024096932643871912,
  'set': 0.03404171519687969,
  'rotat': 0.056069135553502375,
  '24': 0.0433831659893449

In [8]:
print(sum(n for _, n in list(tfidf_matrix[0].items())[1:])) ##test, need sum of values close to 1

1.3211407779297357


In [5]:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def joinDocument(document):
    return ' '.join(document[1])

b = list(map(joinDocument, documents))

print("mapping done")

tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 1, stop_words = 'english')
features = tf.fit_transform(b)

print("fitting done")

print(np.where(features.toarray()[0] == 0)[0]) # print non-zero indices of first document

mapping done
fitting done


MemoryError: Unable to allocate 10.5 TiB for an array with shape (1471406, 985321) and data type float64

In [8]:
# Reorder columns so that 'document_id' is the first column
df = df[['document_id'] + [col for col in df.columns if col != 'document_id']]

# Export the DataFrame to a CSV file
output_csv_file = "tfidf_matrix.csv"
df.to_csv(output_csv_file, index=False)

print(f"TF-IDF matrix has been exported to {output_csv_file}")


TF-IDF matrix has been exported to tfidf_matrix.csv
