# Generating Documents

# Using GPT-2 for Document Generation

In [31]:
from transformers import pipeline, set_seed

# Cache of text-generation pipelines keyed by model name, so the model is
# loaded once instead of on every call.
_TEXT_GENERATORS = {}


def _get_text_generator(model_name='gpt2'):
    """Return the text-generation pipeline for ``model_name``, building it only on first use."""
    if model_name not in _TEXT_GENERATORS:
        _TEXT_GENERATORS[model_name] = pipeline('text-generation', model=model_name)
    return _TEXT_GENERATORS[model_name]


def generate_simple_document(phrase, seed=42, max_length=200):
    """Generate one GPT-2 continuation of ``phrase``.

    Args:
        phrase: Prompt text handed to the text-generation pipeline.
        seed: Value passed to ``set_seed`` right before generation.
        max_length: Forwarded to the pipeline call as ``max_length``.

    Returns:
        The ``'generated_text'`` entry of the single returned sequence.
    """
    generator = _get_text_generator()

    # Seed immediately before sampling (after any model loading) so the result
    # for a given seed does not depend on whether the pipeline was just built
    # or came from the cache.
    set_seed(seed)
    generated = generator(phrase, max_length=max_length, num_return_sequences=1)
    return generated[0]['generated_text']

# One seed prompt per subject area
phrases = dict(
    technology="The impact of artificial intelligence in our world",
    healthcare="healthcare field have seen major advancements ",
    finance="Emerging trends in personal finance management tools for the digital age",
)

# Generate one document per prompt, following the dictionary's insertion order
documents = [generate_simple_document(prompt) for prompt in phrases.values()]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=

In [35]:
# Display the list of generated documents (one string per prompt)
documents

['The impact of artificial intelligence in our world is a massive one. Not just with information about everything in our bodies being spread in vast, uncontrolled regions, but with our brain being used so cheaply that we only pay attention when we smell this stuff.\n\nThis is also true while we are running away with things. In a perfect world our brains weren\'t such a big deal, we wouldn\'t need money to care if our brains were infected by a virus that\'s like Ebola. It wouldn\'t be an issue at all. We\'d be fine. It\'s not even a big worry.\n\n"Human intelligence, like everything else in the world – it can solve any problem – does not always solve all. No human has solved the world as successfully by our own efforts in solving it. Humans do understand our challenges of getting more and more smarter. And in fact, they understand the complexities we face, when it comes to making the most of them."\n\nWe\'re going to turn you',
 "healthcare field have seen major advancements \xa0over ti

# Preprocessing documents

In [36]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

def preprocess_document(document, stem=True, unique=True):
    """Clean, tokenize and filter a raw text document.

    The text is stripped line by line and re-joined, tokenized with
    ``word_tokenize``, reduced to purely alphabetic tokens in lower case,
    filtered against NLTK's English stop-word list, and optionally stemmed
    with ``PorterStemmer``.

    Args:
        document: Raw text of one document.
        stem: When True (default) the returned words are Porter stems;
            when False the filtered words are returned unstemmed.
        unique: When True (default) return a ``set`` of distinct words, as
            before. When False return a ``list`` of the words in document
            order with repeats kept, which preserves term frequencies.

    Returns:
        A set of distinct words, or a list of words when ``unique`` is False.
    """
    # Strip every line, split on double spaces, and re-join into one string
    lines = (line.strip() for line in document.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    cleaned_text = " ".join(chunks)

    tokens = word_tokenize(cleaned_text)

    # isalpha() already rejects punctuation and digits, so no separate
    # punctuation pass is needed afterwards.
    tokens = [word.lower() for word in tokens if word.isalpha()]

    stop_words = set(stopwords.words("english"))
    kept_tokens = [word for word in tokens if word not in stop_words]

    # Only pay for stemming when the caller asked for it
    if stem:
        stemmer = PorterStemmer()
        kept_tokens = [stemmer.stem(word) for word in kept_tokens]

    return set(kept_tokens) if unique else kept_tokens

# TF-IDF Builtin

In [37]:
# Preprocess every generated document with the default settings
cleaned_docs = [preprocess_document(text) for text in documents]

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-join each document's preprocessed words into one space-separated string.
# Note: the vectorizer below is fitted on the raw `documents`, not on these strings.
preprocessed_texts = [" ".join(words) for words in cleaned_docs]

# TfidfVectorizer using NLTK's English stop words. norm='l2' (the default)
# scales each document vector so the sum of its squared entries is 1.
vectorizer = TfidfVectorizer(stop_words=stopwords.words("english"), norm='l2')

# Learn the vocabulary and IDF weights, and build the document-term matrix
tfidf_matrix = vectorizer.fit_transform(documents)

# Print every stored (non-zero) normalized TF-IDF score, document by document
feature_names = vectorizer.get_feature_names_out()
for doc_idx in range(tfidf_matrix.shape[0]):
    row = tfidf_matrix.getrow(doc_idx)
    print(f"Document {doc_idx+1} TF-IDF Scores:")
    # row.indices holds the column of each stored entry, row.data its value
    for column, score in zip(row.indices, row.data):
        print(f"  {feature_names[column]}: {score:.4f}")
    print()

Document 1 TF-IDF Scores:
  turn: 0.1004
  going: 0.1004
  making: 0.1004
  comes: 0.1004
  face: 0.1004
  complexities: 0.1004
  fact: 0.1004
  smarter: 0.1004
  getting: 0.1004
  challenges: 0.1004
  understand: 0.2008
  humans: 0.1004
  solving: 0.1004
  efforts: 0.1004
  successfully: 0.1004
  solved: 0.1004
  always: 0.1004
  problem: 0.1004
  solve: 0.2008
  else: 0.1004
  human: 0.2008
  worry: 0.1004
  even: 0.1004
  fine: 0.1004
  issue: 0.1004
  ebola: 0.1004
  like: 0.2008
  virus: 0.1004
  infected: 0.1004
  care: 0.0764
  money: 0.1004
  need: 0.1004
  deal: 0.1004
  big: 0.2008
  brains: 0.2008
  perfect: 0.1004
  things: 0.1004
  away: 0.1004
  running: 0.1004
  true: 0.1004
  also: 0.1004
  stuff: 0.1004
  smell: 0.1004
  attention: 0.1004
  pay: 0.1004
  cheaply: 0.1004
  used: 0.1004
  brain: 0.1004
  regions: 0.1004
  uncontrolled: 0.1004
  vast: 0.1004
  spread: 0.1004
  bodies: 0.1004
  everything: 0.2008
  information: 0.1004
  one: 0.0764
  massive: 0.1004
  worl

In [39]:
# The sparse matrix exposes .shape directly; no need to densify it with toarray()
print("Matrix Shape  = " , tfidf_matrix.shape)

Matrix Shape  =  (3, 193)


In [40]:
# Show the TF-IDF entries stored for the first document
word_index = 0  # defined here but not used by the print below
doc_index = 0
first_doc_row = tfidf_matrix.getrow(doc_index)
print(first_doc_row)  # the sparse representation lists only non-zero values

  (0, 174)	0.10042428562400375
  (0, 77)	0.10042428562400375
  (0, 104)	0.10042428562400375
  (0, 37)	0.10042428562400375
  (0, 66)	0.10042428562400375
  (0, 40)	0.10042428562400375
  (0, 67)	0.10042428562400375
  (0, 156)	0.10042428562400375
  (0, 76)	0.10042428562400375
  (0, 35)	0.10042428562400375
  (0, 178)	0.2008485712480075
  (0, 85)	0.10042428562400375
  (0, 160)	0.10042428562400375
  (0, 59)	0.10042428562400375
  (0, 165)	0.10042428562400375
  (0, 159)	0.10042428562400375
  (0, 11)	0.10042428562400375
  (0, 136)	0.10042428562400375
  (0, 158)	0.2008485712480075
  (0, 60)	0.10042428562400375
  (0, 84)	0.2008485712480075
  (0, 191)	0.10042428562400375
  (0, 64)	0.10042428562400375
  (0, 73)	0.10042428562400375
  (0, 92)	0.10042428562400375
  :	:
  (0, 167)	0.10042428562400375
  (0, 23)	0.10042428562400375
  (0, 146)	0.10042428562400375
  (0, 173)	0.10042428562400375
  (0, 10)	0.10042428562400375
  (0, 164)	0.10042428562400375
  (0, 157)	0.10042428562400375
  (0, 21)	0.1004242856

# BONUS : TF-IDF from scratch

In [41]:
# Without stemmization as the default 
processed_docs = []
for d in documents:
    processed_docs.append(list(preprocess_document(d, False)))

In [42]:
from collections import Counter
import math
import numpy as np



# Sorted vocabulary over all documents, and each word's column index
vocabulary = sorted({word for doc in processed_docs for word in doc})
vocab_index = {word: i for i, word in enumerate(vocabulary)}

# 1. Term frequency: occurrences of the word divided by the document's token count.
#    Every word comes from processed_docs, so it is always present in vocab_index.
#    NOTE: the counts are only meaningful if processed_docs keeps repeated tokens;
#    a de-duplicated document gives every present word the same frequency.
tf_vectors = np.zeros((len(processed_docs), len(vocabulary)))
for doc_idx, doc in enumerate(processed_docs):
    for word, count in Counter(doc).items():
        tf_vectors[doc_idx, vocab_index[word]] = count / len(doc)

# 2. Document frequency: number of documents containing each word
df_counts = np.zeros(len(vocabulary))
for doc in processed_docs:
    for word in set(doc):
        df_counts[vocab_index[word]] += 1

# Smoothed IDF, the same formula scikit-learn uses with smooth_idf=True
idf_scores = np.log((1 + len(processed_docs)) / (1 + df_counts)) + 1  # Smoothing

# 3. Calculate TF-IDF scores
tfidf_matrix2 = tf_vectors * idf_scores

# 4. Normalize TF-IDF vectors (L2 norm).
#    A document with no tokens has norm 0; divide such rows by 1 so they stay
#    all-zero instead of turning into NaN.
tfidf_norm = np.linalg.norm(tfidf_matrix2, axis=1, keepdims=True)
safe_norm = np.where(tfidf_norm == 0, 1.0, tfidf_norm)
scratch_tfidf_matrix = tfidf_matrix2 / safe_norm

# Check the shape of the matrix
print("Shape of the custom TF-IDF matrix:", scratch_tfidf_matrix.shape)

Shape of the custom TF-IDF matrix: (3, 193)


# Comparing the Vocabulary of the Built-in and From-Scratch Implementations

In [43]:
# Print the from-scratch vocabulary next to the vectorizer's feature names
builtin_vocabulary = vectorizer.get_feature_names_out()
for scratch_word, builtin_word in zip(vocabulary, builtin_vocabulary):
    print(scratch_word, " - ", builtin_word)

aca  -  aca
according  -  according
accounted  -  accounted
act  -  act
addition  -  addition
advancements  -  advancements
affordable  -  affordable
age  -  age
alcoholism  -  alcoholism
allow  -  allow
also  -  also
always  -  always
americans  -  americans
among  -  among
app  -  app
applications  -  applications
approximately  -  approximately
apps  -  apps
area  -  area
artificial  -  artificial
associated  -  associated
attention  -  attention
average  -  average
away  -  away
best  -  best
big  -  big
biggest  -  biggest
blood  -  blood
bodies  -  bodies
brain  -  brain
brains  -  brains
card  -  card
care  -  care
cash  -  cash
certain  -  certain
challenges  -  challenges
cheaply  -  cheaply
comes  -  comes
compared  -  compared
competitive  -  competitive
complexities  -  complexities
conditions  -  conditions
conducted  -  conducted
consequences  -  consequences
consumer  -  consumer
consumers  -  consumers
cost  -  cost
coverage  -  coverage
covered  -  covered
deal  -  dea

# Comparing IDF Calculations

In [44]:
# IDF weights learned by the fitted TfidfVectorizer
idf_values_builtin = vectorizer.idf_
# IDF weights from the from-scratch computation
idf_values_custom = idf_scores

# Print the two IDF vectors side by side, one vocabulary entry per line
for builtin_idf, custom_idf in zip(idf_values_builtin, idf_values_custom):
    print(builtin_idf, " - ", custom_idf)

1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.2876820724517808  -  1.2876820724517808
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.6931471805599454
1.6931471805599454  -  1.693147180

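# Comparing Matrix Values

The vocabulary and IDF values match, but the final scores do not: the from-scratch input was de-duplicated during preprocessing, so every word in a document has the same term frequency, while `TfidfVectorizer` counts repeated words.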

In [52]:
# Compare the first ten vocabulary entries of the third document:
# built-in score (4 decimals) on the left, from-scratch score on the right
doc_idx = 2
doc = tfidf_matrix[doc_idx].toarray().ravel()

print(f"Document {doc_idx+1} TF-IDF Scores:")
for idx, builtin_score in enumerate(doc[:10]):
    scratch_score = scratch_tfidf_matrix[doc_idx, idx]
    print(f"  {feature_names[idx]}: {builtin_score:.4f} - {scratch_score}")

Document 3 TF-IDF Scores:
  aca: 0.0000 - 0.0
  according: 0.0000 - 0.0
  accounted: 0.1132 - 0.13276197705502987
  act: 0.0000 - 0.0
  addition: 0.0000 - 0.0
  advancements: 0.0000 - 0.0
  affordable: 0.0000 - 0.0
  age: 0.0430 - 0.10096890555047883
  alcoholism: 0.0000 - 0.0
  allow: 0.0000 - 0.0
