In [None]:
import os
import re

import nltk
import numpy as np
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
from transformers import set_seed

# Ensure necessary NLTK resources are available.
# 'punkt_tab' is the tokenizer data word_tokenize loads on NLTK >= 3.9 (it
# replaced the pickled 'punkt' models); 'punkt' is still fetched so older
# NLTK releases keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Preprocessing function
def preprocess_document(document):
    """Turn raw text into a space-joined string of lemmatized content words.

    Pipeline: lowercase -> replace every character that is not an ASCII
    letter or whitespace with a space -> tokenize with ``word_tokenize`` ->
    drop English stop words -> lemmatize with ``WordNetLemmatizer`` -> drop
    any lemma that is itself a stop word.

    Args:
        document (str): Text to clean.

    Returns:
        str: The surviving lemmas joined by single spaces ('' if none remain).
    """
    # Normalization
    document = document.lower()
    # Cleaning: digits, punctuation and other symbols become spaces
    document = re.sub(r'[^a-zA-Z\s]', ' ', document)
    # Tokenization
    tokens = word_tokenize(document)

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Stop words are removed BEFORE lemmatizing: the default (noun) lemmatizer
    # mangles some of them (e.g. 'was' -> 'wa', 'has' -> 'ha'), and the mangled
    # forms would no longer match the stop-word list.
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    )
    # Second pass: drop lemmas that turned into stop words after lemmatization.
    filtered_tokens = [lemma for lemma in lemmas if lemma not in stop_words]

    return ' '.join(filtered_tokens)

In [None]:
def get_unique(words):
    """Return the distinct whitespace-separated words of a string.

    Args:
        words (str): Text to split on whitespace.

    Returns:
        list[str]: Each distinct word once, in order of first appearance.
    """
    # dict keys keep insertion order, so the result is deterministic; iterating
    # a set of strings can yield a different order on every interpreter run.
    return list(dict.fromkeys(words.split()))

In [None]:
# Generate texts based on prompts

def generate_documents(prompts, text_generator=None, base_max_length=300,
                       length_step=50, temperature=0.9):
    """Generate one sampled continuation per prompt.

    Args:
        prompts: Iterable of prompt strings.
        text_generator: Callable invoked as
            ``text_generator(prompt, max_length=..., do_sample=True,
            temperature=...)``. When None, the notebook-level ``generator``
            is looked up at call time.
        base_max_length (int): Base value for ``max_length``.
        length_step (int): Amount added to ``max_length`` per prompt position
            (1-based), so prompt ``i`` gets ``base_max_length + length_step * i``.
        temperature (float): Sampling temperature forwarded to the generator.

    Returns:
        list: The generator's raw return value for each prompt, in input order.
    """
    if text_generator is None:
        # Resolved lazily so the pipeline may be created in a later cell than
        # this definition, as long as it exists before the first call.
        text_generator = generator

    return [
        text_generator(
            prompt,
            max_length=base_max_length + length_step * position,
            do_sample=True,
            temperature=temperature,
        )
        for position, prompt in enumerate(prompts, 1)
    ]

In [None]:
class CustomTFIDF:
    """Minimal TF-IDF vectorizer over whitespace-tokenized documents.

    Term frequency is the within-document relative frequency, IDF is the
    smoothed form ``ln((1 + n_docs) / (1 + df)) + 1`` (the same formula
    scikit-learn documents for ``smooth_idf=True``), and every document
    vector is L2-normalized. Columns follow the alphabetically sorted
    vocabulary.

    Attributes (populated by ``fit_transform``):
        idf_ (dict): term -> IDF weight.
        vocab_ (dict): term -> column index.
    """

    def __init__(self):
        self.idf_ = {}
        self.vocab_ = {}

    def fit_transform(self, documents):
        """Learn vocabulary and IDF from ``documents`` and return their TF-IDF matrix.

        Each call starts from scratch: ``idf_`` and ``vocab_`` are rebuilt, so
        reusing one instance on several corpora does not mix their statistics.

        Args:
            documents: Iterable of strings; each is tokenized with ``str.split()``.

        Returns:
            numpy.ndarray: Array of shape ``(n_documents, n_terms)``. Rows of
            documents without any token are all zeros.
        """
        documents = list(documents)
        doc_count = len(documents)

        # Per-document relative term frequencies, plus corpus-wide document
        # frequencies kept in a local dict (not self.idf_) so state from a
        # previous fit can never leak into this one.
        term_freqs = []
        doc_freq = {}
        for document in documents:
            words = document.split()
            counts = {}
            for word in words:
                counts[word] = counts.get(word, 0) + 1
            # An empty document has no counts, so len(words) == 0 is never
            # used as a divisor.
            term_freqs.append(
                {word: count / len(words) for word, count in counts.items()}
            )
            for word in counts:
                doc_freq[word] = doc_freq.get(word, 0) + 1

        # Sort the vocabulary alphabetically and assign column indices
        self.vocab_ = {word: idx for idx, word in enumerate(sorted(doc_freq))}

        # Smoothed inverse document frequency
        self.idf_ = {
            word: np.log((1 + doc_count) / (1 + freq)) + 1
            for word, freq in doc_freq.items()
        }

        # Fill the matrix row by row; each `row` is a view into `tfidf`
        tfidf = np.zeros((doc_count, len(self.vocab_)))
        for row, doc_tf in zip(tfidf, term_freqs):
            for word, value in doc_tf.items():
                row[self.vocab_[word]] = value * self.idf_[word]
            # L2 normalization (skipped for all-zero rows to avoid 0/0)
            norm = np.linalg.norm(row)
            if norm > 0:
                row /= norm

        return tfidf



In [None]:
# Seed the random number generators so that the sampled generations
# (do_sample=True) can be repeated on the same setup
SEED = 42
set_seed(SEED)

# Initialize pipeline for text generation
generator = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B')

# Define prompts for text generation
prompts = [
    "America's intervention in World War II changed the course of history and made it influential in all countries of the world",
    "The emergence of artificial intelligence is one of the most important factors in the development of life in the future",
    "Reading on various topics has an impact on forming the culture and awareness of young people"
]


# Example usage
documents = generate_documents(prompts)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# Show a bounded preview of each generated text instead of dumping the whole
# nested pipeline output into the notebook
PREVIEW_CHARS = 300
for doc_number, doc in enumerate(documents, 1):
    text = doc[0]['generated_text']
    suffix = '...' if len(text) > PREVIEW_CHARS else ''
    print(f"doc {doc_number}: {text[:PREVIEW_CHARS]}{suffix}")

[[{'generated_text': "America's intervention in World War II changed the course of history and made it influential in all countries of the world except the U.S.\n\nIn the name of the father who was killed, the son who was wounded and the daughter who was killed, American forces brought order to chaos and destruction. The Japanese who were brought into the war by American forces had no military leadership and were willing to use the power of government and force to achieve their political and economic goals.\n\nAmerican forces did not help the Japanese who were already in the war with little resistance. The forces that were brought into the war by Americans did not help the Japanese who had been fighting the Japanese for thousands of years.\n\nWorld War II was brought about by the forces of a nation whose leaders were determined to see an end to the war in Asia and who believed that Japan was their enemy. The forces of the United States who made war happen were determined to see an end 

In [None]:
# Row of the per-sentence TF-IDF matrix that is printed for each generated text
SAMPLE_ROW = 2

for doc_number, doc in enumerate(documents, 1):
    # Extract the string from the dictionary
    generated_text = doc[0]['generated_text']

    # Flatten newlines, then treat every '. '-separated piece as one sentence
    formatted_text = generated_text.replace('\n', ' ')
    sentences = formatted_text.split('. ')

    # Remove any leading or trailing whitespace from each sentence
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

    preprocessed_docs = [preprocess_document(sen) for sen in sentences]
    unique = [get_unique(text) for text in preprocessed_docs]

    print("------------------")
    print("doc", doc_number)
    print("------------------")

    if not preprocessed_docs:
        # Nothing to vectorize for this text
        print("No sentences found; skipping TF-IDF comparison.")
        continue

    custom_tfidf = CustomTFIDF()
    custom_tfidf_matrix = custom_tfidf.fit_transform(preprocessed_docs)

    # Using sklearn for comparison
    tfidf_vectorizer = TfidfVectorizer()
    sklearn_tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_docs).toarray()

    # Clamp the sample row so short texts (fewer than SAMPLE_ROW + 1 sentences)
    # do not raise IndexError; the label reports the row actually shown.
    row = min(SAMPLE_ROW, len(preprocessed_docs) - 1)
    print(f"Custom TF-IDF vs. sklearn TF-IDF (sentence vector at index {row}):")
    print("Custom:", custom_tfidf_matrix[row])
    print("sklearn:", sklearn_tfidf_matrix[row])

    # Numeric comparison of the full matrices. Shapes can differ because
    # TfidfVectorizer's default token pattern ignores single-character tokens,
    # whereas CustomTFIDF keeps every whitespace-separated token.
    if custom_tfidf_matrix.shape == sklearn_tfidf_matrix.shape:
        print("Matrices match:", np.allclose(custom_tfidf_matrix, sklearn_tfidf_matrix))
    else:
        print("Vocabulary sizes differ:",
              custom_tfidf_matrix.shape, "vs", sklearn_tfidf_matrix.shape)


------------------
doc 1
------------------
Custom TF-IDF vs. sklearn TF-IDF (first document vector):
Custom: [0.         0.         0.2479389  0.         0.         0.
 0.20057366 0.         0.         0.         0.         0.
 0.20057366 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.28553481 0.         0.         0.         0.         0.
 0.         0.         0.         0.27260606 0.28553481 0.28553481
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.18366827 0.
 0.         0.28553481 0.         0.         0.28553481 0.
 0.         0.         0.         0.         0.28553481 0.28553481
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.28553481 0.         0.
 0.12746559 0.         0.28553481 0.         0.         0.
 0.        ]
sklearn: [0.         0.         0.2479389  0.         0.         0.
 0.20057366 0.         0. 