In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk

# Fetch the NLTK resources this notebook relies on: the Punkt tokenizer models,
# the stop-word lists and the WordNet database.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)


from transformers import pipeline, set_seed

# Text generation may sample tokens at random; seed the RNGs up front so that a
# "Restart & Run All" reproduces the same texts (and the matrices built from them).
SEED = 42
MAX_LENGTH = 50  # value passed as `max_length` to every generation call
set_seed(SEED)

# Initialize the text generation pipeline with the desired model
generator = pipeline('text-generation', model='gpt2')


def generate_text(prompt):
    """Run ``generator`` on ``prompt`` and return its raw output.

    The output is a list of dicts, each carrying the text under the
    'generated_text' key.
    """
    return generator(
        prompt,
        max_length=MAX_LENGTH,
        # Requested explicitly: the library warns when max_length is given
        # without an explicit truncation setting.
        truncation=True,
        # Same value the library would otherwise fall back to (EOS), set here
        # so the fallback is not re-applied and logged on every call.
        pad_token_id=generator.tokenizer.eos_token_id,
    )


# Generate text for the prompts
prompt1 = "machine learning"
generated_text1 = generate_text(prompt1)

prompt2 = "computer science"
generated_text2 = generate_text(prompt2)

print("Generated text for prompt 'machine learning':")
print(generated_text1)

print("\nGenerated text for prompt 'computer science':")
print(generated_text2)





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated text for prompt 'machine learning':
[{'generated_text': "machine learning in general — which is one of the major new areas that we will be talking about today. It's not new.\n\nI'm surprised this issue hasn't been up for discussion in a lot of places. It's certainly not new"}]

Generated text for prompt 'computer science':
[{'generated_text': 'computer science with data analysis will allow you to use the data in your analysis using various tools for creating real-time forecasts.\n\nA model like Hadoop can take any dataset and combine them to produce a "skeleton," with each dimension'}]


In [None]:
# Preprocessing function
def preprocess_document(document):
    """Normalize raw text into a space-separated string of lemmas.

    Steps, in order: lowercase the text, replace every character that is not
    an ASCII letter or whitespace with a space, tokenize with
    ``word_tokenize``, drop English stop words, lemmatize each remaining token
    with ``WordNetLemmatizer``, and drop stop words once more.

    Args:
        document: The raw text to clean (a ``str``).

    Returns:
        The surviving lemmas joined by single spaces; an empty string when no
        token survives.
    """
    # Build the stop-word set once per call instead of calling
    # stopwords.words() for every token; a set also makes each lookup O(1).
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cleaned = re.sub(r'[^a-zA-Z\s]', ' ', document.lower())
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(cleaned)
        if token not in stop_words
    ]
    # Second pass: a lemma can itself be a stop word even when the token it
    # came from was not, so filter again after lemmatizing.
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

In [None]:
class CustomTFIDF:
    """Small TF-IDF vectorizer for documents that are already tokenized by whitespace.

    Term frequency is the word count divided by the document length, IDF is the
    smoothed form ``ln((1 + n_docs) / (1 + doc_freq)) + 1``, and every document
    row is scaled to unit L2 norm. Columns follow the alphabetical order of
    the vocabulary.
    """

    def __init__(self):
        # word -> IDF weight; rebuilt by every fit_transform() call
        self.idf_ = {}
        # word -> column index (alphabetical order); rebuilt by every fit_transform() call
        self.vocab_ = {}

    def fit_transform(self, documents):
        """Learn vocabulary and IDF weights from ``documents`` and vectorize them.

        Each call starts from scratch, so an instance can be re-fitted on the
        same or a different corpus without results from an earlier call
        leaking into the new vocabulary or weights.

        Args:
            documents: A sized sequence of strings; each string is split on
                whitespace to obtain its words.

        Returns:
            A 2-D ``numpy.ndarray`` of shape ``(len(documents), len(self.vocab_))``
            holding the L2-normalized TF-IDF weights. Rows for documents
            without any word are all zeros; an empty corpus yields shape
            ``(0, 0)``.
        """
        doc_count = len(documents)

        # Document frequencies live in a local dict so that raw counts are
        # never mixed with the IDF values stored on the instance.
        doc_freq = {}
        term_freqs = []
        for document in documents:
            words = document.split()
            counts = {}
            for word in words:
                counts[word] = counts.get(word, 0) + 1
            # counts is empty whenever words is, so len(words) is never zero
            # where the division actually runs.
            term_freqs.append(
                {word: count / len(words) for word, count in counts.items()}
            )
            for word in counts:
                doc_freq[word] = doc_freq.get(word, 0) + 1

        # Sort the vocabulary alphabetically and assign column indices
        self.vocab_ = {word: idx for idx, word in enumerate(sorted(doc_freq))}

        # Smoothed inverse document frequency
        self.idf_ = {
            word: np.log((1 + doc_count) / (1 + freq)) + 1
            for word, freq in doc_freq.items()
        }

        # Fill the matrix row by row; each `row` is a view into `tfidf`, so the
        # in-place writes below land in the returned array.
        tfidf = np.zeros((doc_count, len(self.vocab_)))
        for row, doc_tf in zip(tfidf, term_freqs):
            for word, value in doc_tf.items():
                row[self.vocab_[word]] = value * self.idf_[word]
            # L2 normalization; all-zero rows are left untouched
            norm = np.linalg.norm(row)
            if norm > 0:
                row /= norm

        return tfidf

In [None]:
# Extract generated texts
generated_text_prompt1 = generated_text1[0]['generated_text']
generated_text_prompt2 = generated_text2[0]['generated_text']

# Incorporating generated texts into documents along with prompts
documents = [prompt1, generated_text_prompt1, prompt2, generated_text_prompt2]

# Preprocess documents
preprocessed_docs = [preprocess_document(doc) for doc in documents]

# Using CustomTFIDF
custom_tfidf = CustomTFIDF()
custom_tfidf_matrix = custom_tfidf.fit_transform(preprocessed_docs)

# Using sklearn for comparison.
# CustomTFIDF splits on whitespace and keeps every token, whereas sklearn's
# default token_pattern only keeps tokens of two or more characters. Matching
# on runs of non-whitespace makes both implementations see the same tokens, so
# the two matrices have the same columns whatever text was generated.
tfidf_vectorizer = TfidfVectorizer(token_pattern=r'\S+')
sklearn_tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_docs).toarray()

# Check the comparison numerically instead of relying on the printed matrices.
np.testing.assert_allclose(
    custom_tfidf_matrix,
    sklearn_tfidf_matrix,
    err_msg="CustomTFIDF and sklearn TfidfVectorizer disagree",
)

In [None]:
# Show the matrix produced by the hand-written implementation.
comparison_title = "Custom TF-IDF vs. sklearn TF-IDF:"
print(comparison_title)
print("Custom:", custom_tfidf_matrix)


Custom TF-IDF vs. sklearn TF-IDF:
Custom: [[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.70710678 0.         0.
  0.70710678 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         0.21203205 0.21203205 0.         0.
  0.         0.         0.         0.         0.21203205 0.
  0.21203205 0.         0.21203205 0.1671685  0.         0.21203205
  0.1671685  0.21203205 0.         0.63609615 0.21203205 0.21203205
  0.         0.         0.         0.         0.21203205 0.
  0.21203205 0.         0.21203205 0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.70710678
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.  

In [None]:
# Show the matrix produced by scikit-learn for side-by-side reading.
comparison_title = "Custom TF-IDF vs. sklearn TF-IDF:"
print(comparison_title)
print("sklearn:", sklearn_tfidf_matrix)

Custom TF-IDF vs. sklearn TF-IDF:
sklearn: [[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.70710678 0.         0.
  0.70710678 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         0.21203205 0.21203205 0.         0.
  0.         0.         0.         0.         0.21203205 0.
  0.21203205 0.         0.21203205 0.1671685  0.         0.21203205
  0.1671685  0.21203205 0.         0.63609615 0.21203205 0.21203205
  0.         0.         0.         0.         0.21203205 0.
  0.21203205 0.         0.21203205 0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.70710678
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0. 