In [21]:
from collections import Counter
import math
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
 
generator=pipeline('text-generation',model='gpt2')


text1 = generator('accounting ', max_length=50)[0]['generated_text']
text2 = generator('Graphic designer', max_length=50)[0]['generated_text']
text3 = generator('mecanical engineering ', max_length=50)[0]['generated_text']
text4 = generator('artificial intelligence', max_length=50)[0]['generated_text']

documents = [text1 + " " + text2 + " " + text3 + " " + text4]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [23]:
print("Document 1:", text1)
print("Document 2:", text2)
print("Document 3:", text3)
print("Document 4:", text4)

Document 1: accounting  with a new password (or just have a new password that will not be reset on boot) is done in sudo userconfig.
sudo userconfig.sudo
# -e /users/test{}/dev/null
Document 2: Graphic designer: Jef Gifford

Jef Gifford created his design for the second season of "The Last Temptation of Christ." One of his favorites, it's a work of "literary fiction." When you
Document 3: mecanical engineering  in the US since 1992.  I was just beginning to learn about their processes of manufacture and what are they supposed to do.  I think the general rule of thumb for their work is that if you are going to
Document 4: artificial intelligence has shown it, or even that it was created as an experiment on behalf of a university in the US and thus not "real" work.

So what would you do, if you were a person who were "committed


In [24]:
# Step 2: Preprocess the documents
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def tokenize(text):
    tokens = []
    current_token = ""
    for char in text:
        if char.isalnum():  # Check if the character is alphanumeric
            current_token += char
        else:
            if current_token:
                tokens.append(current_token.lower())  # Add the token to the list
                current_token = ""
            if char.strip():  # Check if the character is not whitespace
                tokens.append(char)  # Add non-alphanumeric characters as separate tokens
    if current_token:
        tokens.append(current_token.lower())  # Add the last token if any
    return tokens

def preprocess_document(doc):
    # Remove punctuation
    doc = doc.translate(str.maketrans('', '', string.punctuation))
    # Tokenization
    tokens = tokenize(doc)
    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Remove stop words
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words]
    return filtered_tokens

preprocessed_documents = [preprocess_document(doc) for doc in documents]


In [25]:


# Step 3: Calculate TF (Term Frequency)
tf_matrix = []
for doc_tokens in preprocessed_documents:
    doc_word_count = len(doc_tokens)
    word_freq = Counter(doc_tokens)
    tf = {word: word_freq[word] / doc_word_count for word in word_freq}
    tf_matrix.append(tf)

In [26]:
# Step 4: Calculate IDF (Inverse Document Frequency)
total_docs = len(preprocessed_documents)
word_doc_freq = {}
for doc_tokens in preprocessed_documents:
    for word in set(doc_tokens):
        word_doc_freq[word] = word_doc_freq.get(word, 0) + 1

idf = {word: math.log(total_docs / (word_doc_freq[word] + 1)) for word in word_doc_freq}

In [27]:

# Step 5: Multiply TF by IDF to get TFIDF
tfidf_matrix = []
for tf in tf_matrix:
    tfidf = {word: tf[word] * idf[word] for word in tf}
    tfidf_matrix.append(tfidf)

# Output TFIDF for each document
print("TFIDF for each document:")
for i, doc in enumerate(documents):
    print("Document", i+1, ":", tfidf_matrix[i])


TFIDF for each document:
Document 1 : {'accounting': -0.01050223000848402, 'new': -0.02100446001696804, 'password': -0.02100446001696804, 'reset': -0.01050223000848402, 'boot': -0.01050223000848402, 'done': -0.01050223000848402, 'sudo': -0.02100446001696804, 'userconfig': -0.01050223000848402, 'userconfigsudo': -0.01050223000848402, 'e': -0.01050223000848402, 'userstestdevnull': -0.01050223000848402, 'graphic': -0.01050223000848402, 'designer': -0.01050223000848402, 'jef': -0.02100446001696804, 'gifford': -0.02100446001696804, 'created': -0.02100446001696804, 'design': -0.01050223000848402, 'second': -0.01050223000848402, 'season': -0.01050223000848402, 'last': -0.01050223000848402, 'temptation': -0.01050223000848402, 'christ': -0.01050223000848402, 'one': -0.01050223000848402, 'favorite': -0.01050223000848402, 'work': -0.03150669002545206, 'literary': -0.01050223000848402, 'fiction': -0.01050223000848402, 'mecanical': -0.01050223000848402, 'engineering': -0.01050223000848402, 'u': -0.