In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from transformers import pipeline
import re


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
def Cleaning(text):
    if isinstance(text, str):
        return re.sub(r'[^A-Za-z\s]', '', text)
    elif isinstance(text, list):
        return [re.sub(r'[^A-Za-z\s]', '', t) for t in text]
    else:
        raise ValueError("Input must be a string or a list of strings")

def Normalization(text):
    return text.lower()

def Tokenization(text):
    return word_tokenize(text)

def Lemmatization(words):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words]

def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    return [word for word in words if word not in stop_words]

def get_unique_words(words):
    return list(set(words))

In [3]:
generator = pipeline("text-generation")

prompts = ["machine learning", "deep learning", "reinforcement"]
max_length = 70

generated_texts = []

for prompt in prompts:
    generated_text = generator(prompt, max_length=max_length, do_sample=False)[0]['generated_text']
    cleaned_text = Cleaning(generated_text)
    normalized_text = Normalization(cleaned_text)
    tokenized_text = Tokenization(normalized_text)
    lemmatized_text = Lemmatization(tokenized_text)
    without_stopwords_text = remove_stopwords(lemmatized_text)
    unique_words = get_unique_words(without_stopwords_text)
    generated_texts.append(unique_words)

print(generated_texts)

No model was supplied, defaulted to openai-community/gpt2 and revision 6c0e608 (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[['create', 'learning', 'called', 'second', 'first', 'language', 'learningclass', 'step', 'learningclassclass', 'third', 'learn', 'class', 'feature', 'used', 'machine', 'new'], ['important', 'process', 'part', 'learning', 'first', 'understand', 'neural', 'step', 'network', 'responsible', 'neuron', 'connected', 'deep'], ['land', 'reinforcement', 'law', 'people']]


In [8]:
import math
from collections import Counter

def calculate_tf(document):
    # Calculate term frequency (TF) for each term in the document without normalization
    word_count = Counter(document)
    tf_values = dict(word_count)
    return tf_values

def calculate_idf(documents):
    N = len(documents)
    idf_values = {}
    for document in documents:
        unique_words = set(document)
        for word in unique_words:
            if word in idf_values:
                idf_values[word] += 1
            else:
                idf_values[word] = 1

    for word, nt in idf_values.items():
        idf_values[word] = math.log((1 + N) / (1 + nt)) + 1

    return idf_values

def calculate_tfidf(tf_values, idf_values):
    tfidf_values = {word: tf_values[word] * idf_values[word] for word in tf_values}
    return tfidf_values

def normalize_tfidf(tfidf_values):
    sum_squares = sum(value**2 for value in tfidf_values.values())
    normalized_tfidf = {word: value / math.sqrt(sum_squares) for word, value in tfidf_values.items()}
    return normalized_tfidf


tf_values_list = [calculate_tf(doc) for doc in generated_texts]

idf_values = calculate_idf(generated_texts)

tfidf_values_list = [calculate_tfidf(tf_values, idf_values) for tf_values in tf_values_list]

normalized_tfidf_list = [normalize_tfidf(tfidf_values) for tfidf_values in tfidf_values_list]

for i, text in enumerate(generated_texts):
    print(f"Text {i+1}:")
    tfidf_values = {term: tf_values_list[i][term] * idf_values[term] for term in tf_values_list[i]}
    print("Normalized TF-IDF values:")
    sum_squares = sum(value**2 for value in tfidf_values.values())
    normalized_tfidf_values = {term: value / math.sqrt(sum_squares) for term, value in tfidf_values.items()}
    print(normalized_tfidf_values)
    print()

Text 1:
Normalized TF-IDF values:
{'create': 0.26050856502746905, 'learning': 0.19812347842972075, 'called': 0.26050856502746905, 'second': 0.26050856502746905, 'first': 0.19812347842972075, 'language': 0.26050856502746905, 'learningclass': 0.26050856502746905, 'step': 0.19812347842972075, 'learningclassclass': 0.26050856502746905, 'third': 0.26050856502746905, 'learn': 0.26050856502746905, 'class': 0.26050856502746905, 'feature': 0.26050856502746905, 'used': 0.26050856502746905, 'machine': 0.26050856502746905, 'new': 0.26050856502746905}

Text 2:
Normalized TF-IDF values:
{'important': 0.29191390489196817, 'process': 0.29191390489196817, 'part': 0.29191390489196817, 'learning': 0.22200804888354078, 'first': 0.22200804888354078, 'understand': 0.29191390489196817, 'neural': 0.29191390489196817, 'step': 0.22200804888354078, 'network': 0.29191390489196817, 'responsible': 0.29191390489196817, 'neuron': 0.29191390489196817, 'connected': 0.29191390489196817, 'deep': 0.29191390489196817}

Tex

In [9]:
import math
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", stop_words='english')

flattened_texts = [' '.join(text) for text in generated_texts]

tfidf_matrix = tfidf_vectorizer.fit_transform(flattened_texts)

feature_names = tfidf_vectorizer.get_feature_names_out()

tfidf_matrix = tfidf_matrix.toarray()
for i, text in enumerate(generated_texts):
    print(f"Text {i+1}:")
    tfidf_values = {}
    for j, term in enumerate(feature_names):
        tfidf_value = tfidf_matrix[i][j]
        if tfidf_value > 0:
            tfidf_values[term] = tfidf_value
    print("TF-IDF:", tfidf_values)
    print()


Text 1:
TF-IDF: {'called': 0.2756924555034742, 'class': 0.2756924555034742, 'create': 0.2756924555034742, 'feature': 0.2756924555034742, 'language': 0.2756924555034742, 'learn': 0.2756924555034742, 'learning': 0.20967121850838127, 'learningclass': 0.2756924555034742, 'learningclassclass': 0.2756924555034742, 'machine': 0.2756924555034742, 'new': 0.2756924555034742, 'second': 0.2756924555034742, 'step': 0.20967121850838127, 'used': 0.2756924555034742}

Text 2:
TF-IDF: {'connected': 0.31377733704463995, 'deep': 0.31377733704463995, 'important': 0.31377733704463995, 'learning': 0.2386357525755203, 'network': 0.31377733704463995, 'neural': 0.31377733704463995, 'neuron': 0.31377733704463995, 'process': 0.31377733704463995, 'responsible': 0.31377733704463995, 'step': 0.2386357525755203, 'understand': 0.31377733704463995}

Text 3:
TF-IDF: {'land': 0.5, 'law': 0.5, 'people': 0.5, 'reinforcement': 0.5}

