In [1]:
import math
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords    

text="I know one thing for certain: don't settle for less than what you're capable of, but strive for something bigger. Some of you reading this might identify with this message because it resonates with you on a deeper level. For others, at the end of their tether the message might be nothing more than a trivial pep talk. What I wish to convey irrespective of where you are in your journey is: NEVER settle for less. If you settle for less, you will receive less than you deserve and convince yourself you are justified to receive it. If you have not achieved the success you deserve and are considering giving up, will you regret it in a few years or decades from now? Only you can answer that, but you should carve out time to discover your motivation for pursuing your goals. It’s a fact, if you don’t know what you want you’ll get what life hands you and it may not be in your best interest, affirms author Larry Weidel: “Winners know that if you don’t figure out what you want, you’ll get whatever life hands you.” The key is to develop a powerful vision of what you want and hold that image in your mind. Nurture it daily and give it life by taking purposeful action towards it."
sentences = sent_tokenize(text) # NLTK 함수
total_documents = len(sentences)
print(total_documents)

9


In [None]:
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

In [None]:
#documents에 등장하는 단어들을 위해 테이블 생성
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

In [None]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

In [None]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [None]:
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [None]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [None]:
# 1 각 문장에 있는 단어의 빈도 행렬 생성
freq_matrix = _create_frequency_matrix(sentences)

# 2 TF 계산 및 행렬 생성
tf_matrix = _create_tf_matrix(freq_matrix)
print("\n\ntf:\n",tf_matrix)

# 3 documents에 등장하는 단어들을 위해 행렬 생성
count_doc_per_words = _create_documents_per_words(freq_matrix)

'''
Inverse document frequency (IDF) is how unique or rare a word is.
'''
# 4 IDF 계산 및 행렬 생성
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
print("\n\nidf:\n", idf_matrix)

# 5 TF-IDF 계산 및 행렬 생성
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
print("\n\ntf-idf:\n",tf_idf_matrix)

In [None]:
# sklearn에서 간단하게 실행하기

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

In [3]:
docA = "The car is driven on the road"
docB = "The truck is driven on the highway" 

In [4]:
response = tfidf.fit_transform([docA, docB])

feature_names = tfidf.get_feature_names_out()
for col in response.nonzero()[1]:
    print(feature_names[col], ' - ', response[0, col])

road  -  0.42471718586982765
on  -  0.30218977576862155
driven  -  0.30218977576862155
is  -  0.30218977576862155
car  -  0.42471718586982765
the  -  0.6043795515372431
highway  -  0.0
truck  -  0.0
on  -  0.30218977576862155
driven  -  0.30218977576862155
is  -  0.30218977576862155
the  -  0.6043795515372431
