In [1]:
import pandas as pd
from collections import Counter

### Bag of words

In [2]:
documents = [
    "I love watching movies",
    "I enjoy reading books",
    "I love reading and watching movies"
]

#### prerpocessing sentences : lower case and tokenize

In [3]:
tokenized_documents = [doc.lower().split() for doc in documents]
tokenized_documents

[['i', 'love', 'watching', 'movies'],
 ['i', 'enjoy', 'reading', 'books'],
 ['i', 'love', 'reading', 'and', 'watching', 'movies']]

#### Calculate word frequencies for each document

In [4]:
word_frequencies = [Counter(doc) for doc in tokenized_documents] # 단어별 빈도 조사
word_frequencies

[Counter({'i': 1, 'love': 1, 'watching': 1, 'movies': 1}),
 Counter({'i': 1, 'enjoy': 1, 'reading': 1, 'books': 1}),
 Counter({'i': 1,
          'love': 1,
          'reading': 1,
          'and': 1,
          'watching': 1,
          'movies': 1})]

In [5]:
bow_df = pd.DataFrame(word_frequencies, index=["Document 1", "Document 2", "Document 3"]).fillna(0)
bow_df

Unnamed: 0,i,love,watching,movies,enjoy,reading,books,and
Document 1,1,1.0,1.0,1.0,0.0,0.0,0.0,0.0
Document 2,1,0.0,0.0,0.0,1.0,1.0,1.0,0.0
Document 3,1,1.0,1.0,1.0,0.0,1.0,0.0,1.0


### TF-IDF

In [6]:
import pandas as pd
import numpy as np
from collections import Counter
import math



In [7]:
documents = [
    "I love watching movies",
    "I enjoy reading books",
    "I love reading and watching movies"
]

tokenized_documents = [doc.lower().split() for doc in documents]
tokenized_documents

[['i', 'love', 'watching', 'movies'],
 ['i', 'enjoy', 'reading', 'books'],
 ['i', 'love', 'reading', 'and', 'watching', 'movies']]

In [8]:
def term_frequency(doc):
    term_count = Counter(doc)
    
    total_terms = len(doc)
    
    return {term: count / total_terms for term, count in term_count.items()}

In [9]:
tf_values = [term_frequency(doc) for doc in tokenized_documents]
tf_values

[{'i': 0.25, 'love': 0.25, 'watching': 0.25, 'movies': 0.25},
 {'i': 0.25, 'enjoy': 0.25, 'reading': 0.25, 'books': 0.25},
 {'i': 0.16666666666666666,
  'love': 0.16666666666666666,
  'reading': 0.16666666666666666,
  'and': 0.16666666666666666,
  'watching': 0.16666666666666666,
  'movies': 0.16666666666666666}]

In [10]:
tf_df = pd.DataFrame(tf_values).fillna(0)
tf_df

Unnamed: 0,i,love,watching,movies,enjoy,reading,books,and
0,0.25,0.25,0.25,0.25,0.0,0.0,0.0,0.0
1,0.25,0.0,0.0,0.0,0.25,0.25,0.25,0.0
2,0.166667,0.166667,0.166667,0.166667,0.0,0.166667,0.0,0.166667


#### Calculate inverse document frequency

In [11]:
def inverse_document_frequeny(docs):
    total_docs = len(docs)
    
    unique_terms = set(term for doc in docs for term in doc)
    print(unique_terms)
    term_doc_counts = {term: sum(1 for doc in docs if term in doc) for term in unique_terms}
    print(term_doc_counts)
    return {term : math.log(total_docs / count) for term, count in term_doc_counts.items()}

In [12]:
idf_values = inverse_document_frequeny(tokenized_documents)
idf_values

{'books', 'reading', 'movies', 'i', 'enjoy', 'love', 'watching', 'and'}
{'books': 1, 'reading': 2, 'movies': 2, 'i': 3, 'enjoy': 1, 'love': 2, 'watching': 2, 'and': 1}


{'books': 1.0986122886681098,
 'reading': 0.4054651081081644,
 'movies': 0.4054651081081644,
 'i': 0.0,
 'enjoy': 1.0986122886681098,
 'love': 0.4054651081081644,
 'watching': 0.4054651081081644,
 'and': 1.0986122886681098}

In [13]:
idf_df = pd.DataFrame([idf_values], index=["IDF"]).fillna(0)
idf_df

Unnamed: 0,books,reading,movies,i,enjoy,love,watching,and
IDF,1.098612,0.405465,0.405465,0.0,1.098612,0.405465,0.405465,1.098612


#### calculate TF_IDF

In [14]:
def tf_idf(tf, idf):
    return {term : tf_val * idf[term] for term, tf_val in tf.items()}

In [15]:
tf_idf_values = [tf_idf(tf, idf_values) for tf in tf_values]
tf_idf_values

[{'i': 0.0,
  'love': 0.1013662770270411,
  'watching': 0.1013662770270411,
  'movies': 0.1013662770270411},
 {'i': 0.0,
  'enjoy': 0.27465307216702745,
  'reading': 0.1013662770270411,
  'books': 0.27465307216702745},
 {'i': 0.0,
  'love': 0.06757751801802739,
  'reading': 0.06757751801802739,
  'and': 0.1831020481113516,
  'watching': 0.06757751801802739,
  'movies': 0.06757751801802739}]

In [16]:
tf_idf_df = pd.DataFrame(tf_idf_values, index=["Document 1", "Document 2", "Document 3"]).fillna(0)
tf_idf_df

Unnamed: 0,i,love,watching,movies,enjoy,reading,books,and
Document 1,0.0,0.101366,0.101366,0.101366,0.0,0.0,0.0,0.0
Document 2,0.0,0.0,0.0,0.0,0.274653,0.101366,0.274653,0.0
Document 3,0.0,0.067578,0.067578,0.067578,0.0,0.067578,0.0,0.183102
