# 4. 情報理論

### TFIDFの実装

In [1]:
import math
class TFIDF:
    """Compute TF-IDF weights for a fixed list of terms over a set of documents.

    Matching relies on ``term in document`` and ``document.count(term)``,
    so for string documents a term is matched as a substring.

    Args:
        terms: Sequence of terms to score.
        documents: Sequence of documents; each must support ``in`` and
            ``.count()`` (e.g. ``str``).
    """

    def __init__(self, terms, documents):
        self.terms = terms
        self.documents = documents

    def get_idf(self):
        """Return the inverse document frequency of each term.

        IDF is ``log(N / df)`` where ``N`` is the number of documents and
        ``df`` the number of documents containing the term.

        Returns:
            A list of floats aligned with ``self.terms``. A term that occurs
            in no document gets ``0.0`` instead of raising
            ``ZeroDivisionError``.
        """
        total_docs = len(self.documents)
        idfs = []
        for term in self.terms:
            doc_cnt = sum(1 for doc in self.documents if term in doc)
            # A term absent from every document has tf == 0 everywhere, so
            # its tf-idf is 0 whatever idf is; 0.0 avoids dividing by zero.
            idfs.append(math.log(total_docs / doc_cnt) if doc_cnt else 0.0)
        return idfs

    def get_tf(self, document):
        """Return the relative frequency of each term within ``document``.

        Counts are normalized by the total number of occurrences of all
        terms in the document (not by the document length).

        Args:
            document: The document to analyse.

        Returns:
            A list of floats aligned with ``self.terms``; all ``0.0`` when
            the document contains none of the terms.
        """
        # Use the instance's own terms rather than a module-level global.
        term_cnt_list = [document.count(term) for term in self.terms]
        total_term_cnt = sum(term_cnt_list)
        if total_term_cnt == 0:
            return [0.0] * len(term_cnt_list)
        return [term_cnt / total_term_cnt for term_cnt in term_cnt_list]

    def run(self):
        """Return the TF-IDF matrix as a list of rows.

        Returns:
            One list per document (in ``self.documents`` order), each holding
            ``tf * idf`` for every term (in ``self.terms`` order).
        """
        # IDF does not depend on the current document; compute it once.
        idfs = self.get_idf()
        tfidfs = []
        for doc in self.documents:
            tfidfs.append([tf * idf for tf, idf in zip(self.get_tf(doc), idfs)])
        return tfidfs


In [2]:
import pandas as pd

# Vocabulary to score, and a toy corpus of comma-joined city names.
terms = [
    "Tokyo",
    "Osaka",
    "Nagoya",
]
documents = [
    "Tokyo,Tokyo",
    "Tokyo,Osaka",
    "Osaka,Nagoya",
]

# One row of TF-IDF weights per document, one column per term.
tfidfs = TFIDF(terms=terms, documents=documents).run()
df = pd.DataFrame(data=tfidfs, columns=terms, index=documents)
df

Unnamed: 0,Tokyo,Osaka,Nagoya
"Tokyo,Tokyo",0.405465,0.0,0.0
"Tokyo,Osaka",0.202733,0.202733,0.0
"Osaka,Nagoya",0.0,0.202733,0.549306


Nagoyaは3つの文書のうち1つにしか出現しないためIDFが最も大きく(log 3 ≈ 1.0986)、その文書でのTFIDF値(0.549306)が全体で最大となっている。