In [43]:
# This notebook explores term weighting (TF, IDF) for information retrieval (IR).
# Toy corpus: a list of (document name, document text) pairs.
collection = [
    ('Document1', 'This is a sample'),
    ('Document2', 'This is another sample')
]

In [44]:
# In-memory structure (plays the role of a hash keyed by term).
# Lexicon of all index terms: {term: index of a posting in globalPosting, ...}
globalLexicon = {}

# Names of all indexed documents; the list position serves as the document id.
# [doc0Name, doc1Name, ...]
globalDocument = []

# Stand-in for the on-disk posting file (intended to live in a file, not memory).
# Each entry says which term occurred in which document, how many times, and
# where that term's next posting is stored:
# [(lexiconIdx, documentIdx, freq, nextPostingIdx), ...]
globalPosting = []


In [45]:
# Build the inverted index: one pass per document, then merge the document's
# term counts into the global lexicon / posting list.
for (docName, docContent) in collection:
    # The list position doubles as the document id (it stands in for a pointer).
    # Document names are assumed never to collide.
    docIdx = len(globalDocument)
    globalDocument.append(docName)

    # Local step: term frequencies of this document, {term: frequency, ...}.
    localPosting = dict()
    for term in docContent.lower().split():
        localPosting[term] = localPosting.get(term, 0) + 1

    # Global merge: push one posting per distinct term onto the head of that
    # term's chain in globalPosting.
    for indexTerm, termFreq in localPosting.items():  # indexTerm: str, termFreq: int
        # Current head of the term's chain, or -1 (end-of-chain marker) when
        # the term has not been seen before. Posting indices are never negative.
        beforeIdx = globalLexicon.get(indexTerm, -1)

        if beforeIdx == -1:
            # New term: its id is its insertion position in the lexicon.
            lexiconIdx = len(globalLexicon)
        else:
            # Known term: the head posting already stores the term id, so read
            # it in O(1) rather than scanning list(globalLexicon.keys()), which
            # is linear per lookup and depends on dict insertion order.
            lexiconIdx = globalPosting[beforeIdx][0]

        postingIdx = len(globalPosting)  # append position (the "fseek" offset)
        postingData = (lexiconIdx, docIdx, termFreq, beforeIdx)
        globalPosting.append(postingData)
        globalLexicon[indexTerm] = postingIdx  # lexicon points at the newest posting

In [46]:
# Display the lexicon (term -> index of its newest posting) and the document list.
globalLexicon, globalDocument

({'this': 4, 'is': 5, 'a': 2, 'sample': 7, 'another': 6},
 ['Document1', 'Document2'])

In [47]:
# Display the raw posting list: (lexiconIdx, documentIdx, freq, nextPostingIdx) tuples.
globalPosting


[(0, 0, 1, -1),
 (1, 0, 1, -1),
 (2, 0, 1, -1),
 (3, 0, 1, -1),
 (0, 1, 1, 0),
 (1, 1, 1, 1),
 (4, 1, 1, -1),
 (3, 1, 1, 3)]

In [48]:
# Walk every term's posting chain, starting from the posting the lexicon points at.
for indexTerm, postingIdx in globalLexicon.items():
    # indexTerm: the term, postingIdx: index of the chain head in globalPosting
    print(indexTerm)

    # A next pointer of -1 marks the end of the chain.
    while postingIdx != -1:
        postingData = globalPosting[postingIdx]
        print('  DocName:{0} - TermFreq:{1} - Next:{2}'.format(
            globalDocument[postingData[1]],
            postingData[2],
            postingData[3],
        ))
        postingIdx = postingData[3]

    print()

this
  DocName:Document2 - TermFreq:1 - Next:0
  DocName:Document1 - TermFreq:1 - Next:-1

is
  DocName:Document2 - TermFreq:1 - Next:1
  DocName:Document1 - TermFreq:1 - Next:-1

a
  DocName:Document1 - TermFreq:1 - Next:-1

sample
  DocName:Document2 - TermFreq:1 - Next:3
  DocName:Document1 - TermFreq:1 - Next:-1

another
  DocName:Document2 - TermFreq:1 - Next:-1



In [49]:
# Posting that the lexicon entry for 'sample' points at (the head of its chain).
globalPosting[globalLexicon['sample']]


(3, 1, 1, 3)

In [50]:
globalPosting[globalPosting[globalLexicon['sample']][3]]   # follow the head's next pointer once; repeat until the next pointer is -1

(3, 0, 1, -1)

## TF-IDF

In [51]:
# Corpus for the TF demo: 'a' occurs ten times in Document1 and once in
# Document2, so the TF variants produce visibly different weights.
collection = [
    ('Document1', 'This is a a a a a a a a a a sample'),
    ('Document2', 'This is a sample'),
]

### TF

In [52]:
from math import log10

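# Define five ways to compute term frequency (TF): binary, raw, basic (share of all tokens), log-scaled, and double normalization K.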
def binaryTF(freq):
    """Binary TF: 1 when the term occurs at all (freq > 0), otherwise 0."""
    return 1 if freq > 0 else 0

def rawTF(freq):
    """Raw TF: the occurrence count itself, returned unchanged."""
    return freq

def basicTF(freq, totalFreq):
    """Basic TF: the count `freq` divided by `totalFreq`.

    Raises ZeroDivisionError when `totalFreq` is 0.
    """
    share = freq / totalFreq
    return share

def logTF(freq):
    """Log-scaled TF: 1 + log10(freq) for a positive count, otherwise 0."""
    # Guard first: log10 is only evaluated for counts that satisfy freq > 0.
    if not freq > 0:
        return 0
    return 1 + log10(freq)

def doubleNormalTF(K, freq, maxFreq):
    """Double-normalization-K TF: K + (1 - K) * (freq / maxFreq).

    K is the constant added to every result, `freq` the term's count and
    `maxFreq` the count it is scaled by. Raises ZeroDivisionError when
    `maxFreq` is 0.
    """
    relative = freq / maxFreq
    return K + ((1 - K) * relative)

In [53]:
# Print every TF variant for every term of every document.
for (docName, docContent) in collection:
    # Term frequencies of the current document: {term: frequency, ...}
    localPosting = dict()
    for term in docContent.lower().split():
        localPosting[term] = localPosting.get(term, 0) + 1

    # Values the normalized variants divide by.
    maxFreq = max(localPosting.values())
    totalCount = sum(localPosting.values())

    print('-----------------------------------')

    for term, freq in localPosting.items():
        # One line per item; sep='\n' puts each on its own line.
        print(
            term,
            '1. Binary:{0}'.format(binaryTF(freq)),
            '2. Raw:{0}'.format(rawTF(freq)),
            '3. Basic:{0}'.format(basicTF(freq, totalCount)),
            '4. Log:{0}'.format(logTF(freq)),
            '5. DoubleNormalization:{0}'.format(doubleNormalTF(0, freq, maxFreq)),
            '6. DoubleNormalization:{0}'.format(doubleNormalTF(0.5, freq, maxFreq)),
            sep='\n',
        )
        print()

    print(localPosting)
    print()

-----------------------------------
this
1. Binary:1
2. Raw:1
3. Basic:0.07692307692307693
4. Log:1.0
5. DoubleNormalization:0.1
6. DoubleNormalization:0.55

is
1. Binary:1
2. Raw:1
3. Basic:0.07692307692307693
4. Log:1.0
5. DoubleNormalization:0.1
6. DoubleNormalization:0.55

a
1. Binary:1
2. Raw:10
3. Basic:0.7692307692307693
4. Log:2.0
5. DoubleNormalization:1.0
6. DoubleNormalization:1.0

sample
1. Binary:1
2. Raw:1
3. Basic:0.07692307692307693
4. Log:1.0
5. DoubleNormalization:0.1
6. DoubleNormalization:0.55

{'this': 1, 'is': 1, 'a': 10, 'sample': 1}

-----------------------------------
this
1. Binary:1
2. Raw:1
3. Basic:0.25
4. Log:1.0
5. DoubleNormalization:1.0
6. DoubleNormalization:1.0

is
1. Binary:1
2. Raw:1
3. Basic:0.25
4. Log:1.0
5. DoubleNormalization:1.0
6. DoubleNormalization:1.0

a
1. Binary:1
2. Raw:1
3. Basic:0.25
4. Log:1.0
5. DoubleNormalization:1.0
6. DoubleNormalization:1.0

sample
1. Binary:1
2. Raw:1
3. Basic:0.25
4. Log:1.0
5. DoubleNormalization:1.0
6. Doub

### IDF

In [54]:
# Corpus for the IDF demo. Document1 contains no 'is'; the doubled space before
# 'sample' would be collapsed by a bare str.split().
# NOTE(review): no cell visible here rebuilds globalLexicon / globalPosting from
# this corpus - confirm whether the IDF cell below is meant to use it.
collection = [
    ('Document1', 'This a a a a a a a a a a  sample'),
    ('Document2', 'This is a sample'),
]

In [55]:
# IDF(Inverse Document Frequency)를 수행하기 위한 네 가지 방법 정의
def unaryIDF():
    """Unary IDF: always returns the constant 1."""
    return 1

def basicIDF(N, df):
    """Basic IDF: log10(N / df).

    Raises ZeroDivisionError when `df` is 0.
    """
    ratio = N / df
    return log10(ratio)

def smoothigIDF(N, df):
    """Smoothed IDF as defined in this notebook: log10((N + 1) / df).

    Raises ZeroDivisionError when `df` is 0.
    """
    ratio = (N + 1) / df
    return log10(ratio)

def probabilityIDF(N, df):
    """Probabilistic IDF as defined in this notebook: log10((N - df + 1) / df).

    Raises ZeroDivisionError when `df` is 0.
    """
    ratio = (N - df + 1) / df
    return log10(ratio)

In [56]:
# Number of documents in the index that df is counted from. `collection` has
# been reassigned since globalPosting was built, so len(collection) would
# describe a different corpus than the postings do; globalDocument always
# matches the index.
N = len(globalDocument)

for term, ptr in globalLexicon.items():
    # term: the index term, ptr: index of the head of its posting chain

    # Document frequency = number of postings in the term's chain
    # (the indexing loop adds one posting per document containing the term).
    df = 0
    while ptr != -1:  # a next pointer of -1 ends the chain
        df += 1
        postingData = globalPosting[ptr]
        ptr = postingData[3]

    print(term)
    print('1. UnaryIDF: {0}'.format(unaryIDF()))
    print('2. BasicIDF: {0}'.format(basicIDF(N,df)))
    print('3. SmoothigIDF: {0}'.format(smoothigIDF(N,df)))
    print('4. ProbabilityIDF: {0}'.format(probabilityIDF(N,df)))
    print()

this
1. UnaryIDF: 1
2. BasicIDF: 0.0
3. SmoothigIDF: 0.17609125905568124
4. ProbabilityIDF: -0.3010299956639812

is
1. UnaryIDF: 1
2. BasicIDF: 0.0
3. SmoothigIDF: 0.17609125905568124
4. ProbabilityIDF: -0.3010299956639812

a
1. UnaryIDF: 1
2. BasicIDF: 0.3010299956639812
3. SmoothigIDF: 0.47712125471966244
4. ProbabilityIDF: 0.3010299956639812

sample
1. UnaryIDF: 1
2. BasicIDF: 0.0
3. SmoothigIDF: 0.17609125905568124
4. ProbabilityIDF: -0.3010299956639812

another
1. UnaryIDF: 1
2. BasicIDF: 0.3010299956639812
3. SmoothigIDF: 0.47712125471966244
4. ProbabilityIDF: 0.3010299956639812

