# **INSTALL LIBRARY**

In [None]:
pip install Sastrawi



# **IMPORT LIBRARY**

In [None]:
import pandas as pd
import re 
import math 
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from google.colab import files, drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# **PREPROCESSING**



**Case Folding**

In [None]:
def caseFolding(text):
  """Lowercase every string in a collection of documents.

  Args:
    text: iterable of strings, one entry per document.

  Returns:
    A new list holding the lowercased strings in input order.
  """
  lowered = []
  for sentence in text:
    lowered.append(sentence.lower())
  return lowered

**Tokenisasi (Tokenization)**

In [None]:
def tokenisasi(text):
  """Clean each document with a regex and split it into tokens.

  Every match of the pattern (a run of digits, a single character that is
  not an ASCII letter, digit, space or tab, or a `word://...` run) is
  replaced by one space. The cleaned string is then split on the literal
  space character, so consecutive spaces produce empty-string tokens and
  tabs are not treated as separators.

  Args:
    text: iterable of strings, one entry per document.

  Returns:
    A list of token lists, one inner list per document.
  """
  noise = re.compile(r'([0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)')
  tokenized = []
  for sentence in text:
    cleaned = noise.sub(' ', sentence)
    tokenized.append(cleaned.split(' '))
  return tokenized

**Filtering**

In [None]:
def filtering(stopwords,text):
  """Remove stopwords from tokenized documents and re-join the tokens.

  Args:
    stopwords: path to a text file containing one stopword per line;
      surrounding whitespace on each line is stripped.
    text: list of token lists, one inner list per document.

  Returns:
    A list of strings, one per document, where the tokens that are not
    stopwords are joined with a single space. Empty-string tokens are kept
    unless the stopword file itself contains a blank line.
  """
  with open(stopwords) as f:
    # A set makes each membership test O(1) instead of scanning the whole
    # stopword list for every token of every document.
    sw = {line.strip() for line in f}

  fltr = []
  for tokens in text:
    kept = [token for token in tokens if token not in sw]
    fltr.append(" ".join(kept))
  return fltr

**Stemming**

In [None]:
def stemmingProcess(fltr):
  """Stem every document string with a Sastrawi stemmer.

  Args:
    fltr: list of document strings (tokens already joined by spaces).

  Returns:
    A list with the result of `stemmer.stem(...)` for each document, in
    input order.
  """
  # One stemmer instance is created per call and reused for all documents.
  stemmer = StemmerFactory().create_stemmer()
  return [stemmer.stem(sentence) for sentence in fltr]

**Term Unik (Extract Unique Term)**

In [None]:
def uniqueTerm(text,hasil=None):
  """Collect the distinct whitespace-separated terms of `text` in first-seen order.

  Args:
    text: iterable of document strings.
    hasil: optional list of terms to extend. When given, it is extended
      in place and returned, so a caller can accumulate one vocabulary over
      several calls. When omitted, a fresh list is created for each call.

  Returns:
    The list of unique terms (the `hasil` object itself when one was passed).
  """
  # A `[]` default would be created once and shared by every call that
  # omits `hasil`, leaking terms from earlier calls (and earlier notebook
  # cell runs) into later results.
  if hasil is None:
    hasil = []

  # Mirror of `hasil` for O(1) duplicate checks.
  seen = set(hasil)
  for sentence in text:
    for term in sentence.split():
      if term not in seen:
        seen.add(term)
        hasil.append(term)
  return hasil

**Term Frequency**

In [None]:
def termFrequency(text,feature):
  """Count how often each feature term occurs in each document.

  Args:
    text: list of document strings; each is split on whitespace.
    feature: list of terms to count.

  Returns:
    A list with one row per document; row `d` holds, for every term in
    `feature` (same order), the number of occurrences in document `d`.
  """
  tokenized = [doc.split() for doc in text]
  return [[tokens.count(term) for term in feature] for tokens in tokenized]

**Document Frequency**

In [None]:
def doctFrequency(text,feature):
  """Count, for each feature term, the number of documents containing it.

  Args:
    text: list of document strings; each is split on whitespace.
    feature: list of terms to look up.

  Returns:
    A list of integers aligned with `feature`: entry `k` is the number of
    documents whose token list contains `feature[k]` at least once.
  """
  tokenized = [doc.split() for doc in text]
  return [
    sum(1 for tokens in tokenized if term in tokens)
    for term in feature
  ]

# **BM25F Model**

***idf(t)***

In [None]:
def idfProcess(dfResult,nDoc = 0):
  """Compute the base-10 inverse document frequency for each term.

  For a document frequency `df` the value is
  `log10((nDoc - df + 0.5) / (df + 0.5))`.

  Args:
    dfResult: iterable of document frequencies, one per term.
    nDoc: total number of documents in the collection.

  Returns:
    A list of idf values aligned with `dfResult`. The value is negative
    when a term occurs in more than half of the documents.

  Raises:
    ValueError: if the ratio is not positive (e.g. `df` exceeds `nDoc`,
      which always happens for a positive `df` with the default `nDoc`).
  """
  return [
    math.log((nDoc - doc_freq + 0.5) / (doc_freq + 0.5), 10)
    for doc_freq in dfResult
  ]

**lc & avlc**

In [None]:
def lcProcess(doc):
  """Compute each document's length in tokens and the average length.

  Args:
    doc: iterable of document strings; each is split on whitespace.

  Returns:
    A tuple `(each_lc, avg_lc)` where `each_lc` is the list of token counts
    per document and `avg_lc` is their arithmetic mean. For an empty
    collection the result is `([], 0.0)`.
  """
  each_lc = [len(sentence.split()) for sentence in doc]
  if not each_lc:
    # No documents: avoid dividing by zero and report an average of 0.
    return each_lc, 0.0
  avg_lc = sum(each_lc) / len(each_lc)
  return each_lc, avg_lc

**𝑤𝑒𝑖𝑔ℎ𝑡(𝑡,𝑑)**

In [None]:
def weight_td(titleQuery,contenctQuery,boost_title,boost_content,bc,total_lc_title, avg_lc_title,total_lc_content, avg_lc_content):
  """Combine per-field term frequencies into one weight per (document, term).

  For each document `d` and query term `t` the weight is the sum over the
  two fields (title, content) of
  `tf * boost / ((1 - bc) + bc * (field_length[d] / avg_field_length))`.

  Args:
    titleQuery: rows of query-term frequencies in the title, one row per
      document. Its shape drives the iteration.
    contenctQuery: same layout as `titleQuery`, for the content field.
    boost_title: multiplier applied to title frequencies.
    boost_content: multiplier applied to content frequencies.
    bc: length-normalisation factor shared by both fields.
    total_lc_title: per-document title lengths.
    avg_lc_title: average title length.
    total_lc_content: per-document content lengths.
    avg_lc_content: average content length.

  Returns:
    A list of rows with the same shape as `titleQuery` holding the
    combined weights.
  """
  def length_norm(field_len, avg_len):
    # Denominator of the per-field weight.
    return (1 - bc) + bc * (field_len / avg_len)

  weighted_result = []
  for doc_idx, title_counts in enumerate(titleQuery):
    row = []
    for term_idx, title_count in enumerate(title_counts):
      title_part = (title_count * boost_title) / length_norm(total_lc_title[doc_idx], avg_lc_title)
      content_part = (contenctQuery[doc_idx][term_idx] * boost_content) / length_norm(total_lc_content[doc_idx], avg_lc_content)
      row.append(title_part + content_part)
    weighted_result.append(row)
  return weighted_result

**Ranking 𝑹(𝒒,𝒅)**

In [None]:
#BM25F
def rangking(idf,k1,weighted):
  """Score every document against the query.

  A document's score is the sum over query terms of
  `idf[t] * (w / (k1 + w))`, where `w` is that document's weight for
  term `t`.

  Args:
    idf: list of idf values, indexed by query-term position.
    k1: saturation parameter added to the weight in the denominator.
    weighted: rows of per-term weights, one row per document.

  Returns:
    A list of `[score, label]` pairs in input order, where `label` is
    `'D1'`, `'D2'`, ... following the document's position.
  """
  rank = []
  for doc_number, doc_weights in enumerate(weighted, start=1):
    # Accumulate term by term, left to right, starting from 0.
    score = 0
    for term_idx, weight in enumerate(doc_weights):
      score += idf[term_idx] * (weight / (k1 + weight))
    rank.append([score, f'D{doc_number}'])
  return rank

# **Main Code**

In [None]:
# End-to-end BM25F run: load the documents, preprocess the title and content
# fields and the query, then score and rank every document against the query.

#1. Read dataset
# NOTE: the dataset path, stopword-file path and query text are '...'
# placeholders here and must be filled in before running.
df = pd.read_excel('...')
stopwords = '...'
query = '...'

#2. Input parameters
# Per-field multipliers applied to the term frequencies in weight_td().
boost_title = 5
boost_content = 2
# Length-normalisation factor used in weight_td().
bc = 0.75
# Saturation parameter used in rangking().
k1 = 1.2

#3. Split the dataset into the title ('Judul') and content ('Isi') columns
title = df['Judul'].values.tolist()
content = df['Isi'].values.tolist()

#4.1 Preprocess both title & content
#4.1.1 Case folding
title_cf = caseFolding(title)
content_cf = caseFolding(content)
#4.1.2 Tokenization
title_token = tokenisasi(title_cf)
content_token = tokenisasi(content_cf)
#4.1.3 Stopword filtering
title_filter = filtering(stopwords,title_token)
content_filter = filtering(stopwords,content_token)
#4.1.4 Stemming
title_stemming = stemmingProcess(title_filter)
content_stemming = stemmingProcess(content_filter)
#4.1.5 Unique terms
# Unique terms found in the titles.
title_uniqueTerm = uniqueTerm(title_stemming)
# Unique terms of title + content. uniqueTerm() appends to the list it is
# given and returns it, so after this call title_uniqueTerm and
# content_uniqueTerm refer to the same list.
content_uniqueTerm = uniqueTerm(content_stemming,title_uniqueTerm)
#4.1.6 Term frequency over the whole vocabulary
# NOTE(review): title_tf and content_tf are not used again in this cell.
title_tf = termFrequency(title_stemming,content_uniqueTerm)
content_tf = termFrequency(content_stemming,content_uniqueTerm)

#4.2 Preprocess the query (wrapped in a one-element list so the same helpers apply)
#4.2.1 Case folding
cfQuery = caseFolding([query])
#4.2.2 Tokenization
tQuery = tokenisasi(cfQuery)
#4.2.3 Stopword filtering
fQuery = filtering(stopwords,tQuery)
#4.2.4 Stemming
sQuery = stemmingProcess(fQuery)
#4.2.5 Unique terms (an explicit empty list keeps the query vocabulary separate)
uniqueQuery = uniqueTerm(sQuery,[])
#4.2.6 Frequency of each query term in every title and every content
titleQuery = termFrequency(title_stemming,uniqueQuery)
contenctQuery = termFrequency(content_stemming,uniqueQuery)

#5. BM25F model
#5.1 Document frequency (df) of each query term
# NOTE(review): df is counted on the content field only, so a term that
# occurs only in titles gets df = 0 -- confirm this is intended.
dfContent = doctFrequency(content_stemming,uniqueQuery)

#5.2 Inverse document frequency (idf), with the number of documents as nDoc
idf = idfProcess(dfContent,len(content))

#5.3 Per-document length (lc) and average length (avlc) for each field
total_lc_title, avg_lc_title = lcProcess(title_stemming)
total_lc_content, avg_lc_content = lcProcess(content_stemming)

#5.4 Combined field weight(t,d) for every document and query term
weighted = weight_td(titleQuery,contenctQuery,boost_title,boost_content,bc,total_lc_title, avg_lc_title,total_lc_content, avg_lc_content)

#5.5 Score R(q,D) for every document; printed in dataset order
rank = rangking(idf,k1,weighted)
print(rank)

#5.6 Sort in place by score, highest first, and print the final ranking
rank.sort(key=lambda x:x[0], reverse=True)
print(rank)

[[0.9922150350082405, 'D1'], [0.22026048122479064, 'D2'], [0.09275772989523397, 'D3'], [0.1280222143923306, 'D4'], [0.0, 'D5']]
[[0.9922150350082405, 'D1'], [0.22026048122479064, 'D2'], [0.1280222143923306, 'D4'], [0.09275772989523397, 'D3'], [0.0, 'D5']]
