In [136]:
import pandas as pd
import numpy as np
import statistics
import time

import nltk
nltk.download("punkt")
nltk.download("punkt_tab")
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ammar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Ammar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [137]:
# Load the raw corpus and keep an independent two-column working frame
# (document id + article body) for the rest of the notebook.
news = pd.read_csv("news.csv")
news_content = news[["id", "content"]].copy()

In [138]:
# drop_duplicates() returns a new frame rather than modifying news_content,
# so its result has to be kept. Articles with identical content are reduced
# to their first occurrence, then rows with a missing id or content are
# removed. news_content itself is left untouched.
news_content_clean = (
    news_content
    .drop_duplicates(subset="content", keep="first")
    .dropna()
)
news_content.columns

Index(['id', 'content'], dtype='object')

In [139]:
# Corpus statistics over the cleaned articles: whitespace-token counts,
# sentence counts (NLTK punkt) and the set of document ids.
count = 0
contents = news_content_clean['content']

# One pass over the articles: collect every token and the per-document size.
terms_list = []
num_terms_per_document = []
for article in contents:
    tokens = article.split()
    terms_list.extend(tokens)
    num_terms_per_document.append(len(tokens))

# Sentence segmentation is independent of the token pass above.
num_sentence_per_document = [len(sent_tokenize(article)) for article in contents]

# Universe of document ids; the NOT query subtracts postings from this set.
document_id = set(news_content_clean["id"])

# Most frequent raw token and how often it occurs.
mode_val = statistics.mode(terms_list)
mode_count = terms_list.count(mode_val)

# ---- report ----
print(f"Number of Documents: {len(num_terms_per_document)}\n")
print(f"Total terms: {len(terms_list)}")
print(f"Total unique terms: {len(np.unique(terms_list))}")
print(f"Mode term : \"{mode_val}\", amount = {mode_count}")
print(f"Mean number of terms per documents: {np.mean(num_terms_per_document)}")
print(f"Median number of terms per documents: {np.median(num_terms_per_document)}")
print(f"Max number of terms per documents: {max(num_terms_per_document)}")
print(f"Min number of terms per documents: {min(num_terms_per_document)} \n")

print(f"Total sentence: {sum(num_sentence_per_document)}")
print(f"Mean number sentence per document: {np.mean(num_sentence_per_document)}")
print(f"Median number sentence per document: {np.median(num_sentence_per_document)}")
print(f"Max number sentence per document: {max(num_sentence_per_document)}")
print(f"Min number sentence per document: {min(num_sentence_per_document)}")

Number of Documents: 14334

Total terms: 5524080
Total unique terms: 227401
Mode term : "yang", amount = 130056
Mean number of terms per documents: 385.3830054416074
Median number of terms per documents: 312.0
Max number of terms per documents: 4307
Min number of terms per documents: 33 

Total sentence: 352491
Mean number sentence per document: 24.591251569694432
Median number sentence per document: 18.0
Max number sentence per document: 565
Min number sentence per document: 2


In [140]:
# Vocabulary in first-occurrence order. dict.fromkeys de-duplicates while
# keeping insertion order, so the list is identical on every kernel restart;
# list(set(...)) of strings is ordered by randomised hashes and changes
# between sessions.
unique_term = list(dict.fromkeys(terms_list))
# Inspect with len(unique_term) or unique_term[:1000] when needed.

In [141]:
# Inverted index: raw whitespace token -> set of ids of the documents that
# contain it. Kept as a plain dict so a lookup of an unknown term never
# inserts a key.
inverted_index = {}
for doc_id, content in zip(news_content_clean["id"], news_content_clean["content"]):
    # set() so each distinct token of the document is visited once.
    for term in set(content.split()):
        inverted_index.setdefault(term, set()).add(doc_id)

In [142]:
# AND
def and_query(query_1, query_2):
    """Return the documents that contain both terms (Boolean AND).

    Postings are read from the module-level ``inverted_index``. A term that
    is not in the index matches no document, so it contributes an empty
    posting set instead of raising ``KeyError``.

    Args:
        query_1: First term; looked up verbatim as a dictionary key.
        query_2: Second term; looked up verbatim as a dictionary key.

    Returns:
        A tuple ``(result, elapsed_time, query_1, query_2)``. ``result`` is a
        new set of document ids; ``elapsed_time`` is the time in seconds
        spent on the two lookups and the intersection, measured with
        ``time.perf_counter``.
    """
    start_time = time.perf_counter()
    # .get never inserts into the index, unlike a defaultdict lookup would.
    result_1 = inverted_index.get(query_1, set())
    result_2 = inverted_index.get(query_2, set())
    result = result_1 & result_2
    elapsed_time = time.perf_counter() - start_time
    return result, elapsed_time, query_1, query_2

# OR
def or_query(query_1, query_2):
    """Return the documents that contain at least one of the terms (Boolean OR).

    Postings are read from the module-level ``inverted_index``. A term that
    is not in the index contributes an empty posting set instead of raising
    ``KeyError``, so the result is then just the other term's postings.

    Args:
        query_1: First term; looked up verbatim as a dictionary key.
        query_2: Second term; looked up verbatim as a dictionary key.

    Returns:
        A tuple ``(result, elapsed_time, query_1, query_2)``. ``result`` is a
        new set of document ids; ``elapsed_time`` is the time in seconds
        spent on the two lookups and the union, measured with
        ``time.perf_counter``.
    """
    start_time = time.perf_counter()
    # .get never inserts into the index, unlike a defaultdict lookup would.
    result_1 = inverted_index.get(query_1, set())
    result_2 = inverted_index.get(query_2, set())
    result = result_1 | result_2
    elapsed_time = time.perf_counter() - start_time
    return result, elapsed_time, query_1, query_2

# NOT
def not_query(query_1):
    """Return the documents that do not contain the term (Boolean NOT).

    The complement is taken against the module-level ``document_id`` set
    using postings from the module-level ``inverted_index``. A term that is
    not in the index excludes nothing, so every document id is returned
    instead of raising ``KeyError``.

    Args:
        query_1: Term to negate; looked up verbatim as a dictionary key.

    Returns:
        A tuple ``(result, elapsed_time, query_1)``. ``result`` is a new set
        of document ids; ``elapsed_time`` is the time in seconds spent on
        the lookup and the set difference, measured with
        ``time.perf_counter``.
    """
    start_time = time.perf_counter()
    # .get never inserts into the index, unlike a defaultdict lookup would.
    postings = inverted_index.get(query_1, set())
    result = document_id - postings
    elapsed_time = time.perf_counter() - start_time
    return result, elapsed_time, query_1

In [143]:
# AND query demo: run each term pair through and_query and report the hit
# count and lookup time.
print("======> AND <======")

for first_term, second_term in (("warga", "nasi"), ("pemerintah", "korupsi")):
    result, elapsed_time, q_1, q_2 = and_query(first_term, second_term)
    print(f"Query: {q_1, q_2}")
    print(f"Dokumen ditemukan: {len(result)}")
    print(f"Waktu Pencarian: {format(elapsed_time, '.7f')} second \n")
    # print(result) here to list the matching document ids.


Query: ('warga', 'nasi')
Dokumen ditemukan: 17
Waktu Pencarian: 0.0000166 second 

Query: ('pemerintah', 'korupsi')
Dokumen ditemukan: 77
Waktu Pencarian: 0.0000422 second 



In [144]:
# OR query demo: run each term pair through or_query and report the hit
# count and lookup time.
print("======> OR <======")
for first_term, second_term in (("warga", "nasi"), ("pemerintah", "korupsi")):
    result, elapsed_time, q_1, q_2 = or_query(first_term, second_term)
    print(f"Query: {q_1, q_2}")
    print(f"Dokumen ditemukan: {len(result)}")
    print(f"Waktu Pencarian: {format(elapsed_time, '.7f')} second \n")
    # print(result) here to list the matching document ids.

Query: ('warga', 'nasi')
Dokumen ditemukan: 2992
Waktu Pencarian: 0.0001577 second 

Query: ('pemerintah', 'korupsi')
Dokumen ditemukan: 3674
Waktu Pencarian: 0.0001829 second 



In [147]:
# NOT query demo: run each term through not_query and report how many
# documents lack it, plus the lookup time.
print("======> NOT <======")
for negated_term in ("warga", "pemerintah"):
    result, elapsed_time, q_1 = not_query(negated_term)
    print(f"Query: {q_1}")
    print(f"Dokumen ditemukan: {len(result)}")
    print(f"Waktu Pencarian: {format(elapsed_time, '.7f')} second \n")
    # print(result) here to list the matching document ids.

Query: warga
Dokumen ditemukan: 11411
Waktu Pencarian: 0.0004186 second 

Query: pemerintah
Dokumen ditemukan: 10874
Waktu Pencarian: 0.0004741 second 

