In [16]:
!pip install openai

In [None]:
# pickle ships with the Python standard library, so there is nothing to install;
# `pip install pickle` only produces an error because no such package is published.

In [None]:
!pip install tiktoken

In [1]:
import os
import pickle

import numpy as np
import openai
import pandas as pd
import tiktoken

# Model name sent to the completions endpoint through COMPLETIONS_API_PARAMS["model"].
# NOTE(review): text-davinci-003 appears to have been retired by OpenAI — confirm it
# is still served before re-running, and switch to a supported completions model if not.
COMPLETIONS_MODEL = "text-davinci-003"
# Default model used by get_embedding().
EMBEDDING_MODEL = "text-embedding-ada-002"

In [2]:
# Take the key from the OPENAI_API_KEY environment variable instead of typing it
# into the notebook, where it would be saved and shared together with the file.
# Falls back to the empty string (the previous hard-coded value) when it is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [33]:
# Load the headline dataset. Later cells read its kategori, tanggal_publikasi,
# judul_berita and tokens columns.
df = pd.read_csv(filepath_or_buffer='dataset_dg_token_terbaru.csv')

In [34]:
# Bare expression: the frame is rendered as this cell's output.
df

Unnamed: 0,kategori,tanggal_publikasi,judul_berita,tokens
0,Kesra,2018-12-31,574 Pasangan Ikuti Itsbat dan Nikah Massal,23
1,Jakarta Hari Ini,2018-12-31,Sudinsos Jakut Kerahkan 68 P3S di Pusat Kerama...,36
2,Pemerintahan,2018-12-31,Anies Lantik Tiga Pejabat Pimpinan Tinggi Pratama,32
3,Jakarta Hari Ini,2018-12-31,Kelurahan Angke Terjunkan 60 Petugas PPSU di R...,39
4,Kesra,2018-12-31,Warga Marunda Himpun Bantuan Bagi Korban Tsunami,27
5,Jakarta Hari Ini,2018-12-31,Sudinsos Jaktim Dirikan Tenda untuk Korban Keb...,37
6,Wisata & Kuliner,2018-12-31,Festival Kuliner Sambut Tahun Baru di Pulau La...,29
7,Lintas Kota,2018-12-31,Bupati Minta Aktivitas Menyelam Wisatawan Dida...,36
8,Olahraga,2018-12-31,Pemkot Jakut Apresiasi Atlet Berprestasi,29
9,Lintas Kota,2018-12-31,Sudah 60 Meter Kubik Material Vulkanik Dibersi...,28


In [36]:
# Key the rows by (kategori, tanggal_publikasi), the same pair load_embeddings()
# uses for its dictionary keys, so construct_prompt() can look rows up by key.
# Guarded so that re-running the cell is harmless: once the two columns have
# moved into the index, a second set_index() would raise KeyError.
if list(df.index.names) != ["kategori", "tanggal_publikasi"]:
    df = df.set_index(["kategori", "tanggal_publikasi"])
print(f"{len(df)} rows in the data.")

39 rows in the data.


In [37]:
# Keep only the first row seen for each index label; any later row that repeats a
# label is discarded, so every label left in the frame is unique.
df = df[~df.index.duplicated(keep='first')]

In [38]:
# Element-wise missing-value mask of the frame, rendered as this cell's output.
df.isna()

Unnamed: 0_level_0,Unnamed: 1_level_0,judul_berita,tokens
kategori,tanggal_publikasi,Unnamed: 2_level_1,Unnamed: 3_level_1
Kesra,2018-12-31,False,False
Jakarta Hari Ini,2018-12-31,False,False
Pemerintahan,2018-12-31,False,False
Wisata & Kuliner,2018-12-31,False,False
Lintas Kota,2018-12-31,False,False
Olahraga,2018-12-31,False,False
Transportasi,2018-12-30,False,False
Jakarta Hari Ini,2018-12-30,False,False
Kesra,2018-12-30,False,False
Lintas Kota,2018-12-30,False,False


In [39]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    """
    Request an embedding for `text` from the OpenAI Embeddings API.

    text:  the string to embed.
    model: name of the embedding model; defaults to EMBEDDING_MODEL.

    Returns the "embedding" field of the first entry in the response's "data" list.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

def compute_doc_embeddings(df: pd.DataFrame) -> dict[tuple[str, str], list[float]]:
    """
    Embed the `judul_berita` value of every row of `df` with get_embedding().

    Returns a dictionary keyed by each row's index label, whose value is the
    embedding obtained for that row. One get_embedding() call is made per row.
    """
    embeddings_by_label = {}
    for row_label, row in df.iterrows():
        embeddings_by_label[row_label] = get_embedding(row.judul_berita)
    return embeddings_by_label

In [40]:
def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
    """
    Read the document embeddings and their keys from a CSV.

    fname is the path to a CSV with these named columns:
        "kategori", "tanggal_publikasi", "0", "1", ... up to the last
        dimension of the embedding vectors.
    Columns whose name is not a plain non-negative integer (for example an
    "Unnamed: 0" column left behind by a saved index) are ignored.

    Returns a dictionary mapping (kategori, tanggal_publikasi) to that row's
    embedding, as a list ordered by dimension number. When the same key appears
    on several rows, the last of those rows wins.

    Raises ValueError when the file has no integer-named columns at all.
    """
    frame = pd.read_csv(fname, header=0)

    # Only integer-named columns are embedding dimensions; everything else is
    # either a key column or unrelated data and must not reach int().
    dims = [int(name) for name in frame.columns if str(name).isdigit()]
    if not dims:
        raise ValueError(f"{fname!r} has no embedding columns named '0', '1', ...")

    # Dimensions are expected to be contiguous from "0" to the highest one found;
    # selecting them by name raises KeyError if one is absent.
    vector_columns = [str(i) for i in range(max(dims) + 1)]

    # Convert all vectors in one go rather than doing one label lookup per
    # dimension per row.
    vectors = frame[vector_columns].to_numpy().tolist()
    keys = zip(frame["kategori"], frame["tanggal_publikasi"])
    return dict(zip(keys, vectors))

In [41]:
# Load the precomputed embeddings, keyed by (kategori, tanggal_publikasi).
document_embeddings = load_embeddings(fname="berita_tokens_embeddings6_newest.csv")

In [42]:
# An example embedding:
example_entry = list(document_embeddings.items())[0]
print(f"{example_entry[0]} : {example_entry[1][:6]}... ({len(example_entry[1])} entries)")

('Kesra', '2018-12-31') : [-0.0788770169019699, 0.0463710688054561, -0.0741989165544509, -0.0136198224499821, -0.0959782302379608, -0.0193649847060441]... (1536 entries)


In [43]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Dot product of the vectors `x` and `y`, used as their similarity score.

    For vectors of length 1 the dot product equals the cosine similarity; the
    original author's note states that OpenAI embeddings are normalized that way.
    """
    left = np.array(x)
    right = np.array(y)
    return left.dot(right)

def order_document_sections_by_query_similarity(query: str, contexts: dict[tuple[str, str], list[float]]) -> list[tuple[float, tuple[str, str]]]:
    """
    Embed `query` with get_embedding() and score it against every pre-calculated
    document embedding in `contexts` using vector_similarity().

    query:    the question text to embed.
    contexts: mapping from a document key to that document's embedding.

    Returns a list of (similarity, key) pairs sorted by similarity, highest first.
    Documents with exactly the same similarity keep the order in which `contexts`
    yields them.
    """
    query_embedding = get_embedding(query)

    scored_sections = [
        (vector_similarity(query_embedding, doc_embedding), doc_index)
        for doc_index, doc_embedding in contexts.items()
    ]

    # Rank on the score alone. Sorting the bare tuples would fall back to comparing
    # the keys whenever two scores are equal, which raises TypeError for keys that
    # cannot be ordered against each other (e.g. a missing category read as NaN
    # next to a string one).
    scored_sections.sort(key=lambda pair: pair[0], reverse=True)

    return scored_sections

In [44]:
# Five best-scoring (similarity, key) pairs for a sample question.
order_document_sections_by_query_similarity(query="ada apa di jakarta?", contexts=document_embeddings)[:5]

[(0.141711241529013, ('Kesra', '2018-12-31')),
 (0.11178209143510205, ('Kesra', '2018-12-30')),
 (0.08045336830871344, ('Transportasi', '2018-12-30')),
 (0.07330836887367856, ('Seni & Budaya', '2018-12-29')),
 (0.07015400515558455, ('Kesehatan', '2018-12-29'))]

In [45]:
# Token budget for the context sections that construct_prompt() puts into a prompt.
MAX_SECTION_LEN = 500
# Text placed in front of every context section inside the prompt.
SEPARATOR = "\n* "
# NOTE(review): the prompt is sent to COMPLETIONS_MODEL rather than the embedding
# model — confirm this encoding matches how the `tokens` column was counted.
ENCODING = "cl100k_base"  # encoding for text-embedding-ada-002

# Measure the separator once so construct_prompt() can add its token cost to the
# running total for every section it considers.
encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))

# Bare expression: shown as this cell's output.
f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

In [46]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Build the completion prompt for `question`.

    The document keys in `context_embeddings` are ranked against the question with
    order_document_sections_by_query_similarity(). Walking that ranking from the
    best match down, the `judul_berita` text of the matching `df` row is added as
    context until the running total of `tokens` plus separator tokens would exceed
    MAX_SECTION_LEN; the walk stops at the first section that does not fit.

    Prints how many sections were selected, followed by their keys, and returns
    the header, the selected sections, and the question in " Q: ... A:" form.
    """
    ranked_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    context_lines = []
    picked_keys = []
    tokens_used = 0

    for _score, key in ranked_sections:
        row = df.loc[key]

        # Charge the section (and its separator) to the budget first; the section
        # that pushes the total over the limit is dropped and ends the selection.
        tokens_used += row.tokens + separator_len
        if tokens_used > MAX_SECTION_LEN:
            break

        # Newlines inside a headline are flattened so each section stays on one line.
        headline = row.judul_berita.replace("\n", " ")
        context_lines.append(SEPARATOR + headline)
        picked_keys.append(str(key))

    # Useful diagnostic information
    print(f"Selected {len(context_lines)} document sections:")
    print("\n".join(picked_keys))

    header = """Jawab pertanyaan sejujur mungkin menggunakan konteks yang disediakan, dan jika jawabannya tidak terdapat dalam teks di bawah ini, katakan "Saya tidak tahu."\n\nContext:\n"""

    return "".join([header, *context_lines, "\n\n Q: ", question, "\n A:"])

In [47]:
# Build the prompt for a sample question and print it under a "===" marker.
prompt = construct_prompt(
    question="ada apa di jakarta",
    context_embeddings=document_embeddings,
    df=df,
)

print("===\n", prompt)

Selected 14 document sections:
('Kesra', '2018-12-31')
('Kesra', '2018-12-30')
('Transportasi', '2018-12-30')
('Seni & Budaya', '2018-12-29')
('Kesehatan', '2018-12-29')
('Lintas Kota', '2018-12-30')
('Lintas Kota', '2018-12-28')
('Ekonomi', '2018-12-30')
('Wisata & Kuliner', '2018-12-31')
('Lintas Kota', '2018-12-31')
('Jakarta Hari Ini', '2018-12-28')
('Olahraga', '2018-12-31')
('Jakarta Hari Ini', '2018-12-29')
('Jakarta Hari Ini', '2018-12-31')
===
 Jawab pertanyaan sejujur mungkin menggunakan konteks yang disediakan, dan jika jawabannya tidak terdapat dalam teks di bawah ini, katakan "Saya tidak tahu."

Context:

* 574 Pasangan Ikuti Itsbat dan Nikah Massal
* Pemprov DKI Adakan Berbagai Kegiatan di Malam Tahun Baru
* Ini Rekayasa Lalin Saat Malam Pergantian Tahun 
*  Bamus Betawi Jakut Gelar Pagelaran Seni Budaya 
* Ada Waroeng Sehat di Puskesmas Kecamatan Johar Baru
* Karang Taruna Pulau Pari dan Tidung Galang Donasi Bagi Korban Tsunami
* Bupati Sambut Baik Peresmian Dermaga Sand

In [48]:
# Keyword arguments that answer_query_with_context() forwards unchanged to
# openai.Completion.create().
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    # Upper limit on the length of the generated answer, in tokens.
    "max_tokens": 300,
    # Name of the completions model to query.
    "model": COMPLETIONS_MODEL,
}

In [49]:
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[tuple[str, str], list[float]],
    show_prompt: bool = False
) -> str:
    """
    Answer `query` with the completions model, using rows of `df` as context.

    query:               the question to answer.
    df:                  frame holding the context text, looked up by document key.
    document_embeddings: mapping from document key to its embedding, used to pick
                         the most relevant rows.
    show_prompt:         when True, print the full prompt before sending it.

    Returns the text of the first completion choice with leading and trailing
    spaces and newlines removed.
    """
    prompt = construct_prompt(query, document_embeddings, df)

    if show_prompt:
        print(prompt)

    response = openai.Completion.create(prompt=prompt, **COMPLETIONS_API_PARAMS)

    # Only spaces and newlines are trimmed; other whitespace is left as returned.
    return response["choices"][0]["text"].strip(" \n")

In [50]:
# Bare expression: the returned answer is shown as this cell's output.
answer_query_with_context(query="apa yang terjadi di jakarta?", df=df, document_embeddings=document_embeddings)

Selected 14 document sections:
('Kesra', '2018-12-31')
('Kesra', '2018-12-30')
('Seni & Budaya', '2018-12-29')
('Kesehatan', '2018-12-29')
('Lintas Kota', '2018-12-30')
('Transportasi', '2018-12-30')
('Lintas Kota', '2018-12-28')
('Wisata & Kuliner', '2018-12-31')
('Lintas Kota', '2018-12-31')
('Ekonomi', '2018-12-30')
('Jakarta Hari Ini', '2018-12-28')
('Olahraga', '2018-12-31')
('Jakarta Hari Ini', '2018-12-29')
('Jakarta Hari Ini', '2018-12-30')


'Ada 574 pasangan yang mengikuti itsbat dan nikah massal, Pemprov DKI mengadakan berbagai kegiatan di malam tahun baru, Bamus Betawi Jakut menggelar pagelaran seni budaya, ada waroeng sehat di Puskesmas Kecamatan Johar Baru, Karang Taruna Pulau Pari dan Tidung menggalang donasi bagi korban tsunami, ada rekayasa lalin saat malam pergantian tahun, Bupati menyambut baik peresmian dermaga sandar kapal KPP Jakarta, festival kuliner menyambut tahun baru di Pulau Lancang, Bupati minta aktivitas menyelam wisatawan didampingi pemandu, Dinas KPKP menggelar penjualan telur murah di 15 lokasi, 285 personel gabungan di Jakpus melakukan apel persiapan malam tahun baru, Pemkot Jakut mengapresiasi atlet berprestasi, dan hingga saat ini 28.905 lampu LED SS telah terpasang di Jaktim. BPRD juga membuka layanan'

In [52]:
# Ask where the LED lamps were installed and print the question with its answer.
query = "dimanakah lampu LED dipasang?"
answer = answer_query_with_context(query=query, df=df, document_embeddings=document_embeddings)

print(f"\nQ: {query}\nA: {answer}")

Selected 14 document sections:
('Kesra', '2018-12-31')
('Kesra', '2018-12-30')
('Transportasi', '2018-12-30')
('Seni & Budaya', '2018-12-29')
('Lintas Kota', '2018-12-30')
('Wisata & Kuliner', '2018-12-31')
('Kesehatan', '2018-12-29')
('Ekonomi', '2018-12-30')
('Lintas Kota', '2018-12-28')
('Lintas Kota', '2018-12-31')
('Jakarta Hari Ini', '2018-12-28')
('Jakarta Hari Ini', '2018-12-30')
('Jakarta Hari Ini', '2018-12-29')
('Pemerintahan', '2018-12-31')

Q: dimanakah lampu LED dipasang?
A: 28.905 Lampu LED SS telah terpasang di Jaktim.


In [54]:
# Ask how many people took part in the mass wedding and print the question with its answer.
query = "berapa orang yang mengikuti pernikahan masal ?"
answer = answer_query_with_context(query=query, df=df, document_embeddings=document_embeddings)

print(f"\nQ: {query}\nA: {answer}")

Selected 14 document sections:
('Kesra', '2018-12-31')
('Kesra', '2018-12-30')
('Transportasi', '2018-12-30')
('Seni & Budaya', '2018-12-29')
('Lintas Kota', '2018-12-30')
('Kesehatan', '2018-12-29')
('Lintas Kota', '2018-12-31')
('Lintas Kota', '2018-12-28')
('Ekonomi', '2018-12-30')
('Wisata & Kuliner', '2018-12-31')
('Pemerintahan', '2018-12-31')
('Jakarta Hari Ini', '2018-12-28')
('Olahraga', '2018-12-31')
('Jakarta Hari Ini', '2018-12-31')

Q: berapa orang yang mengikuti pernikahan masal ?
A: 574 pasangan.
