# EXAMPLE OF LLM FOR BIAS COMPARISON AND SUMMARY GENERATION
## TO BE CONVERTED TO API USING FLASK

In [None]:
import os
import sys
import pandas as pd
from dotenv import load_dotenv

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import (
    Settings,
    Document,
    VectorStoreIndex,
    SummaryIndex,
)

import tensorflow as tf
from tensorflow.keras import preprocessing
from keras.preprocessing.sequence import pad_sequences

sys.modules['keras.src.preprocessing'] = preprocessing

import pickle

from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.callbacks import (
    CallbackManager,
    CBEventType,
    EventPayload,
    LlamaDebugHandler,
)

import nest_asyncio

nest_asyncio.apply()

In [2]:
# Load the pre-embedded, clustered articles. Later cells read the 'title',
# 'content' and 'embedding' columns of this frame.
df = pd.read_parquet('../data/sameExample_embedded_clustered.parquet.gzip')

In [3]:

# Read credentials from the local env file into the process environment.
load_dotenv('var.env')
if not os.getenv('OPENAI_API_KEY'):
    # Fail here with an actionable message. Writing a missing (None) value
    # into os.environ would only raise "TypeError: str expected, not NoneType".
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in var.env or in the environment."
    )

# Global defaults used by every index / query engine built below.
Settings.llm = OpenAI(temperature=0.5, model="chatgpt-4o-latest")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

# Debug handler that records callback events and prints a trace when one ends;
# the sub-question dump near the end of the notebook reads from it.
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

Settings.callback_manager = callback_manager

# Stand-alone client with default settings. Not referenced by the other
# visible cells; kept so the `llm` name stays available.
llm = OpenAI(
    model="chatgpt-4o-latest"
)

In [4]:
# Bias Model
path = '../model/bias/'
with open(path + 'bias_tokenizer.pkl', 'rb') as f:
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
    bias_tokenizer = pickle.load(f)

bias_interpreter = tf.lite.Interpreter(model_path=path + 'bias_detection_lstm.tflite')
bias_interpreter.allocate_tensors()

bias_input_details = bias_interpreter.get_input_details()
bias_output_details = bias_interpreter.get_output_details()

# Generic names retained for any cell that still reads them. predict_bias uses
# the bias_* names instead, so cells that later reassign the generic names
# cannot change which tokenizer/model it runs.
tokenizer = bias_tokenizer
interpreter = bias_interpreter
input_details = bias_input_details
output_details = bias_output_details


def predict_bias(news_text, max_len=30):
    """Classify one article with the bias TFLite model.

    Args:
        news_text: List holding the article text to classify. A bare string
            is wrapped in a list so it is tokenized as one text rather than
            character by character.
        max_len: Length the token sequence is padded/truncated to. It must
            match the length used when the model was trained.

    Returns:
        1 if the first prediction is above 0.5, otherwise 0.
    """
    if isinstance(news_text, str):
        news_text = [news_text]

    # Tokenize and pad the incoming text.
    sequences = bias_tokenizer.texts_to_sequences(news_text)
    padded = pad_sequences(sequences, maxlen=max_len)

    # Cast the input data to float32 before handing it to the interpreter.
    padded = padded.astype('float32')

    # Feed the input tensor, run inference and read the output tensor.
    bias_interpreter.set_tensor(bias_input_details[0]['index'], padded)
    bias_interpreter.invoke()
    predictions = bias_interpreter.get_tensor(bias_output_details[0]['index'])

    # Threshold the first prediction into a 0/1 label.
    return 1 if predictions[0] > 0.5 else 0


In [5]:
# Predict bias: one model call per article, each passed as a one-element list.
df['bias'] = df['content'].map(lambda article: predict_bias([article]))

In [6]:
# Hoax Model
path = '../model/hoax/'
with open(path + 'tokenizer_A3.pkl', 'rb') as f:
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
    hoax_tokenizer = pickle.load(f)

hoax_interpreter = tf.lite.Interpreter(model_path=path + 'hoax_detection_A3.tflite')
hoax_interpreter.allocate_tensors()

hoax_input_details = hoax_interpreter.get_input_details()
hoax_output_details = hoax_interpreter.get_output_details()

# Generic names retained for any cell that still reads them. predict_hoax uses
# the hoax_* names instead, so cells that later reassign the generic names
# cannot change which tokenizer/model it runs.
tokenizer = hoax_tokenizer
interpreter = hoax_interpreter
input_details = hoax_input_details
output_details = hoax_output_details


def predict_hoax(news_text, max_len=100):
    """Classify one article with the hoax TFLite model.

    Args:
        news_text: List holding the article text to classify. A bare string
            is wrapped in a list so it is tokenized as one text rather than
            character by character.
        max_len: Length the token sequence is padded/truncated to.
            NOTE(review): presumably must equal the training length - confirm.

    Returns:
        1 if the first prediction is above 0.5, otherwise 0.
    """
    if isinstance(news_text, str):
        news_text = [news_text]

    # Tokenize, pad, and cast to float32 before handing it to the interpreter.
    sequences = hoax_tokenizer.texts_to_sequences(news_text)
    padded = pad_sequences(sequences, maxlen=max_len).astype('float32')

    # Feed the input tensor, run inference and read the output tensor.
    hoax_interpreter.set_tensor(hoax_input_details[0]['index'], padded)
    hoax_interpreter.invoke()
    predictions = hoax_interpreter.get_tensor(hoax_output_details[0]['index'])

    # Threshold the first prediction into a 0/1 label.
    return 1 if predictions[0] > 0.5 else 0

In [7]:
# Predict hoax: one model call per article, each passed as a one-element list.
df['hoax'] = df['content'].map(lambda article: predict_hoax([article]))

In [8]:
# Liberalism_Conservative Model
path = '../model/liberalism_conservative/'
with open(path + 'tokenizer_liberalism_conservative_A.pkl', 'rb') as f:
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
    libcon_tokenizer = pickle.load(f)

libcon_interpreter = tf.lite.Interpreter(model_path=path + 'liberalism_conservative_A.tflite')
libcon_interpreter.allocate_tensors()

libcon_input_details = libcon_interpreter.get_input_details()
libcon_output_details = libcon_interpreter.get_output_details()

# Generic names retained for any cell that still reads them.
# predict_liberalism_conservative uses the libcon_* names instead, so a cell
# that reassigns the generic names cannot change which model it runs.
tokenizer = libcon_tokenizer
interpreter = libcon_interpreter
input_details = libcon_input_details
output_details = libcon_output_details


def predict_liberalism_conservative(news_text, max_len=100):
    """Classify one article with the liberalism/conservative TFLite model.

    Args:
        news_text: List holding the article text to classify. A bare string
            is wrapped in a list so it is tokenized as one text rather than
            character by character.
        max_len: Length the token sequence is padded/truncated to.
            NOTE(review): presumably must equal the training length - confirm.

    Returns:
        1 if the first prediction is above 0.5, otherwise 0.
        NOTE(review): which leaning 1 stands for is not established in this
        cell - confirm against the training labels.
    """
    if isinstance(news_text, str):
        news_text = [news_text]

    # Tokenize, pad, and cast to float32 before handing it to the interpreter.
    sequences = libcon_tokenizer.texts_to_sequences(news_text)
    padded = pad_sequences(sequences, maxlen=max_len).astype('float32')

    # Feed the input tensor, run inference and read the output tensor.
    libcon_interpreter.set_tensor(libcon_input_details[0]['index'], padded)
    libcon_interpreter.invoke()
    predictions = libcon_interpreter.get_tensor(libcon_output_details[0]['index'])

    # Threshold the first prediction into a 0/1 label.
    return 1 if predictions[0] > 0.5 else 0

In [9]:
# Predict liberalism_conservative: one model call per article, each passed as
# a one-element list.
df['liberalism_conservative'] = df['content'].map(
    lambda article: predict_liberalism_conservative([article])
)

In [10]:
# Display the frame with the 'bias', 'hoax' and 'liberalism_conservative'
# columns added by the cells above.
df

Unnamed: 0,source,title,content,embedding,cluster,bias,hoax,liberalism_conservative
0,detik,"Polisi Tembak Polisi di Rumah Pejabat Polri, B...",Jakarta - Seorang polisi Brigadir J tewas dite...,"[-0.00010740216384874657, -0.00601372215896844...",2,0,1,1
1,cnnindonesia,Kronologi Polisi Tembak Polisi hingga Tewas di...,"Jakarta, CNN Indonesia -- Mabes Polri mengungk...","[-0.009914712980389595, -0.006208854261785746,...",2,1,1,1
2,ajnn,"Polisi Tembak Polisi, Brigadir J Tewas di Ruma...",JAKARTA - Polisi Brigadir J tewas akibat temba...,"[-0.006993312854319811, -0.008073685690760612,...",2,1,1,1
3,Suratpemred,Dor! Polisi Tembak Polisi di Rumah Dinas Pejab...,"JAKARTA, SP – Peristiwa penembakan sesama angg...","[0.0023585606832057238, -0.0064333160407841206...",2,0,0,1
4,Indozone,Fakta-fakta Polisi Adu Tembak di Rumah Petingg...,Peristiwa adu tembak antara anggota kepolisian...,"[-0.0008913218625821173, -0.001862582983449101...",2,0,1,1
5,TribuJabar,Polri Ungkap Penyebab 2 Polisi Baku Tembak di ...,"TRIBUNJABAR.ID, JAKARTA - Penyebab meninggalny...","[0.0053572701290249825, -0.009876349940896034,...",2,1,1,1
6,Detiknews,Polri Jelaskan Alasan Kasus Penembakan Brigadi...,Jakarta - Polri mengungkapkan alasan mengapa k...,"[-0.003241070080548525, -0.009468814358115196,...",2,1,1,1
7,Sindonews,"Brigadir Polisi Ditembak Bharada, Ini Penjelas...",JAKARTA - Kepala Biro Penerangan Masyarakat (K...,"[0.0016425038920715451, 0.0018388287862762809,...",2,1,1,1
8,Jpnn,Brigadir J Masuk Kamar dan Melecehkan Istri Ir...,"jpnn.com, JAKARTA SELATAN - Karopenmas Divhuma...","[-0.01566256396472454, -0.011813515797257423, ...",2,1,1,1
9,Cnnindonesia,Brigadir J Merupakan Sopir Istri Kadiv Propam ...,"Sementara, penembak Brigadir J, Bharada E adal...","[-0.018271414563059807, -0.016924822703003883,...",2,1,1,1


In [None]:
def create_documents(df):
    """Turn every row of ``df`` into a llama_index ``Document``.

    The row's 'content' becomes the document text, the row index (as a string)
    becomes the doc id, and the row's 'embedding' is attached as-is. The 0/1
    model outputs are written into the metadata as readable labels next to
    the article title.

    Args:
        df: DataFrame with 'title', 'content', 'embedding', 'bias', 'hoax'
            and 'liberalism_conservative' columns.

    Returns:
        List of Documents in row order.
    """
    def _to_document(row_label, record):
        # Map each 0/1 flag to the label string stored in the metadata.
        labels = {
            'title': record['title'],
            'bias': 'biased' if record['bias'] == 1 else 'not biased',
            'hoax': 'hoax' if record['hoax'] == 1 else 'not hoax',
            'liberalism_conservative': (
                'liberalism' if record['liberalism_conservative'] == 1 else 'conservative'
            ),
        }
        return Document(
            text=record['content'],
            doc_id=str(row_label),
            metadata=labels,
            embedding=record['embedding'],
        )

    return [_to_document(row_label, record) for row_label, record in df.iterrows()]


documents = create_documents(df)

In [12]:
def createSummary(documents):
    """Ask the globally configured LLM for an Indonesian summary of the documents.

    Builds a SummaryIndex over ``documents`` and queries it with a fixed
    prompt. Settings.llm is read when the function is called, so the model
    used is whatever Settings.llm holds at that moment.

    Args:
        documents: Documents to summarize; expected to be the list produced
            by create_documents.

    Returns:
        The response object returned by the summary query engine.
    """
    # The legend names the metadata labels exactly as create_documents writes
    # them ('biased'/'not biased', 'hoax'/'not hoax',
    # 'liberalism'/'conservative'), so the prompt and the document metadata
    # describe the same values.
    summarizeQuery = """
    Create a short, detailed, and factual summary of the articles.
    For more context for you, chatgpt, each article's metadata carries labels
    detected using machine learning models:
    'bias' ('biased' or 'not biased'),
    'hoax' ('hoax' or 'not hoax'),
    and 'liberalism_conservative' ('liberalism' or 'conservative').
    Do NOT discuss the articles' hoax, bias, or political view.
    Do NOT rely on previous knowledge.
    Use Indonesian language.
    """
    summary_index = SummaryIndex.from_documents(documents, use_async=True)
    summary_query_engine = summary_index.as_query_engine(llm=Settings.llm)

    summary = summary_query_engine.query(summarizeQuery)
    return summary

In [None]:
# Summarize all documents built above and print the response.
print(createSummary(documents))

In [13]:
# Replace the global LLM with gpt-4o for the cells below. createSummary and
# queryEngineMaker read Settings.llm when they are called, so running them
# after this cell uses this model.
Settings.llm = OpenAI(model='gpt-4o')

In [14]:
def queryEngineMaker(df):
    """Build a vector-index query engine over the articles in ``df``.

    Args:
        df: DataFrame accepted by create_documents.

    Returns:
        Query engine created from a VectorStoreIndex of the generated
        documents, using the LLM held in Settings.llm at call time.
    """
    article_index = VectorStoreIndex.from_documents(create_documents(df))
    return article_index.as_query_engine(llm=Settings.llm)

In [15]:
# Expose the article index as a single named tool. The sub-question engine
# routes every generated sub-question to it (the "[news_articles]" prefix in
# the query log below).
query_engine_tools = [
    QueryEngineTool(
        query_engine=queryEngineMaker(df),
        metadata=ToolMetadata(
            name="news_articles",
            description="News Articles of the same topic with bias, hoax, and liberalism_conservative labelled.",
        ),
    ),
]

# Engine that answers a query by generating sub-questions and running them
# against the tools listed above.
query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools
)

**********
Trace: index_construction
    |_CBEventType.NODE_PARSING -> 0.004502 seconds
      |_CBEventType.CHUNKING -> 0.0 seconds
      |_CBEventType.CHUNKING -> 0.0 seconds
      |_CBEventType.CHUNKING -> 0.0 seconds
      |_CBEventType.CHUNKING -> 0.001 seconds
      |_CBEventType.CHUNKING -> 0.0 seconds
      |_CBEventType.CHUNKING -> 0.0 seconds
      |_CBEventType.CHUNKING -> 0.0 seconds
      |_CBEventType.CHUNKING -> 0.0 seconds
      |_CBEventType.CHUNKING -> 0.0 seconds
      |_CBEventType.CHUNKING -> 0.0 seconds
      |_CBEventType.CHUNKING -> 0.0 seconds
      |_CBEventType.CHUNKING -> 0.0 seconds
      |_CBEventType.CHUNKING -> 0.001503 seconds
      |_CBEventType.CHUNKING -> 0.0 seconds
      |_CBEventType.CHUNKING -> 0.0 seconds
**********


In [20]:
# Prompt for the liberal-vs-conservative comparison; it is passed unchanged to
# query_engine.query in the next cell.
compareQuery = """
You will act as a text analysis and comparison expert to help me analyze a collection of articles classified as liberal or conservative. These articles all address the same topic or event. Your task is to:

Identify linguistic patterns in liberal and conservative sources, analyzing key phrases, tone, and stylistic choices.
Compare how each perspective frames the topic, including their choice of focus, emotional appeals, or omitted details.
Summarize the themes unique to each perspective and explain how these differences reveal underlying priorities or ideological leanings.
Include direct quotes from the articles to illustrate your analysis and provide concrete examples.
Highlight specific instances of emotionally charged language, identifying how they shape the narrative or influence the reader's perception.
Present your findings in a clear and structured format, using sections or bullet points where appropriate. Do not use markdowns. Ensure the analysis is thorough and uses examples to make the findings actionable.

"""

In [21]:
# Run the comparison prompt through the sub-question engine; the generated
# sub-questions are logged as they are issued.
response = query_engine.query(compareQuery)

Generated 10 sub questions.
[1;3;38;2;237;90;200m[news_articles] Q: What are the key phrases and stylistic choices used in liberal articles on the topic?
[0m[1;3;38;2;90;149;237m[news_articles] Q: What are the key phrases and stylistic choices used in conservative articles on the topic?
[0m[1;3;38;2;11;159;203m[news_articles] Q: How do liberal sources frame the topic, including their choice of focus and emotional appeals?
[0m[1;3;38;2;155;135;227m[news_articles] Q: How do conservative sources frame the topic, including their choice of focus and emotional appeals?
[0m[1;3;38;2;237;90;200m[news_articles] Q: What themes are unique to liberal perspectives on the topic?
[0m[1;3;38;2;90;149;237m[news_articles] Q: What themes are unique to conservative perspectives on the topic?
[0m[1;3;38;2;11;159;203m[news_articles] Q: What direct quotes from liberal articles illustrate their analysis and provide concrete examples?
[0m[1;3;38;2;155;135;227m[news_articles] Q: What direct quote

In [22]:
# Print the final answer returned for the comparison prompt.
print(response)

To analyze the collection of articles classified as liberal or conservative, we can break down the analysis into several key areas:

1. **Linguistic Patterns in Liberal Sources:**
   - **Key Phrases and Stylistic Choices:** Liberal articles often use factual reporting with detailed descriptions, including specific details about incidents such as time, location, and individuals involved. The narrative is straightforward, focusing on the sequence of events, quotes from official sources, and the ongoing investigation.
   - **Tone:** The tone is typically neutral and informative, emphasizing transparency and accountability.
   - **Direct Quotes:** For example, quotes from Brigjen Ahmad Ramadhan describe the sequence of events, highlighting actions like Brigadir J allegedly committing an inappropriate act and the subsequent confrontation.

2. **Linguistic Patterns in Conservative Sources:**
   - The context does not provide specific examples of linguistic patterns in conservative articles, 

In [23]:

# Iterate through the sub-question/answer pairs captured in SUB_QUESTION events.
# CBEventType and EventPayload come from the notebook's top import cell, so
# every dependency is declared in one place.
sub_question_events = llama_debug.get_event_pairs(CBEventType.SUB_QUESTION)

for i, (start_event, end_event) in enumerate(sub_question_events):
    # The end event's payload carries the answered sub-question pair.
    qa_pair = end_event.payload[EventPayload.SUB_QUESTION]
    print(f"Sub Question {i}: {qa_pair.sub_q.sub_question.strip()}")
    print(f"Answer: {qa_pair.answer.strip()}")
    print("====================================")

Sub Question 0: What are the key phrases and stylistic choices used in liberal articles on the topic?
Answer: Liberal articles on the topic tend to use key phrases and stylistic choices that emphasize the procedural aspects of the investigation and the official statements from authorities. They often highlight the sequence of events, such as the timeline of the incident and the actions taken by the police. Phrases like "Mabes Polri membeberkan motif," "peristiwa baku tembak," and "kasus ini akan diusut" are used to convey the seriousness and the procedural nature of the investigation. Additionally, there is a focus on the roles and actions of the individuals involved, such as "Brigadir J melakukan tindakan pelecehan" and "Bharada E untuk melindungi diri," which provide a narrative of self-defense and official response. The articles also tend to include direct quotes from police officials to lend authority and credibility to the information presented.
Sub Question 1: What are the key ph