In [1]:
import pandas as pd
import json
import openai
import os

In [2]:
def get_conflict_analysis(text1, text2, client):
    """Ask the chat model whether two text excerpts contradict each other.

    Args:
        text1: First excerpt, inserted into the prompt as "Text 1".
        text2: Second excerpt, inserted into the prompt as "Text 2".
        client: Object exposing ``chat.completions.create(...)`` (an
            OpenAI-compatible client).

    Returns:
        dict: Always contains ``"conflict"`` (bool), ``"conflicting_points"``
        (list) and ``"explanation"`` (str). Extra keys sent by the model are
        preserved. If the request fails or the reply cannot be parsed as a
        JSON object, the error is printed and a "no conflict" result with
        empty details is returned instead of raising.
    """
    prompt = '''Compare these two text excerpts and analyze if they contain conflicting information. 
Respond in this EXACT JSON format (no markdown, no additional text):
{{
    "conflict": boolean,
    "conflicting_points": [list of specific conflicting aspects],
    "explanation": "short paragraph explaining contradictions"
}} 

Text 1: {text1}

Text 2: {text2}'''

    fallback = {"conflict": False, "conflicting_points": [], "explanation": ""}

    try:
        response = client.chat.completions.create(
            model='Meta-Llama-3.3-70B-Instruct',
            messages=[
                {"role": "system", "content": "You are an expert analyst comparing documents for contradictions."},
                {"role": "user", "content": prompt.format(text1=text1, text2=text2)}
            ],
            temperature=0.0,
        )

        raw = (response.choices[0].message.content or "").strip()

        # The reply may be wrapped in markdown fences or surrounded by prose
        # despite the instructions; keep only the outermost {...} span.
        start, end = raw.find("{"), raw.rfind("}")
        if start != -1 and end > start:
            raw = raw[start:end + 1]

        parsed = json.loads(raw)
        if not isinstance(parsed, dict):
            raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")

    except Exception as e:
        # Best effort: one failed comparison must not abort the whole run.
        print(f"Error in conflict analysis: {e}")
        return fallback

    # Normalise the payload so callers can index the three keys safely.
    conflict = parsed.get("conflict", False)
    if isinstance(conflict, str):
        # bool("false") would be True, so compare the text explicitly.
        conflict = conflict.strip().lower() == "true"

    points = parsed.get("conflicting_points") or []
    if not isinstance(points, list):
        points = [points]

    explanation = parsed.get("explanation") or ""

    return {
        **parsed,
        "conflict": bool(conflict),
        "conflicting_points": points,
        "explanation": str(explanation),
    }

In [3]:
def cross_reference_documents(wikileaks, news, api_key):
    """Compare every document with every other one and score its reliability.

    Each ordered pair (doc A as "Text 1", doc B as "Text 2") is sent to
    ``get_conflict_analysis``. A document's reliability score is the
    percentage of the other documents it was NOT reported to conflict with.

    Args:
        wikileaks: DataFrame with 'PDF Path' and 'Text' columns.
        news: DataFrame with 'Link' and 'Text' columns.
        api_key: Key passed to the OpenAI-compatible client that targets the
            SambaNova endpoint.

    Returns:
        pandas.DataFrame: One row per document with the columns ``source``,
        ``doc_id``, ``text``, ``reliability_score``, ``conflict_count`` and
        ``conflicts`` (a JSON-encoded list of conflict entries).
    """
    client = openai.OpenAI(
        api_key=api_key,
        base_url="https://api.sambanova.ai/v1",
    )

    # Combined document list: (source label, document id, text)
    documents = [
        ('wikileaks', row['PDF Path'], row['Text'])
        for _, row in wikileaks.iterrows()
    ]
    documents += [
        ('news', row['Link'], row['Text'])
        for _, row in news.iterrows()
    ]

    # Every document is compared against all the others.
    max_possible_conflicts = len(documents) - 1

    call_number = 0
    results = []
    for pos1, (source1, id1, text1) in enumerate(documents):
        conflicts = []

        for pos2, (source2, id2, text2) in enumerate(documents):
            # Skip self-comparison by position, not by id: two different
            # documents that share an id must still be compared, otherwise
            # the denominator above would overstate the comparisons made.
            if pos1 == pos2:
                continue

            call_number += 1
            print(call_number, end="...\n")

            analysis = get_conflict_analysis(text1, text2, client)

            # Read the reply defensively so that one malformed answer cannot
            # abort the run and discard the results gathered so far.
            if not isinstance(analysis, dict) or not analysis.get('conflict'):
                continue

            conflicts.append({
                'conflicting_doc_id': id2,
                'source': source2,
                'points': analysis.get('conflicting_points', []),
                'explanation': analysis.get('explanation', '')
            })

        conflict_count = len(conflicts)

        # Reliability: share of comparisons without a reported conflict.
        if max_possible_conflicts > 0:
            reliability = max(0, 100 - (conflict_count / max_possible_conflicts * 100))
        else:
            reliability = 100

        results.append({
            'source': source1,
            'doc_id': id1,
            'text': text1,
            'reliability_score': round(reliability, 2),
            'conflict_count': conflict_count,
            'conflicts': json.dumps(conflicts)
        })

    return pd.DataFrame(results)

In [4]:
# Read the SambaNova API key from the environment instead of committing it to
# the notebook. Export SAMBANOVA_API_KEY before starting the kernel.
# NOTE(review): a key was previously hardcoded in this cell; treat it as
# leaked and rotate it.
API_KEY = os.environ.get("SAMBANOVA_API_KEY")

In [5]:
# Read the parsed spreadsheets for both sources
wikileaks = pd.read_excel('dataset/wikileaks_parsed.xlsx')
news = pd.read_excel('dataset/news_excerpts_parsed.xlsx')

# Collapse each document to one row: all 'Text' fragments that share the same
# key ('PDF Path' / 'Link') are concatenated with single spaces.
# NOTE(review): ' '.join raises TypeError on non-string values, e.g. NaN from
# an empty cell — confirm the sheets contain no blank 'Text' rows.
wikileaks_processed = wikileaks.groupby('PDF Path')['Text'].agg(' '.join).reset_index()
news_processed = news.groupby('Link')['Text'].agg(' '.join).reset_index()

In [6]:
# Work on a small sample: the first three documents from each source
wikileaks_data = wikileaks_processed.head(3)
news_data = news_processed.head(3)

In [7]:
# Run the pairwise comparison; prints a running counter, one line per model call
results_df = cross_reference_documents(
    wikileaks=wikileaks_data,
    news=news_data,
    api_key=API_KEY,
)

1...
2...
3...
4...
5...
6...
7...
8...
9...
10...
11...
12...
13...
14...
15...
16...
17...
18...
19...
20...
21...
22...
23...
24...
25...
26...
27...
28...
29...
30...


In [8]:
# Display the results table (one row per compared document)
results_df

Unnamed: 0,source,doc_id,text,reliability_score,conflict_count,conflicts
0,wikileaks,1.pdf,Pristina Airport – Possible administrative irr...,80.0,1,"[{""conflicting_doc_id"": ""http://edition.cnn.co..."
1,wikileaks,10.pdf,"""An interoffice memorandum providing an “outst...",80.0,1,"[{""conflicting_doc_id"": ""http://edition.cnn.co..."
2,wikileaks,105.pdf,"""Description\n\nThis is a Secret US National S...",100.0,0,[]
3,news,http://edition.cnn.com/2011/WORLD/asiapcf/09/0...,Turkey's fiery prime minister ratcheted up rap...,100.0,0,[]
4,news,https://apnews.com/article/congo-north-kivu-it...,"GOMA, Congo (AP) — One of eastern Congo’s most...",80.0,1,"[{""conflicting_doc_id"": ""105.pdf"", ""source"": ""..."
5,news,https://apnews.com/article/japan-south-north-k...,"In 2019, Japan’s government tightened export c...",100.0,0,[]
