## Imports

In [1]:
import os, json
from datetime import datetime

from scripts.ollama_handler import OllamaMediaAnalysis
from scripts.file_analyzer import *

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

## Parameters

In [2]:
# Ollama model tag used for every LLM call in this notebook.
# Run `ollama.list()` to see the locally available models.
# Alternatives that were tried: "granite3.1-moe", "granite3.1-dense:8b-instruct-q8_0", "granite3.1-dense:8b"
# NOTE(review): the tag contains ':' and is reused below as a folder and file name;
# ':' is not a valid path character on Windows -- confirm the target platform.
MODEL = "granite3.1-moe:3b-instruct-q8_0"
#SYSTEM_PROMPT = f"You are a senior researcher, working on a media analysis of articles published in arabic newspapers about ChatGPT and the effect of Artificial Intelligence on society. For your answers only focus on topics that were mentioned in the text without adding any further information. Before answering, thoroughly think about the task, the content provided and build your answer with chain of thought reasoning."
SYSTEM_PROMPT = (
    "You are a senior researcher conducting a media analysis of Arabic newspaper articles "
    "about ChatGPT and the societal effects of Artificial Intelligence. Your role is to focus "
    "exclusively on the topics mentioned in the provided text, without introducing external information. "
    "Before responding, carefully analyze the task, thoroughly evaluate the content of the articles, "
    "and construct your answer using a clear chain of thought reasoning approach."
)

# MAY TAKE LONG TIME! Whether all files should be processed (indicates Ollama interactions)
PROCESS_DOCUMENTS = False

# Name of the spaCy model handed to the file analyzer
SPACY_MODEL = "en_core_web_lg"


# Set folder paths
DOCUMENTS_FOLDER = "docs"
FILES_FOLDER = os.path.join(DOCUMENTS_FOLDER, "DOCX")  # source documents (.docx)
OUTPUT_FOLDER = os.path.join(DOCUMENTS_FOLDER, "Processed", MODEL)
# Single quotes inside the replacement field keep this line valid on Python < 3.12,
# where reusing the enclosing quote character inside an f-string is a SyntaxError.
# The name embeds today's date (yymmdd), so it changes from one day to the next.
EXPORT_FILE_NAME = f"{datetime.now().strftime('%y%m%d')}-{MODEL}-processed_documents.pkl"

## Helper functions

In [3]:
def flatten_dict(d: dict):
    """Flatten every value of a dictionary into one flat list per key.

    Args:
        d: Mapping whose values may be (nested) lists, nested dictionaries
            or plain scalar values.

    Returns:
        A new dictionary with the same keys as ``d``. Each value is a flat list:
        - list values are flattened recursively via ``flatten_list``;
        - dict values contribute the flattened values of the nested dictionary,
          concatenated in key order (the nested keys themselves are dropped);
        - any other value is wrapped in a one-element list.
    """
    flattened_dict = dict()
    for k, v in d.items():
        if isinstance(v, list):
            flat_list = flatten_list(v)
        elif isinstance(v, dict):
            # Iterating the nested result directly would yield only its keys,
            # so merge the flattened *values* of the nested dictionary instead.
            flat_list = []
            for nested_values in flatten_dict(v).values():
                flat_list.extend(nested_values)
        else:
            # Keep scalar values instead of silently replacing them with [].
            flat_list = [v]

        flattened_dict[k] = flat_list
    return flattened_dict

def flatten_list(l: list):
    """Return a new flat list with the items of ``l`` in their original order.

    Every element that is itself a ``list`` is expanded recursively; elements
    of any other type (including tuples and dicts) are kept unchanged.
    """
    collected = []
    for element in l:
        # Nested lists are expanded in place, everything else is taken as-is.
        collected += flatten_list(element) if isinstance(element, list) else [element]
    return collected

## Initialize File Analyzer
Handles all files: folder processing, text extraction, and question answering.

In [4]:
# Questions handed to the analyzer; the same list is reused further down
# to look up each document's answers and to build the final summary.
questions = [
    # 1) Framing and recurring metaphors
    (
        "How do the media in this article frame the public discussion about ChatGPT? "
        "Are there certain **metaphors** that keep cropping up?"
    ),
    # 2) Role of the Arabic world in AI development
    (
        "Which role does or might the Arabic World play in the development of Artificial Intelligence? "
        "Answer with 'Not mentioned' if not applicable."
    ),
    # 3) Helpful use cases
    "Which use cases of Artificial Intelligence are helpful for the Arabic world based on this article?",
    # 4) Final message of the article
    (
        "What is the final message of the article that the author wants to convey? "
        "Keep your answer short and precise!"
    ),
]

In [5]:
# LLM wrapper: an OllamaMediaAnalysis instance bound to the configured model and system prompt
llm = OllamaMediaAnalysis(
    model_name=MODEL,
    system_prompt=SYSTEM_PROMPT,
    debug=True,
)

# File analyzer: receives the LLM wrapper, the spaCy model name, the export
# location (folder + file name) and the questions defined above
analyzer = FileAnalyzer(
    ollama_handler=llm,
    entity_collection="spacy",
    spacy_model=SPACY_MODEL,
    file_name=EXPORT_FILE_NAME,
    output_folder=OUTPUT_FOLDER,
    questions=questions,
    debug=False,
    speed_debug=False,
)

In [6]:
if PROCESS_DOCUMENTS:
    # Process every .docx document found in FILES_FOLDER.
    # The trailing comma matters: (".docx") is just the string ".docx",
    # whereas (".docx",) is the one-element tuple the plural `file_types` suggests.
    documents = analyzer.process_folder(FILES_FOLDER, file_types=(".docx",))

    # Save documents to the output folder
    analyzer.save_documents(documents)

In [7]:
# Load the already analyzed documents from the export file of this model.
# NOTE(review): EXPORT_FILE_NAME embeds today's date and load_latest is False, so this
# presumably only finds a file that was written today -- confirm against load_documents.
analyzer.load_documents(
    os.path.join(OUTPUT_FOLDER, EXPORT_FILE_NAME),
    load_latest=False,
)

21:23:17 Initialized Document: <ChatGPT AI grows more powerful as we become more predictable>
21:23:17 Initialized Document: <ChatGPT is the ‘Netscape moment’ for artificial intelligence’>
21:23:17 Initialized Document: <ChatGPT outperforms copywriters in STEP Conference’s outdoor adverts>
21:23:17 Initialized Document: <AI is not smarter than humans>
21:23:17 Initialized Document: <No need to demonize ChatGPT but AI regulation is a must>
21:23:17 Initialized Document: <Is the Arab world ready for the uncertain age of AI-powered web tools>
21:23:17 Initialized Document: <‘I am not here to take your job,’ ChatGPT tells Frankly Speaking host>
21:23:17 Initialized Document: <Will ChatGPT and AI have an impact on Saudi workforce productivity>


## Get insights of all documents

In [11]:
# Show the info block of every loaded document, each followed by a dashed separator line
for doc in analyzer:
    # Optional per-document steps, kept disabled:
    #analyzer.create_wordcloud(doc,wordcloud_names=["markings", "content", "summary"])
    #doc.summary = llm.generate_summary(doc.content)
    #result = llm.analyze_sentiment(doc.content))
    #result = doc.sentiment

    #doc.sentiment = result.get("sentiment_value")
    #doc.sentiment_reason = result.get("sentiment_reason")

    # Emits: info text, newline, 50 x "- ", then two newlines
    print(doc.get_info(), "- " * 50, sep="\n", end="\n\n")

Title: ChatGPT AI grows more powerful as we become more predictable
Short Summary: The article is about the rapid, widespread adoption and hype surrounding ChatGPT, an AI language model by OpenAI, which gained one million users within five days, signaling the start of the AI age. Despite its popularity, it's criticized for lacking human-like intelligence and ethical concerns arise regarding reinforcing ideologies.
Summary:
1. **Rapid Adoption**: The release of ChatGPT, an AI large language model owned by OpenAI, was met with unprecedented speed, attracting a million unique users within five days post-launch.
 
2. **Emergence of AI in Developing Markets**: Companies and governments worldwide have rapidly adopted AI tools like ChatGPT due to the eagerness for an AI-powered future.
3. **Predictability and AI Capabilities**: The technology, based on predictable human behavior analysis from large data sets, can generate content with educated guesses about trends in accessible data. However,

In [12]:
# OPTIONAL: Store an updated version of the documents
# analyzer.save_documents(analyzer.all_documents)

### Export the files as word-docx and markdown files

In [13]:
# Export docx files with wordclouds
# (delegates entirely to the analyzer; the target location is not visible here --
#  presumably the analyzer's output folder, TODO confirm)
analyzer.export_docx_files()

In [14]:
# Write a markdown file for every document
# (delegates entirely to the analyzer; the target location is not visible here --
#  presumably the analyzer's output folder, TODO confirm)
analyzer.export_markdown_files()

## Analysis of all files

### Apply latent dirichlet allocation algorithm
The algorithm extracts topics from the articles. The LLM then adds a title that summarizes each topic's top features into a category.

Thereby, all different topics can be extracted out of **all** documents.

In [15]:
# Token content of every loaded document, in the order of analyzer.all_documents
# (assumes each `content_tokens` is a single string, as required by analyzer="word" -- TODO confirm)
all_content_tokens = [doc.content_tokens for doc in analyzer.all_documents]

# Document-term matrix: English stop words removed; keep terms that occur
# in at least 5 documents and in at most 95 % of them
vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=5,
    stop_words='english',
    analyzer="word",
)
doc_term_matrix = vectorizer.fit_transform(all_content_tokens)

# Fit LDA with 20 components; the fixed random_state keeps runs repeatable
lda = LatentDirichletAllocation(
    n_components=20,
    learning_method="batch",
    random_state=42,
    n_jobs=-1,
)
lda.fit(doc_term_matrix)

In [16]:
# Function to generate unique topics
from pyexpat import model


def get_unique_topics(model, vectorizer, top_n=10):
    """Collect the distinct top-feature sets of a fitted topic model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic weight
            rows that support ``argsort()``.
        vectorizer: Object exposing ``get_feature_names_out()``, indexable by
            the positions returned from ``argsort()``.
        top_n: Number of highest-weighted features to keep per topic.

    Returns:
        Dict mapping each distinct tuple of top features (highest weight first)
        to the index of the first topic that produced it. Topics whose top
        features repeat an earlier topic are skipped.
    """
    # Fetch the vocabulary once instead of once per selected feature.
    feature_names = vectorizer.get_feature_names_out()

    unique_topics = {}
    for idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk the last `top_n` positions backwards.
        top_features = tuple(feature_names[i] for i in topic.argsort()[:-top_n - 1:-1])

        # The tuple is the key, so an identical feature set keeps its first topic index.
        unique_topics.setdefault(top_features, idx)

    return unique_topics

def generate_topic_titles(llm, unique_topics):
    """Ask the LLM for a title for each unique topic.

    Args:
        llm: Object exposing ``model`` and ``ollama.generate(model=..., prompt=...)``,
            whose result is indexed with ``"response"``.
        unique_topics: Mapping of feature tuples to topic indices.

    Returns:
        Dict mapping each topic index to the LLM's response for that topic.
    """
    # Fixed part of the prompt; the comma-joined features are appended per topic.
    instruction = (
        "Generate a concise and meaningful title, exactly four words long, that summarizes the following features. "
        "The title should capture the main theme or topic of these features. "
        "Example outputs: 'Language Revolution', 'Shaping Future Technology Trends', 'Global Knowledge Network'. Features: "
    )

    return {
        topic_idx: llm.ollama.generate(
            model=llm.model,
            prompt=instruction + ", ".join(feature_group),
        )["response"]
        for feature_group, topic_idx in unique_topics.items()
    }


# Distinct LDA topics (top 20 features each) and an LLM-generated title per topic
unique_topics = get_unique_topics(lda, vectorizer, top_n=20)
topic_titles = generate_topic_titles(llm, unique_topics)


# Store title -> features; double quotes are stripped from the generated titles.
# Topics that receive the same title overwrite each other here.
analyzer.analysis["LDA"] = {
    topic_titles[idx].replace('"', ""): topics
    for topics, idx in unique_topics.items()
}

# Print every stored topic with its title and features
for idx, (title, topics) in enumerate(analyzer.analysis["LDA"].items()):

    print(f"Topic {idx} - {title}:", f"Features: {', '.join(topics)}", sep="\n")

Topic 0 - Generative Intelligence: Global Information Impact & Learning Revolution:
Features: write, generative, look, like, learning, learn, lead, large, know, internet, information, include, impact, help, government, google, generate, world, explain, development
Topic 1 - Microsoft Tech Innovation: Timely Content Development & Learning Advancements:
Features: technology, like, help, time, release, world, ability, understand, negative, microsoft, come, content, learn, include, believe, task, develop, look, know, information
Topic 2 - OpenAI's Revolutionary Large Model: Powering Technological Transformation Worldwide:
Features: datum, technology, development, ability, use, work, change, openai, world, new, large, produce, model, time, like, paper, internet, machine, decision, power
Topic 3 - Transforming Global Learning & Understanding through Tech:
Features: generate, question, write, new, technology, raise, world, report, work, content, develop, produce, like, know, understand, lead,

### Apply term frequency–inverse document frequency (TF-IDF)
For each document, TF-IDF scores every word and ranks highest the words that are frequent in that document but rare in the other documents. The top N words are then passed to the LLM to create a topic title for every article.

- A high TF-IDF score (listed first when `FROM_LOW_TO_HIGH = False`) indicates that a word is both important within a document and rare across all documents.
- A low TF-IDF score suggests that a word is either rare within the document or common across all documents.

By analyzing TF-IDF scores for a set of words, you can identify:
- Important keywords in a document
- Rare or unique words that distinguish one document from another
- Words with varying levels of importance across different documents


In [17]:
# Create a TfidfVectorizer object
tfidf_vectorizer = TfidfVectorizer(stop_words='english', norm="l2", analyzer="word", min_df=3)

# Fit and transform the documents into a TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(all_content_tokens)

# Get the feature names (i.e., words)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a DataFrame for better readability
# (one row per document, one column per term)
df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Number of terms stored and printed per document (twice as many are sent to the LLM)
TOP_N = 15
# False -> highest TF-IDF scores first
FROM_LOW_TO_HIGH = False

analyzer.analysis["TFIDF"] = dict()

for i, row in df.iterrows():
    # Row i of the matrix belongs to the i-th document, because
    # all_content_tokens was built from analyzer.all_documents in order.
    doc = analyzer.all_documents[i]
    print(f"\nTop {TOP_N} terms for Document {i + 1}: <{doc.title}>")

    # Rank the terms of this document; the LLM gets 2 * TOP_N of them as context
    top_terms = row.sort_values(ascending=FROM_LOW_TO_HIGH).head(TOP_N*2)

    # One "term - score" line per feature. Built outside the f-string so that no
    # backslash appears inside a replacement field (a SyntaxError before Python 3.12),
    # and without reusing the outer loop index `i`.
    feature_lines = "\n".join(
        f"{str(term)} - {score}" for term, score in zip(top_terms.index, top_terms.values)
    )

    # Create a prompt with the top features
    prompt = (
        "Generate a concise and meaningful title, exactly four words long, that summarizes the following features. "
        "The title should capture the main theme or topic of these features. "
        "Example outputs: 'Language Revolution', 'Shaping Future Technology Trends', 'Global Knowledge Network'. Features: "
        f"{feature_lines}"
        )

    # Use the LLM to generate a title
    title = llm.ollama.generate(model=llm.model, prompt=prompt)["response"]

    # Stored per document object: the generated title and the TOP_N best terms
    analyzer.analysis["TFIDF"][doc] = {"title": title, "terms": top_terms[:TOP_N]}

    print(title, "\n", ", ".join(top_terms[:TOP_N].index))


Top 15 terms for Document 1: <ChatGPT AI grows more powerful as we become more predictable>
"AI-Powered Language Development: Transforming Society's Future in Tech" 
 ai, chatgpt, human, tool, platform, development, language, technology, datum, use, work, change, openai, society, lack

Top 15 terms for Document 2: <ChatGPT is the ‘Netscape moment’ for artificial intelligence’>
"ChatGPT: Revolutionizing Language Research and Application in Saudi Technology" 
 model, ai, example, text, train, word, saudi, technology, chatgpt, generate, new, application, explore, engage, enable

Top 15 terms for Document 3: <ChatGPT outperforms copywriters in STEP Conference’s outdoor adverts>
"AI-Driven Chatbot: Revolutionizing Job Roles, Team Collaboration & Content Creation" 
 ai, team, job, chatbot, plan, tool, create, like, chatgpt, use, explain, need, think, human, company

Top 15 terms for Document 4: <AI is not smarter than humans>
"AI-Powered Chatbot: Revolutionizing Business, Marketing, and Use

In [18]:
# For each question: gather every document's stored answer (keyed by title),
# hand them to the LLM as JSON and keep its cross-article response.
# (A previous up-front json.dumps of all document contents was removed:
#  its result was overwritten before ever being used.)
for question in questions:
    content = {doc.title: doc.answers.get(question) for doc in analyzer}
    response = llm.answer_question(text=json.dumps(content), question=question, multiple_articles=True)
    analyzer.analysis[question] = response

21:25:22	 Answering question <How do the media in this artic...>
21:25:32	 Answering question <Which role does or might the A...>
21:25:35	 Answering question <Which use cases of Artificial ...>
21:25:43	 Answering question <What is the final message of t...>


In [30]:
answers_questions = dict()

# Question asked once across the topic clusters of all articles.
# The adjacent string literals are concatenated as-is, so every sentence
# needs its own trailing space.
topic_question_all = (
    "Attached are the topics of every article. "
    "What **perspectives and aspects** are being widely covered? Which aspects are being ignored? "
    "In your answer consider topics such as, but not only, data privacy, costs/affordability, know-how, complexity, accuracy, accessibility, bias (towards age, gender, religion, sexuality), risks, opportunity, perception, limitations. "
    "These are the topics of all articles: ")

# Collect the topic clusters of every document (keyed by title) and flatten them to one list per article
topic_clusters = {doc.title: list(doc.topic_clusters.values()) for doc in analyzer}
content_topics = flatten_dict(topic_clusters)
response = llm.answer_question(text=json.dumps(content_topics), question=topic_question_all, multiple_articles=True)
analyzer.analysis["topic_question"] = response

19:39:13	 Answering question <Attached are the topics of eve...>


In [31]:
markdown = str()

# Stored LDA result (title -> features). Deliberately not named `lda`: that name
# holds the fitted LatentDirichletAllocation model, and rebinding it to a dict
# would break re-running the LDA cells.
lda_topics = analyzer.analysis.get("LDA")
# Joined outside the f-strings: a backslash inside a replacement field is a
# SyntaxError before Python 3.12.
lda_bullets = "\n- ".join(lda_topics.keys())
markdown += f"# Latent Dirichlet Allocation topics\n- {lda_bullets}\n\n"


# One generated title per article, taken from the stored TF-IDF results
tfidf = [elem.get("title") for elem in flatten_list(analyzer.analysis.get("TFIDF").values())]
tfidf_bullets = "\n- ".join(tfidf)
markdown += f"# TDIF for every article\n- {tfidf_bullets}\n\n"

markdown += "# Hypothesis/Questions\n"
# The cross-article topic question is stored under the key "topic_question"
for q_id, question in enumerate(questions + ["topic_question"]):
    answer = analyzer.analysis.get(question)
    markdown += f"## Question {q_id+1}\n"
    # Single quotes inside the replacement fields keep this valid on Python < 3.12
    markdown += f"*{answer.get('question')}*\n{answer.get('answer')}\nReasoning: {answer.get('reasoning')}\n\n"


print(markdown)

# Write the summary next to the other exports of this analyzer
markdown_path = os.path.join(analyzer.output_folder, "final_summary.md")
analyzer.export_markdown(markdown_path, markdown)

# Latent Dirichlet Allocation topics
- Tech-Driven Global Information Expansion
- Tech-Driven Knowledge Expansion: Microsoft's Latest Innovations
- AI Revolution: Transforming Datum, Technology, and Decision-Making
- Advancements in Knowledge Production and Dissemination
- Tech-Driven Data Revolution: Information Overhaul via Arabic Internet Model
- Revolutionizing Saudi Content Creation: Technology, Models & Impact
- Tech Giants' Innovation for Global Knowledge Access

# TDIF for every article
- "AI, ChatGPT, and Human Collaboration: Shaping the Language and Technology Future"
- "AI-Driven Language Model: Global Impact and Research Frontier"
- "AI-Driven Chatbot Development: The Future of Content Creation and Threat Mitigation"
- "AI-Powered Chatbot Technology: Revolutionizing Marketing and Business in 2023"
- "ChatGPT, AI in Education: Advancements and Concerns"
- "AI-Powered Chatbot Revolution: Global Impact in 2023"
- "AI-Driven Language Model: Transforming Journalism and News Accu