## Imports

In [1]:
import os, json
from datetime import datetime
from itertools import product

from scripts.ollama_handler import OllamaMediaAnalysis
from scripts.file_analyzer import *

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

## Parameters

In [65]:
# Ollama model used for all LLM interactions (run `ollama.list()` to see the available models).
# Alternatives tried before: "granite3.1-moe", "granite3.1-dense:8b-instruct-q8_0", "granite3.1-dense:8b"
MODEL = "granite3.1-moe:3b-instruct-q8_0"

# System prompt that is passed to the Ollama handler together with the model name
SYSTEM_PROMPT = (
    "You are a senior researcher conducting a media analysis of Arabic newspaper articles "
    "about ChatGPT and the societal effects of Artificial Intelligence. Your role is to focus "
    "exclusively on the topics mentioned in the provided text, without introducing external information. "
    "Before responding, carefully analyze the task, thoroughly evaluate the content of the articles, "
    "and construct your answer using a clear chain of thought reasoning approach."
)

# MAY TAKE LONG TIME! Whether all files should be processed (indicates Ollama interactions)
PROCESS_DOCUMENTS = False

# Name of the spaCy model handed to the file analyzer
SPACY_MODEL = "en_core_web_lg"


# Set folder paths
DOCUMENTS_FOLDER = "docs"
FILES_FOLDER = os.path.join(DOCUMENTS_FOLDER, "DOCX")  # folder holding the .docx source articles
# NOTE(review): MODEL contains ":" which is not allowed in Windows path names - confirm the target platforms
OUTPUT_FOLDER = os.path.join(DOCUMENTS_FOLDER, "Processed", MODEL)
# The date prefix is evaluated when this cell runs, so the name changes from day to day.
# Single quotes are used inside the f-string because reusing the enclosing double quotes
# is a SyntaxError on Python versions before 3.12.
EXPORT_FILE_NAME = f"{datetime.now().strftime('%y%m%d')}-{MODEL}-processed_documents.pkl"

## Helper functions

In [3]:
def flatten_dict(d: dict):
    """Flatten every value of a dictionary into one flat list per key.

    Args:
        d: Mapping whose values may be (nested) lists, nested dictionaries or
            single values.

    Returns:
        A new dict with the same top-level keys as ``d``. Each value is a flat
        list: nested lists are flattened with ``flatten_list``, nested
        dictionaries contribute all of their (recursively flattened) values,
        and any other value is kept as a one-element list.
    """
    flattened_dict = dict()
    for k, v in d.items():
        flat_list = []
        if isinstance(v, list):
            flat_list.extend(flatten_list(v))
        elif isinstance(v, dict):
            # Extending with the dict itself would only add its keys, so the
            # flattened values of the nested dict are merged instead.
            for nested_values in flatten_dict(v).values():
                flat_list.extend(nested_values)
        else:
            # Keep plain values instead of silently replacing them with []
            flat_list.append(v)

        flattened_dict[k] = flat_list
    return flattened_dict

def flatten_list(l: list):
    """Return a new list holding the elements of ``l`` with all nested lists unpacked.

    Only ``list`` instances are unpacked (at any depth); every other element,
    including tuples and dicts, is copied over unchanged and in order.
    """
    collected = []
    for element in l:
        # A nested list contributes its own flattened elements, anything else itself
        collected += flatten_list(element) if isinstance(element, list) else [element]
    return collected

## Initialize File Analyzer
Handles all files: folder processing, text extraction, and question answering.

In [4]:
# Questions handed to the analyzer; the same list is reused further down
# to summarize the per-article answers across all documents.
questions = [
    # Framing and recurring metaphors
    "How do the media in this article frame the public discussion about ChatGPT? Are there certain **metaphors** that keep cropping up?",
    # Role of the Arabic world in AI development
    "Which role does or might the Arabic World play in the development of Artificial Intelligence? Answer with 'Not mentioned' if not applicable.",
    # Helpful use cases
    "Which use cases of Artificial Intelligence are helpful for the Arabic world based on this article?",
    # Final message of the author
    "What is the final message of the article that the author wants to convey? Keep your answer short and precise!",
]

In [5]:
# Create the Ollama wrapper with the configured model and system prompt
llm = OllamaMediaAnalysis(
    model_name=MODEL,
    system_prompt=SYSTEM_PROMPT,
    debug=True,
)

# Create the file analyzer; it receives the LLM wrapper, the spaCy model name,
# the export file name / output folder and the questions defined above
analyzer = FileAnalyzer(
    ollama_handler=llm,
    entity_collection="spacy",
    spacy_model=SPACY_MODEL,
    file_name=EXPORT_FILE_NAME,
    output_folder=OUTPUT_FOLDER,
    questions=questions,
    debug=False,
    speed_debug=False,
)

In [6]:
if PROCESS_DOCUMENTS:
    # Process the .docx documents in FILES_FOLDER.
    # The trailing comma matters: (".docx") is just the string ".docx",
    # whereas (".docx",) is the one-element tuple the parentheses suggest.
    # NOTE(review): assumes process_folder accepts a tuple of extensions - confirm against FileAnalyzer
    documents = analyzer.process_folder(FILES_FOLDER, file_types=(".docx",))

    # Save documents to the output folder
    analyzer.save_documents(documents)

23:21:33 Analyzing file from folder: ChatGPT AI grows more powerful as we become more predictable_standardizedlayout.docx
23:21:33	 Adding paragraph 1/11 with 32 characters
23:21:33	 Adding paragraph 2/11 with 19 characters
23:21:33	 Adding paragraph 3/11 with 13 characters
23:21:33	 Adding paragraph 4/11 with 19 characters
23:21:33	 Adding paragraph 5/11 with 36 characters
23:21:33	 Adding paragraph 6/11 with 20 characters
23:21:33	 Adding paragraph 7/11 with 11 characters
23:21:33	 Adding paragraph 8/11 with 0 characters
23:21:33	 Adding paragraph 9/11 with 0 characters
23:21:33	 Adding paragraph 10/11 with 0 characters
23:21:33	 Adding paragraph 11/11 with 5560 characters
23:21:33	 Create Docx document <ChatGPT AI grows mor...> with content of length 5523
23:21:33 Initialized Document: <ChatGPT AI grows more powerful as we become more predictable>
23:21:33	 Generating tokenized content
23:21:33	 Generating short summary
Validation Error: 1 validation error for ShortSummary
short_sum

In [7]:
# Restore the already analyzed documents from the export file in the output folder.
# NOTE(review): EXPORT_FILE_NAME carries the date of the current run, so with
# load_latest=False this presumably only finds a file exported on the same day - confirm.
analyzer.load_documents(
    os.path.join(OUTPUT_FOLDER, EXPORT_FILE_NAME),
    load_latest=False,
)

23:34:32 Initialized Document: <ChatGPT AI grows more powerful as we become more predictable>
23:34:32 Initialized Document: <ChatGPT is the Netscape moment' for artificial intelligence'>
23:34:32 Initialized Document: <ChatGPT outperforms copywriters in STEP Conference's outdoor adverts>
23:34:32 Initialized Document: <AI is not smarter than humans>
23:34:32 Initialized Document: <No need to demonize ChatGPT but AI regulation is a must>
23:34:32 Initialized Document: <Is the Arab world ready for the uncertain age of AI-powered web tools>
23:34:32 Initialized Document: <I am not here to take your job,' ChatGPT tells Frankly Speaking host>
23:34:32 Initialized Document: <Will ChatGPT and AI have an impact on Saudi workforce productivity>


## Get insights of all documents

In [21]:
# Re-clean and re-tokenize every loaded document, then print its info block
# followed by a dashed separator line.
# Optional steps that are currently switched off:
#   analyzer.create_wordcloud(doc, wordcloud_names=["markings", "content", "summary"])
#   doc.summary = llm.generate_summary(doc.content)
#   result = llm.analyze_sentiment(doc.content)
#   doc.sentiment = result.get("sentiment_value")
#   doc.sentiment_reason = result.get("sentiment_reason")
for doc in analyzer:
    # The cleaned text replaces the stored content before the tokens are rebuilt from it
    doc.content = analyzer.clean_input(doc.content)
    doc.content_tokens = analyzer.get_tokens(doc.content)

    print(
        doc.get_info(),
        end="\n" + "- " * 50 + "\n\n",
    )

Title: ChatGPT AI grows more powerful as we become more predictable
Short Summary: The article discusses the rapid advancement of ChatGPT, an AI language model by OpenAI, highlighting its potential in revolutionizing information dissemination but also noting its constrained intelligence and ethical concerns due to potential ideological reinforcement.
Summary:
1. **Rapid Adoption Rate**: The AI language model ChatGPT, owned by OpenAI, has seen an unprecedented rapid adoption rate within five days of its release, with over a million unique users. 2. **AI as Predictive Tool**: OpenAI's approach to ChatGPT is rooted in the notion that human behavior is predictable, enabling it to make guesses about trends based on large data sets. This reflects the current dominance of algorithm-driven internet usage and smartphone habits. 3. **Limitations of Predictability**: While AI tools can mimic human thought using vast amounts of data, they lack the fundamental ability to reason like humans do. Huma

In [66]:
# OPTIONAL: Store an updated version of the documents
#analyzer.save_documents(analyzer.all_documents)

### Export the files as word-docx and markdown files

In [10]:
# Export the loaded documents as .docx files with word clouds.
# NOTE(review): target folder and file naming are decided inside FileAnalyzer - not visible here
analyzer.export_docx_files()

In [11]:
# Write one markdown file per loaded document.
# NOTE(review): target folder and file naming are decided inside FileAnalyzer - not visible here
analyzer.export_markdown_files()

## Analysis of all files

### Apply latent dirichlet allocation algorithm
The algorithm extracts the topics from the articles. The LLM then adds a title that summarizes each topic's features into a category.

Thereby, all different topics can be extracted out of **all** documents.

In [45]:
# Collect the tokenized content of every document as the corpus
all_content_tokens = [doc.content_tokens for doc in analyzer.all_documents]


# Document-frequency bounds for the document-term matrix (floats are proportions of documents)
max_df = 0.8  # ignore terms that appear in more than 80% of the documents
min_df = 0.1  # ignore terms that appear in less than 10% of the documents

vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),  # unigrams and bigrams, to capture multi-word expressions
    min_df=min_df,
    max_df=max_df,
)
doc_term_matrix = vectorizer.fit_transform(all_content_tokens)

# Fit a 5-topic LDA model on the term counts (fixed random_state for repeatable topics).
# NOTE(review): per the scikit-learn docs learning_decay belongs to the online
# learning method - confirm whether it has any effect with learning_method="batch".
lda = LatentDirichletAllocation(
    n_components=5,
    learning_method="batch",
    learning_decay=0.5,
    n_jobs=-1,
    random_state=42,
)
lda.fit(doc_term_matrix)

In [46]:
# Function to generate unique topics
def get_unique_topics(model, vectorizer, top_n=10):
    """Collect the distinct top-feature tuples of a fitted topic model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic weight
            rows that support ``argsort()``.
        vectorizer: Object exposing ``get_feature_names_out()``; the returned
            sequence is indexed with the positions produced by ``argsort()``.
        top_n: Number of highest-weighted features to keep per topic.

    Returns:
        Dict mapping each distinct tuple of top features (highest weight first)
        to the index of the first topic that produced it.
    """
    # Fetch the feature names once instead of once per feature of every topic
    feature_names = vectorizer.get_feature_names_out()

    unique_topics = {}
    for idx, topic in enumerate(model.components_):
        # argsort is ascending, so the reversed tail holds the top_n largest weights
        top_features = tuple(feature_names[i] for i in topic.argsort()[:-top_n - 1:-1])

        # The tuple is the key, so an identical feature set is only stored for its first topic
        unique_topics.setdefault(top_features, idx)

    return unique_topics

# Generate a title for each unique topic
def generate_topic_titles(llm, unique_topics):
    """Ask the LLM for a short title for every unique topic.

    Args:
        llm: Object exposing ``model`` and ``ollama.generate(model=..., prompt=...)``;
            the call must return a mapping with a ``"response"`` string.
        unique_topics: Dict mapping a tuple of feature names to a topic index,
            as returned by ``get_unique_topics``.

    Returns:
        Dict mapping each topic index to the first non-blank line of the LLM
        response, without surrounding whitespace.
    """
    titles = {}

    for features, idx in unique_topics.items():
        # Create a prompt with the top features
        prompt = (
            "You are tasked with creating a concise, descriptive title for a topic derived from text analysis. "
            "The title should reflect the general theme or idea represented by the following features. "
            "Avoid listing all feature names explicitly, and ensure the title is engaging and informative. "
            "Aim for a maximum of 5 words. Here are the features: "
            f"{', '.join(features)}"
        )

        # Use the LLM to generate a title
        response = llm.ollama.generate(model=llm.model, prompt=prompt)
        # Strip before splitting so a response that starts with blank lines
        # does not produce an empty title
        titles[idx] = response["response"].strip().split("\n")[0].strip()
    return titles


# Derive the distinct topics of the fitted LDA model and let the LLM name them
unique_topics = get_unique_topics(lda, vectorizer, top_n=15)
topic_titles = generate_topic_titles(llm, unique_topics)


# Store {title without double quotes: feature tuple} in the analysis results
analyzer.analysis["LDA"] = {
    topic_titles[idx].replace('"', ""): topics
    for topics, idx in unique_topics.items()
}

# Print unique topics and their generated titles
for idx, (title, topics) in enumerate(analyzer.analysis["LDA"].items()):

    print(
        f"Topic {idx} - {title}:",
        f"Features: {', '.join(topics)}",
        sep="\n",
    )

Topic 0 - AI-Driven Medical Threat Analysis & Reporting - Team Response Unveiled:
Features: generate, question, ai tool, step, report, need, threat, medical, raise, team, field, article, explain, fear, accord
Topic 1 - Breaking Boundaries: Google & OpenAI-Fueled Text Analysis Revolution:
Features: say, google, answer, chatbot, billion, arab, openai, search, web, text, tech, base, people, way, program
Topic 2 - Linguistic AI Tool: Large-Scale Predictability in Saudi Context, Trained on Word Examples.:
Features: model, language, ability, ai tool, example, reason, large, time, predictable, word, development, platform, saudi, point, train
Topic 3 - Language Analysis in Job Training: Unveiling Khoury's Language Model Impact on Employee News Response:
Features: language, say, job, training, impact, response, model, training datum, khoury, language model, employee, news, provide, information, source
Topic 4 - Microsoft's Pioneering Think-Powered Life-Enhancing Tool Boosts Realization and Effi

### Apply term frequency–inverse document frequency (TF-IDF)
This model iterates over each document and returns the words that appear often in that document but rarely in the other documents. The top n words are then used to create a topic for every article!

- A high TF-IDF score (listed first when `FROM_LOW_TO_HIGH = False`) indicates that a word is both frequent within a document and rare across all documents.
- A low TF-IDF score suggests that a word is either rare in the document or common across all documents.

By analyzing TF-IDF scores for a set of words, you can identify:
- Important keywords in a document
- Rare or unique words that distinguish one document from another
- Words with varying levels of importance across different documents


In [51]:
# Grid search over TfidfVectorizer settings.
# `TfidfVectorizer` and `product` come from the import cell at the top of the notebook.

# Define the parameter grid
param_grid = {
    'max_df': [0.8, 0.9, 1.0],
    'min_df': [1, 2, 3],
    'ngram_range': [(1, 1), (1, 2), (1, 3)],
    'norm': ['l1', 'l2'],
}

# Initialize variables to store the best combination and vectorizer
best_params = None
best_vectorizer = None
best_vocab_size = 0  # Use vocabulary size as a proxy for quality
# NOTE(review): the largest vocabulary always favours the least restrictive settings,
# and `norm` does not change the vocabulary - confirm this proxy is intended.

# Iterate over all combinations of parameters
for params in product(*param_grid.values()):
    # Map parameter combinations to their names
    params_dict = dict(zip(param_grid.keys(), params))

    # Create and fit the TfidfVectorizer with the current parameters.
    # It gets its own name so the global `vectorizer` (the CountVectorizer that
    # belongs to the fitted LDA model) is not overwritten; otherwise re-running
    # the topic cell would pair the LDA components with the wrong vocabulary.
    candidate_vectorizer = TfidfVectorizer(
        stop_words='english',
        max_df=params_dict['max_df'],
        min_df=params_dict['min_df'],
        ngram_range=params_dict['ngram_range'],
        norm=params_dict['norm']
    )
    candidate_vectorizer.fit(all_content_tokens)

    # Evaluate based on vocabulary size
    vocab_size = len(candidate_vectorizer.get_feature_names_out())
    print(f"Params: {params_dict}, Vocabulary Size: {vocab_size}")

    # Track the best combination (the first one wins on ties)
    if vocab_size > best_vocab_size:
        best_vocab_size = vocab_size
        best_params = params_dict
        best_vectorizer = candidate_vectorizer

# Print the best parameters and corresponding vocabulary size
print("\nBest Parameters:", best_params)
print("Best Vocabulary Size:", best_vocab_size)

# Use the best vectorizer to transform the data
tfidf_matrix = best_vectorizer.transform(all_content_tokens)

Params: {'max_df': 0.8, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l1'}, Vocabulary Size: 1572
Params: {'max_df': 0.8, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2'}, Vocabulary Size: 1572
Params: {'max_df': 0.8, 'min_df': 1, 'ngram_range': (1, 2), 'norm': 'l1'}, Vocabulary Size: 5803
Params: {'max_df': 0.8, 'min_df': 1, 'ngram_range': (1, 2), 'norm': 'l2'}, Vocabulary Size: 5803
Params: {'max_df': 0.8, 'min_df': 1, 'ngram_range': (1, 3), 'norm': 'l1'}, Vocabulary Size: 10428
Params: {'max_df': 0.8, 'min_df': 1, 'ngram_range': (1, 3), 'norm': 'l2'}, Vocabulary Size: 10428
Params: {'max_df': 0.8, 'min_df': 2, 'ngram_range': (1, 1), 'norm': 'l1'}, Vocabulary Size: 586
Params: {'max_df': 0.8, 'min_df': 2, 'ngram_range': (1, 1), 'norm': 'l2'}, Vocabulary Size: 586
Params: {'max_df': 0.8, 'min_df': 2, 'ngram_range': (1, 2), 'norm': 'l1'}, Vocabulary Size: 821
Params: {'max_df': 0.8, 'min_df': 2, 'ngram_range': (1, 2), 'norm': 'l2'}, Vocabulary Size: 821
Params: {'max_df': 0.8, 'min_df'

In [60]:
# Create a TfidfVectorizer object that keeps terms occurring in at least half of the
# documents (but in no more than 80% of them); max(1, ...) keeps min_df a valid
# document count when fewer than two documents are loaded
min_df = max(1, len(analyzer.all_documents) // 2)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', norm="l2", max_df=0.8, min_df=min_df, ngram_range=(1,3))

# Fit and transform the documents into a TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(all_content_tokens)

# Get the feature names (i.e., words)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a DataFrame for better readability
# (one row per document, one column per term)
df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Display the most important words (top N) for each document
TOP_N = 15
FROM_LOW_TO_HIGH = False

analyzer.analysis["TFIDF"] = dict()

for i, row in df.iterrows():
    # Row i of the matrix belongs to document i of the analyzer
    doc = analyzer.all_documents[i]
    print(f"\nTop {TOP_N} terms for Document {i + 1}: <{doc.title}>")

    # The LLM sees twice as many terms as are stored and printed below
    top_terms = row.sort_values(ascending=FROM_LOW_TO_HIGH).head(TOP_N*2)

    indices = top_terms.index
    values = top_terms.values

    # Built outside the f-string: a backslash inside an f-string expression
    # is a SyntaxError on Python versions before 3.12
    feature_lines = "\n".join(f"{str(term)} - {score}" for term, score in zip(indices, values))

    # Create a prompt with the top features (a space now separates the examples from "Features:")
    prompt = (
        "Generate a concise and meaningful title, exactly four words long, that summarizes the following features. "
        "The title should capture the main theme or topic of these features. "
        "Example outputs: 'Language Revolution', 'Shaping Future Technology Trends', 'Global Knowledge Network'. "
        f"Features: {feature_lines}"
        )

    # Use the LLM to generate a title
    title = llm.ollama.generate(model=llm.model, prompt=prompt)["response"]

    # Store title and top terms per document (keyed by the document object)
    stored_terms = top_terms.head(TOP_N)
    analyzer.analysis["TFIDF"][doc] = {"title": title, "terms": stored_terms}

    print(title, "\n", ", ".join(stored_terms.index))


Top 15 terms for Document 1: <ChatGPT AI grows more powerful as we become more predictable>
"AI-Powered Language Model Development: Ethical Implications for Government and Industry" 
 ai tool, language, development, openai, work, change, use, ability, chatgpt ai, produce, model, time, large, set, think

Top 15 terms for Document 2: <ChatGPT is the Netscape moment' for artificial intelligence'>
"Leveraging Saudi Language Modeling: Challenges, Research, and Progress in Natural Language Development" 
 model, example, train, text, saudi, word, generate, language, value, challenge, require, research, base, point, learn

Top 15 terms for Document 3: <ChatGPT outperforms copywriters in STEP Conference's outdoor adverts>
"AI Chatbot: Job Transformation & Global Knowledge Network Expansion" 
 job, chatbot, ai tool, use, need, explain, think, easy, continue, replace human, add, say, make, look, tell

Top 15 terms for Document 4: <AI is not smarter than humans>
"Microsoft's Chatbot: Language Ski

In [61]:
# For every question: collect the per-article answers keyed by article title and
# let the LLM summarize them across all articles. The result is stored in the
# analysis under the question text.
# (The former up-front json.dumps of all article contents was removed: its result
# was overwritten in the first loop iteration and never used.)
for question in questions:
    content = {doc.title: doc.answers.get(question) for doc in analyzer}
    response = llm.answer_question(text=json.dumps(content), question=question, multiple_articles=True)
    analyzer.analysis[question] = response

00:17:32	 Answering question <How do the media in this artic...>
00:17:47	 Answering question <Which role does or might the A...>
00:17:49	 Answering question <Which use cases of Artificial ...>
00:17:59	 Answering question <What is the final message of t...>


In [62]:
# NOTE(review): answers_questions is not used in the visible part of the notebook - confirm it is still needed
answers_questions = dict()

# Question about the coverage of perspectives, asked once over the topics of all articles.
# The last two sentences are now separated by a space and "articles" is spelled correctly.
topic_question_all = (
    "Attached are the topics of every article. "
    "What **perspectives and aspects** are being widely covered? Which aspects are being ignored? "
    "In your answer consider topics such as, but not only, data privacy, costs/affordability, know-how, complexity, accuracy, accessibility, bias (towards age, gender, religion, sexuality), risks, opportunity, perception, limitations. "
    "These are the topics of all articles: ")

# Collect the topic clusters of every article and flatten them to one list per title
topic_clusters = {doc.title: list(doc.topic_clusters.values()) for doc in analyzer}
content_topics = flatten_dict(topic_clusters)
response = llm.answer_question(text=json.dumps(content_topics), question=topic_question_all, multiple_articles=True)
analyzer.analysis["topic_question"] = response

00:18:04	 Answering question <Attached are the topics of eve...>


In [68]:
# Assemble the research summary as markdown, print it and export it to the output folder
markdown = str()

# Stored under its own name so the fitted LDA model in `lda` is not replaced by
# this dict (re-running the topic cell would otherwise fail on `lda.components_`)
lda_topics = analyzer.analysis.get("LDA")
# Joined outside the f-string: a backslash inside an f-string expression
# is a SyntaxError on Python versions before 3.12
lda_bullets = "\n- ".join(lda_topics.keys())
markdown += f"# Latent Dirichlet Allocation interpretation\n### Topics that are frequently mentioned in all articles\n- {lda_bullets}\n\n"


# One generated title per document, in the order the TF-IDF results were stored
tfidf = [elem.get("title") for elem in analyzer.analysis.get("TFIDF").values()]
markdown += "# TF-IDF interpretation for every article\n### Topics that appear in the article very often but not in others\n"

# `doc_index` instead of `id`, which would shadow the builtin.
# NOTE(review): assumes iterating `analyzer` yields the documents in the same order
# as `analyzer.all_documents` used for the TF-IDF results - confirm
for doc_index, doc in enumerate(analyzer):
    markdown += f"- <{doc.title}>: {tfidf[doc_index]}\n"

markdown += "\n# Hypothesis/Questions\n"
for q_id, question in enumerate(questions + ["topic_question"]):
    answer = analyzer.analysis.get(question)
    markdown += f"## Question {q_id+1}\n"
    # Single quotes inside the f-string keep the cell valid before Python 3.12
    markdown += f"*{answer.get('question')}*\n{answer.get('answer')}\nReasoning: {answer.get('reasoning')}\n\n"


print(markdown)

markdown_path = os.path.join(analyzer.output_folder, "research summary.md")
analyzer.export_markdown(markdown_path, markdown)

# Latent Dirichlet Allocation interpretation
### Topics that are frequently mentioned in all articles
- AI-Driven Medical Threat Analysis & Reporting - Team Response Unveiled
- Breaking Boundaries: Google & OpenAI-Fueled Text Analysis Revolution
- Linguistic AI Tool: Large-Scale Predictability in Saudi Context, Trained on Word Examples.
- Language Analysis in Job Training: Unveiling Khoury's Language Model Impact on Employee News Response
- Microsoft's Pioneering Think-Powered Life-Enhancing Tool Boosts Realization and Efficiency

# TF-IDF interpretation for every article
### Topics that appear in the article very often but not in others
- <ChatGPT AI grows more powerful as we become more predictable>: "AI-Powered Language Model Development: Ethical Implications for Government and Industry"
- <ChatGPT is the Netscape moment' for artificial intelligence'>: "Leveraging Saudi Language Modeling: Challenges, Research, and Progress in Natural Language Development"
- <ChatGPT outperforms copy