#Set up the API

In [None]:
# Installing the required packages
%pip install -U openai pydantic

In [2]:
# Set environment variables and create openai client

import os
from openai import AzureOpenAI
from google.colab import userdata # To get the secret keys

# Deployment name in azure openai studio
gpt_model = "gpt-4o-mini"  # ex. gpt-4o-mini

client = AzureOpenAI(
    api_key=userdata.get('AZURE_OPENAI_API_KEY'),
    api_version="2023-03-15-preview",
    azure_endpoint=userdata.get('AZURE_OPENAI_ENDPOINT'),
)

In [None]:
# Test that everything is working
messages = [{"role": "user", "content": "Hello, ready for some AI analysis?"}]
response = client.chat.completions.create(model=gpt_model, messages=messages, max_tokens=50)

print("Response: \n", response.choices[0].message.content)

# Expected respnse should be something like:
# Response:
#  Absolutely! What do you need analyzed?

#Food-debate


In [None]:
import json
from openai.types.chat import ChatCompletionUserMessageParam, ChatCompletionSystemMessageParam

path_to_file = "Test.txt"

# Read the data from the file
with open(path_to_file, "r", encoding="UTF-8") as file:
    data = file.read()

messages = [
    ChatCompletionSystemMessageParam(
        role="system",
        content="""
            You are an expert in language analysis and interaction analysis in discussions, with a focus on identifying all unique speakers in dialogue.
Your task is to analyze the entire text provided and:

1. Identify and label each unique speaker based on their speech patterns, contextual cues, or dialogue markers.
2. Assign a unique identifier to each speaker (e.g., "Speaker 1," "Speaker 2," etc.) and determine their gender (M/F) when possible based on context.
3. Include all participants present in the text and ensure no speaker is omitted. Do not limit the identification to a predefined number of examples.
4. Look for shifts in tone, specific language usage, or structural cues in the dialogue to differentiate speakers.
3. Examine Interaction Styles:
            - Detailed Tone Categorization: Break down the tone analysis into subcategories, such as empathetic, argumentative, defensive, or conciliatory, to gain a more comprehensive understanding of the interaction style.
            - Emotion Detection: Incorporate sentiment analysis tools to detect emotions like frustration, enthusiasm, or sarcasm. This helps capture subtle nuances in interactions that extend beyond simple supportive or critical categories.
            - Power Dynamics: Analyze indications of power imbalances, such as frequent interruptions, speaking over others, or dismissive responses.
4. Use an analysis model: Analyze how gender patterns may influence treatment in interactions, while controlling for other variables.

Analyze the interactions between all participants and categorize them into gender-based combinations (M to M, F to M, F to F, M to F). Ensure the output is presented in JSON format as follows:
{
    "episode_name": "<generated_name>",
    "participants": [
        {
            "role": "<role>",
            "gender": "<M/F>",
            "background": "(party affiliation, organization, etc.)"
        }
    ],
    "interaction_analysis": {
        "M_to_M": {
            "summary": "<summary>",
            "score": <number>
        },
        "F_to_F": {
            "summary": "<summary>",
            "score": <number>
        },
        "F_to_M": {
            "summary": "<summary>",
            "score": <number>
        },
        "M_to_F": {
            "summary": "<summary>",
            "score": <number>
        }
    },
    "overall_trends": "<description>",
    "final_score": <number>
}
""",
    ),
    ChatCompletionUserMessageParam(
        role="user",
        content=f"""
            Analyze the text below and categorize participants as “debater 1,” “debater 2,” “expert 1,” etc.
            Evaluate how they interact with each other in all gender combinations (M to M, F to M, F to F, M to F).
            Include all participants and do not include the moderator in the analysis.
            In English only.

            <data>{data}</data>

        Response:
        """,
    ),
]

completion = client.chat.completions.create(
    model="gpt-4o-mini",  # Replace with your actual model name
    messages=messages,
    temperature=0,
    response_format={"type": "json_object"},  # Guarantees that the response is in JSON format
)

assert completion.choices[0].message.content is not None

# Print and parse the JSON response
print(completion.choices[0].message.content)

analysis_result = json.loads(completion.choices[0].message.content)
print("Overall score for analysis:", analysis_result["final_score"])


#Political Debate


In [126]:
import json
from openai.types.chat import ChatCompletionUserMessageParam, ChatCompletionSystemMessageParam

path_to_file = "Test.vtt"

# Read the data from the file
with open(path_to_file, "r", encoding="UTF-8") as file:
    data = file.read()

messages = [
    ChatCompletionSystemMessageParam(
        role="system",
        content="""
            You are an expert in language analysis and interaction analysis in discussions, with a focus on identifying all unique speakers in dialogue.
Your task is to analyze the entire text provided and:

1. Identify and label each unique speaker based on their speech patterns, contextual cues, or dialogue markers.
2. Assign a unique identifier to each speaker (e.g., "Speaker 1," "Speaker 2," etc.) and determine their gender (M/F) when possible based on context.
3. Include all participants present in the text and ensure no speaker is omitted. Do not limit the identification to a predefined number of examples.
4. Look for shifts in tone, specific language usage, or structural cues in the dialogue to differentiate speakers.
5. Comprehensive Interaction Analysis: Evaluate and score the interactions between all debaters, not just a sample. This analysis should include every interaction and response in the provided text to ensure an accurate and holistic assessment of how each participant interacts with others.
  Scoring Categories:
    -Argument Strength: Rate the clarity, relevance, and persuasiveness of each participant’s arguments throughout the discussion.
    -Engagement Style: Assess interaction qualities such as empathy, assertiveness, defensiveness, or conciliation as demonstrated across all dialogue.
    -Tone Dynamics: Identify variations in tone such as supportive, critical, sarcastic, or enthusiastic exchanges, considering the entire range of interactions.
    -Power Dynamics: Score power displays or imbalances, such as interruptions, speaking over others, or dismissive responses, over the entire debate.
6. Use an analysis model: Analyze how gender patterns may influence treatment in interactions, while controlling for other variables.
7. Final Scoring: Combine scores across all categories for each participant to assign a final score reflecting their overall performance and engagement style in the full debate.

Analyze the interactions between all participants and categorize them into gender-based combinations (M to M, F to M, F to F, M to F). Ensure the output is presented in JSON format as follows:
{
    "episode_name": "<generated_name>",
    "participants": [
        {
            "role": "<role>",
            "gender": "<M/F>",
            "background": "(party affiliation, organization, etc.)"
        }
    ],
    "interaction_analysis": {
        "M_to_M": {
            "summary": "<summary>",
            "score": <number>
        },
        "F_to_F": {
            "summary": "<summary>",
            "score": <number>
        },
        "F_to_M": {
            "summary": "<summary>",
            "score": <number>
        },
        "M_to_F": {
            "summary": "<summary>",
            "score": <number>
        }
    },
    "overall_trends": "<description>",
    "final_score": <number>
}
""",
    ),
    ChatCompletionUserMessageParam(
        role="user",
        content=f"""
            Analyze the text below and categorize participants as “debater 1,” “debater 2,” “expert 1,” etc not their real name.
            Evaluate how they interact with each other in all gender combinations (M to M, F to M, F to F, M to F).
            Include all participants and do not include the moderator in the analysis.
            In English only.

            <data>{data}</data>

        Response:
        """,
    ),
]

completion = client.chat.completions.create(
    model="gpt-4o-mini",  # Replace with your actual model name
    messages=messages,
    temperature=0.5,
    response_format={"type": "json_object"},  # Guarantees that the response is in JSON format
)

assert completion.choices[0].message.content is not None

# Print and parse the JSON response
print(completion.choices[0].message.content)

analysis_result = json.loads(completion.choices[0].message.content)
print("Overall score for analysis:", analysis_result["final_score"])


{
    "episode_name": "Norwegian Political Debate Analysis",
    "participants": [
        {
            "role": "Debater 1",
            "gender": "M",
            "background": "Leader of the Conservative Party"
        },
        {
            "role": "Debater 2",
            "gender": "F",
            "background": "Leader of the Labour Party"
        },
        {
            "role": "Debater 3",
            "gender": "F",
            "background": "Leader of the Socialist Left Party"
        },
        {
            "role": "Debater 4",
            "gender": "M",
            "background": "Leader of the Progress Party"
        },
        {
            "role": "Debater 5",
            "gender": "F",
            "background": "Leader of the Centre Party"
        },
        {
            "role": "Debater 6",
            "gender": "F",
            "background": "Leader of the Christian Democratic Party"
        }
    ],
    "interaction_analysis": {
        "M_to_M": {
            "su

In [66]:
# Konverter interaction_analysis til en DataFrame og legg til episode-navn
df_interaction_analysis = pd.DataFrame.from_dict(analysis_result["interaction_analysis"], orient='index').reset_index()
df_interaction_analysis.columns = ['interaction_type', 'summary', 'score']
df_interaction_analysis['episode_name'] = analysis_result["episode_name"]

# Vis DataFrame
df_interaction_analysis

Unnamed: 0,interaction_type,summary,score,episode_name
0,M_to_M,Interactions between male participants were ge...,7,Debatten on Rising Food Prices
1,F_to_F,Female participants engaged in collaborative d...,8,Debatten on Rising Food Prices
2,F_to_M,"Female participants often led discussions, wit...",6,Debatten on Rising Food Prices
3,M_to_F,Male participants showed respect and attentive...,7,Debatten on Rising Food Prices


In [67]:
# Lag en matrise med poeng for interaksjoner der radene og kolonnene representerer kjønn (M og F)
gender_matrix = pd.DataFrame(index=["M", "F"], columns=["M", "F"])

# Fyll ut matrisen basert på interaksjonstype i DataFrame
for _, row in df_interaction_analysis.iterrows():
    if row['interaction_type'] == "M_to_M":
        gender_matrix.at["M", "M"] = row['score']
    elif row['interaction_type'] == "F_to_F":
        gender_matrix.at["F", "F"] = row['score']
    elif row['interaction_type'] == "F_to_M":
        gender_matrix.at["F", "M"] = row['score']
    elif row['interaction_type'] == "M_to_F":
        gender_matrix.at["M", "F"] = row['score']

# Vis matrisen
gender_matrix


Unnamed: 0,M,F
M,7,7
F,6,8


#Bigdata

In [108]:
import requests
import yt_dlp
import os

# Base URL for NRK's API
base_url = "https://psapi.nrk.no"
series_url = f"{base_url}/tv/catalog/series/debatten"

# Antall episoder vi ønsker å hente, justert for å ignorere første
target_episodes = 9
episodes_needed = target_episodes + 1  # Hent 10 episoder, men ignorer den første
episodes_fetched = 0
page = 1

try:
    all_episodes = []

    # Hent episoder fra API-et
    while len(all_episodes) < episodes_needed:
        response = requests.get(series_url, params={"page": page})
        response.raise_for_status()
        series_data = response.json()

        if '_embedded' in series_data and 'instalments' in series_data['_embedded']:
            instalment_data = series_data['_embedded']['instalments']
            if '_embedded' in instalment_data and 'instalments' in instalment_data['_embedded']:
                episodes = instalment_data['_embedded']['instalments']
                all_episodes.extend(episodes)

                # Sjekk om det er flere sider
                if '_links' in instalment_data and 'next' in instalment_data['_links']:
                    page += 1
                else:
                    break  # Ingen flere sider
            else:
                break  # Ingen flere episoder funnet
        else:
            break  # Ingen flere episoder funnet

    # Fjern den første episoden og samle URL-ene for resten
    all_episodes = all_episodes[1:episodes_needed]  # Ignorer den første episoden

    # Samle URL-ene i en liste kalt URL
    URL = [f'https://tv.nrk.no/se?v={episode["prfId"]}' for episode in all_episodes]

    # Print en melding som viser hvor mange episoder som er hentet
    print(f"Episodes of Debatten (totalt {len(URL)}):")

    # Funksjon for å laste ned undertekster
    def download_subtitles(url, download_path):
        ydl_opts = {
            'writesubtitles': True,
            'subtitleslangs': ['nb-ttv'],  # Norsk bokmål TV-teksting
            'skip_download': True,  # Hopp over nedlasting av video
            'outtmpl': os.path.join(download_path, '%(title)s.%(ext)s'),
            'quiet': True,  # Reduserer mengden output til et minimum
            'no_warnings': True,  # Skjuler advarsler
            'postprocessors': [{
                'key': 'FFmpegSubtitlesConvertor',
                'format': 'srt',
            }],
            'logger': None,  # Fjerner logger-objektet for mindre støy
        }

        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
            print(f"Subtitles downloaded for URL: {url}")
            return True  # Returner True hvis nedlasting var vellykket
        except Exception as e:
            print(f"En feil oppstod for {url}: {e}")
            return False  # Returner False hvis det oppstod en feil

    # Opprett mappe for lagring av undertekster
    download_path = os.path.join(os.getcwd(), "Debatten_subtitles")
    os.makedirs(download_path, exist_ok=True)

    # Last ned undertekster for hver URL i listen
    valid_urls = []
    for i, url in enumerate(URL):
        success = download_subtitles(url, download_path)
        if not success:
            print("Prøver neste tilgjengelige episode...")
            continue  # Gå videre til neste episode
        valid_urls.append(url)

    if len(valid_urls) < target_episodes:
        print(f"Kun {len(valid_urls)} episoder ble lastet ned vellykket.")
    else:
        print("All episodes fetched sucsessfully.")

except requests.RequestException as e:
    print(f"En feil oppstod ved henting av data: {e}")


Episodes of Debatten (totalt 9):
Subtitles downloaded for URL: https://tv.nrk.no/se?v=NNFA51103124
Subtitles downloaded for URL: https://tv.nrk.no/se?v=NNFA51102924
Subtitles downloaded for URL: https://tv.nrk.no/se?v=NNFA51102424
Subtitles downloaded for URL: https://tv.nrk.no/se?v=NNFA51102224
Subtitles downloaded for URL: https://tv.nrk.no/se?v=NNFA51101724
Subtitles downloaded for URL: https://tv.nrk.no/se?v=NNFA51101524
Subtitles downloaded for URL: https://tv.nrk.no/se?v=NNFA51101024
Subtitles downloaded for URL: https://tv.nrk.no/se?v=NNFA51100824
Subtitles downloaded for URL: https://tv.nrk.no/se?v=NNFA51100324
All episodes fetched sucsessfully.


In [132]:
import os
import json
import openai


# Katalog som inneholder VTT-filer
directory_path = "Debatten_subtitles"

# Liste for å lagre analyseresultater for alle filer
analysis_results = []

# Iterer gjennom alle VTT-filer i katalogen
for filename in os.listdir(directory_path):
    if filename.endswith(".vtt"):
        file_path = os.path.join(directory_path, filename)

        # Les data fra VTT-filen
        with open(file_path, "r", encoding="UTF-8") as file:
            data = file.read()

        # Opprett meldingene for forespørselen
        messages = [
            {
                "role": "system",
                "content": """
                    You are an expert in language analysis and interaction analysis in discussions, with a focus on identifying all unique speakers in dialogue.
                    Your task is to analyze the entire text provided and:

                    1. Identify and label each unique speaker based on their speech patterns, contextual cues, or dialogue markers.
                    2. Assign a unique identifier to each speaker (e.g., "Speaker 1," "Speaker 2," etc.) and determine their gender (M/F) when possible based on context.
                    3. Include all participants present in the text and ensure no speaker is omitted. Do not limit the identification to a predefined number of examples.
                    4. Look for shifts in tone, specific language usage, or structural cues in the dialogue to differentiate speakers.
                    5. Comprehensive Interaction Analysis: Evaluate and score the interactions between all debaters, not just a sample. This analysis should include every interaction and response in the provided text to ensure an accurate and holistic assessment of how each participant interacts with others.
                      Scoring Categories:
                        -Argument Strength: Rate the clarity, relevance, and persuasiveness of each participant’s arguments throughout the discussion.
                        -Engagement Style: Assess interaction qualities such as empathy, assertiveness, defensiveness, or conciliation as demonstrated across all dialogue.
                        -Tone Dynamics: Identify variations in tone such as supportive, critical, sarcastic, or enthusiastic exchanges, considering the entire range of interactions.
                        -Power Dynamics: Score power displays or imbalances, such as interruptions, speaking over others, or dismissive responses, over the entire debate.
                    6. Use an analysis model: Analyze how gender patterns may influence treatment in interactions, while controlling for other variables.
                    7. Final Scoring: Combine scores across all categories for each participant to assign a final score reflecting their overall performance and engagement style in the full debate.
                    Analyze the interactions between all participants and categorize them into gender-based combinations (M to M, F to M, F to F, M to F). Ensure the output is presented in JSON format as follows:
                    {
                        "episode_name": "<generated_name>",
                        "participants": [
                            {
                                "role": "<role>",
                                "gender": "<M/F>",
                                "background": "(party affiliation, organization, etc.)"
                            }
                        ],
                        "interaction_analysis": {
                            "M_to_M": {
                                "summary": "<summary>",
                                "score": <number>
                            },
                            "F_to_F": {
                                "summary": "<summary>",
                                "score": <number>
                            },
                            "F_to_M": {
                                "summary": "<summary>",
                                "score": <number>
                            },
                            "M_to_F": {
                                "summary": "<summary>",
                                "score": <number>
                            }
                        },
                        "overall_trends": "<description>",
                        "final_score": <number>
                    }
                """
            },
            {
                "role": "user",
                "content": f"""
                    Analyze the text below and categorize participants as “debater 1,” “debater 2,” “expert 1,” etc not their real name.
                    Evaluate how they interact with each other in all gender combinations (M to M, F to M, F to F, M to F).
                    Include all participants and do not include the moderator in the analysis.
                    In English only.

                    <data>{data}</data>

                    Response:
                """
            }
        ]

        # Forespørsel til OpenAI API
        completion = client.chat.completions.create(
        model="gpt-4o-mini",  # Replace with your actual model name
        messages=messages,
        temperature=0.5,
        response_format={"type": "json_object"},  # Guarantees that the response is in JSON format
)

        # Analyser og lagre resultatet
        if completion.choices[0].message.content:
            analysis_result = json.loads(completion.choices[0].message.content)
            analysis_result["episode_name"] = filename.replace(".vtt", "")  # Legg til episodenavn
            analysis_results.append(analysis_result)

            # Print melding når analysen er ferdig for en episode
            print(f"Finished analyzing: {filename}")

# Print den endelige JSON-variabelen som inneholder alle resultater
final_json_result = json.dumps(analysis_results, indent=4)
print(final_json_result)


Finished analyzing: Debatten - 8. okt. · Tidenes budsjett, men misnøyen vokser.nb-ttv.vtt
Finished analyzing: Debatten - 17. okt. · Slik kan du spare på maten.nb-ttv.vtt
Finished analyzing: Debatten - 24. okt. · Unge sender Ap mot sperregrensa.nb-ttv.vtt
Finished analyzing: Debatten - Torsdag · Milliarder til norske batterier.nb-ttv.vtt
Finished analyzing: Debatten - 22. okt. · Milliardene flyr i havvinden.nb-ttv.vtt
Finished analyzing: Debatten - 10. okt. · Bløffebudsjett, mener opposisjonen.nb-ttv.vtt
Finished analyzing: Debatten - 15. okt. · Skoler legges ned, bygder dør.nb-ttv.vtt
Finished analyzing: Debatten - 3. okt. · Hvorfor bomber de hverandre？.nb-ttv.vtt
Finished analyzing: Debatten - 29. okt. · Du blir lurt.nb-ttv.vtt
[
    {
        "episode_name": "Debatten - 8. okt. \u00b7 Tidenes budsjett, men misn\u00f8yen vokser.nb-ttv",
        "participants": [
            {
                "role": "Debater 1",
                "gender": "M",
                "background": "Political e

##Make to DF

In [133]:
import pandas as pd

# Convert the JSON results into DataFrames
scores_data = []
participants_data = []

for result in analysis_results:
    episode_name = result["episode_name"]

    # Add score data
    for interaction, details in result["interaction_analysis"].items():
        scores_data.append({
            "episode_name": episode_name,
            "interaction_type": interaction,
            "summary": details["summary"],
            "score": details["score"]
        })

    # Add participant data
    for participant in result["participants"]:
        participants_data.append({
            "episode_name": episode_name,
            "role": participant["role"],
            "gender": participant["gender"],
            "background": participant["background"]
        })

# Create DataFrames
scores_df = pd.DataFrame(scores_data)
participants_df = pd.DataFrame(participants_data)
# Display DataFrames using pandas display method
print("Scores DataFrame:")
display(scores_df)

print("\nParticipants DataFrame:")
display(participants_df)



Scores DataFrame:


Unnamed: 0,episode_name,interaction_type,summary,score
0,"Debatten - 8. okt. · Tidenes budsjett, men mis...",M_to_M,The male debaters engage in critical exchanges...,7
1,"Debatten - 8. okt. · Tidenes budsjett, men mis...",F_to_F,The female experts engage in supportive dialog...,8
2,"Debatten - 8. okt. · Tidenes budsjett, men mis...",F_to_M,The female experts present their arguments cle...,7
3,"Debatten - 8. okt. · Tidenes budsjett, men mis...",M_to_F,The male debaters often interrupt or dominate ...,6
4,Debatten - 17. okt. · Slik kan du spare på mat...,M_to_M,Interactions between male participants were ch...,7
5,Debatten - 17. okt. · Slik kan du spare på mat...,F_to_F,Female participants engaged in supportive dial...,8
6,Debatten - 17. okt. · Slik kan du spare på mat...,F_to_M,Interactions between female and male participa...,7
7,Debatten - 17. okt. · Slik kan du spare på mat...,M_to_F,Male participants provided encouragement and v...,8
8,Debatten - 24. okt. · Unge sender Ap mot sperr...,M_to_M,Debater 1 and Debater 2 engage in a back-and-f...,7
9,Debatten - 24. okt. · Unge sender Ap mot sperr...,F_to_F,No female participants are present in this deb...,0



Participants DataFrame:


Unnamed: 0,episode_name,role,gender,background
0,"Debatten - 8. okt. · Tidenes budsjett, men mis...",Debater 1,M,"Political editor, Swedish newspaper"
1,"Debatten - 8. okt. · Tidenes budsjett, men mis...",Expert 1,F,"Professor, Institute of Economics, BI"
2,"Debatten - 8. okt. · Tidenes budsjett, men mis...",Expert 2,M,"Professor of Economics, UIS"
3,"Debatten - 8. okt. · Tidenes budsjett, men mis...",Expert 3,M,"Professor of Economic History, UiO"
4,"Debatten - 8. okt. · Tidenes budsjett, men mis...",Expert 4,F,"Senior researcher, Folkehelseinstituttet and UiO"
...,...,...,...,...
71,Debatten - 29. okt. · Du blir lurt.nb-ttv,Expert 3,F,Director of Economic Crime at Sparebank 1
72,Debatten - 29. okt. · Du blir lurt.nb-ttv,Expert 4,M,Leader of Financial Cyber Crime Center at DNB
73,Debatten - 29. okt. · Du blir lurt.nb-ttv,Debater 3,F,Victim of home visit fraud
74,Debatten - 29. okt. · Du blir lurt.nb-ttv,Expert 5,M,Security manager at Vadla Trygghetsbyrå AS


##Save as xlsx

In [134]:
# Save the DataFrames as separate Excel files in /content/ directory
scores_output_path = '/content/scores_analysis_results.xlsx'
participants_output_path = '/content/participants_analysis_results.xlsx'

# Save each DataFrame to an Excel file
scores_df.to_excel(scores_output_path, index=False)
participants_df.to_excel(participants_output_path, index=False)

(scores_output_path, participants_output_path)


('/content/scores_analysis_results.xlsx',
 '/content/participants_analysis_results.xlsx')