# Setup Environment and Dependencies
Import required libraries and set up logging. Configure environment variables using python-dotenv.

In [None]:
!pip install -r requirements.txt

In [3]:
import logging
import os
from dotenv import load_dotenv
from pathlib import Path

# Set up logging
# INFO level on the root logger so informational messages are not filtered out.
logging.basicConfig(level=logging.INFO)

# Load environment variables from .env file
# This call is the cell's last expression, so its return value is what the notebook displays.
load_dotenv()

True

# Define KYC Schema and Analyzers
Define the KYC common fields schema and create text and conversation analyzer configurations.

In [4]:
# KYC fields shared by the text analyzer and the conversation analyzer.
# Each entry is built with a dict(...) keyword call; keys keep the order in
# which the keywords are written.
KYC_COMMON_FIELDS = [
    dict(name="full_name", type="string", description="Customer's full name"),
    dict(
        name="birthdate",
        type="string",
        description="Date of birth extracted from the text",
    ),
    dict(name="nationality", type="string", description="Nationality or citizenship"),
    dict(
        name="affiliations",
        type="array",
        # Every affiliation is an object with three string properties.
        items=dict(
            type="object",
            properties={
                label: dict(type="string", description=text)
                for label, text in (
                    ("Company", "Company or organization name"),
                    ("Position", "Position or title"),
                    ("EntryYear", "Year of entry"),
                )
            },
        ),
        description="Companies, organizations, or boards the individual is associated with",
    ),
    dict(
        name="legal_issues",
        type="array",
        description="Legal controversies, lawsuits, SEC actions, government investigations",
    ),
    dict(
        name="political_exposure",
        type="string",
        description="A short indicator if the person is politically exposed (PEP) or not (true/false)",
    ),
    dict(
        name="summary",
        type="string",
        description="Short summary or gist of the text/audio content",
    ),
]

# Analyzer definition for text documents; shares the common field list.
TEXT_ANALYZER_SCHEMA = dict(
    name="fsi-kyc-text-analyzer",
    description="KYC Analyzer for extracting critical fields from text documents",
    scenario="text",
    fields=KYC_COMMON_FIELDS,
)

# Analyzer definition for audio conversations; shares the same field list object.
# NOTE(review): "anaylzer" in the name looks like a typo of "analyzer". It is
# reproduced unchanged because the name is used as the analyzer identifier.
CONVERSATION_ANALYZER_SCHEMA = dict(
    name="fsi-kyc-audio-anaylzer",
    description="KYC Analyzer for extracting critical fields from audio conversations",
    scenario="conversation",
    fields=KYC_COMMON_FIELDS,
)

# Initialize Azure Client
Initialize the Azure Content Understanding client and create both text and conversation analyzers.

In [5]:
from content_understanding import ContentUnderstandingClient

# Create the Azure Content Understanding client
client = ContentUnderstandingClient()

# Register the text analyzer first, then the conversation analyzer
text_analyzer_info = client.create_analyzer(TEXT_ANALYZER_SCHEMA)
conversation_analyzer_info = client.create_analyzer(CONVERSATION_ANALYZER_SCHEMA)

# Final expression: the conversation analyzer's creation result is what the cell displays
conversation_analyzer_info

{'analyzerId': 'fsi-kyc-audio-anaylzer',
 'description': 'KYC Analyzer for extracting critical fields from audio conversations',
 'createdAt': '2025-02-22T11:21:27Z',
 'lastModifiedAt': '2025-02-22T11:21:27Z',
 'config': {'returnDetails': True, 'disableContentFiltering': False},
 'fieldSchema': {'fields': {'full_name': {'type': 'string',
    'method': 'generate',
    'description': "Customer's full name"},
   'birthdate': {'type': 'string',
    'method': 'generate',
    'description': 'Date of birth extracted from the text'},
   'nationality': {'type': 'string',
    'method': 'generate',
    'description': 'Nationality or citizenship'},
   'affiliations': {'type': 'array',
    'description': 'Companies, organizations, or boards the individual is associated with',
    'items': {'type': 'object',
     'properties': {'Company': {'type': 'string'},
      'Position': {'type': 'string'},
      'EntryYear': {'type': 'string'}}}},
   'legal_issues': {'type': 'array',
    'description': 'Legal 

# Process Text Content
Set up the list of people to screen (three tech leaders and one politician) and run the Wikipedia content extraction and analysis loop.

In [6]:
import pandas as pd
from utils import get_wikipedia_content, extract_info_from_result

# Rows for the results table: one entry per person that was analyzed successfully
results = []
people = [
    "Satya Nadella",
    "Mustafa Suleyman",
    "Sam Altman",
    "Alain Berset"
]

for person in people:
    content = get_wikipedia_content(person)
    if not content:
        # Report the gap instead of silently dropping the person from the table
        logging.warning("No Wikipedia content retrieved for %s; skipping", person)
        continue

    analysis_result = client.analyze_content(TEXT_ANALYZER_SCHEMA["name"], content)
    extracted = extract_info_from_result(analysis_result)
    if not extracted:
        logging.warning("No KYC fields extracted for %s; skipping", person)
        continue

    results.append(extracted)

# Create a DataFrame from the results
df_results = pd.DataFrame(results)
df_results


Unnamed: 0,Full Name,DOB,Nationality,Affiliations,Legal Issues,PEP?,Summary
0,Satya Narayana Nadella,19 August 1967,American,"- Microsoft: CEO (2014), - Microsoft: Chairman...",None reported,False,Satya Nadella is an American business executiv...
1,Mustafa Suleyman,August 1984,British,"- Microsoft AI: CEO (Unknown), - DeepMind: Co-...",None reported,False,"Mustafa Suleyman is a British AI entrepreneur,..."
2,Samuel Harris Altman,"April 22, 1985",American,"- OpenAI: Chief Executive Officer (2019), - Ok...",None reported,False,Samuel Harris Altman is an American entreprene...
3,Alain Berset,9 April 1972,Swiss,"- Swiss Federal Council: Member (2012), - Fede...",None reported,True,Alain Berset is a Swiss politician who served ...


# Process Audio Content 
Download and process the audio interview file using the conversation analyzer.

In [7]:
from utils import download_audio, extract_info_from_result
import logging
from pathlib import Path


# NOTE(review): not read anywhere in this cell; kept in case another cell uses it.
audio_people = [
    "Sal Khan"
]

# URL of Sal Khan's audio interview
audio_url = "https://traffic.libsyn.com/secure/force-cdn/highwinds/behindthetech/127299_BTT_Series_2024_Khan_v1.1_MIXEDAudio_-16LKFS-128MONO.mp3?stats-code=EP66-Sal-Khan"
audio_path = Path("audio_files/sal_khan_interview.mp3")

# Work on a copy of the text results instead of appending to `results` itself,
# so re-running this cell never adds the same interview twice.
combined_results = list(results)

# Download the audio
if download_audio(audio_url, audio_path):
    with open(audio_path, "rb") as f:
        audio_result = client.analyze_content(CONVERSATION_ANALYZER_SCHEMA["name"], f)

        # Now extract the info exactly as we do for text
        extracted_audio_info = extract_info_from_result(audio_result)

    if extracted_audio_info:
        combined_results.append(extracted_audio_info)
    else:
        logging.warning("No KYC fields extracted from %s; skipping", audio_path)
else:
    # Make a failed download visible rather than silently showing text-only results
    logging.warning("Could not download audio from %s; skipping audio analysis", audio_url)

# Create a DataFrame from the text results plus the audio result (if any)
df_results = pd.DataFrame(combined_results)
df_results

Unnamed: 0,Full Name,DOB,Nationality,Affiliations,Legal Issues,PEP?,Summary
0,Satya Narayana Nadella,19 August 1967,American,"- Microsoft: CEO (2014), - Microsoft: Chairman...",None reported,False,Satya Nadella is an American business executiv...
1,Mustafa Suleyman,August 1984,British,"- Microsoft AI: CEO (Unknown), - DeepMind: Co-...",None reported,False,"Mustafa Suleyman is a British AI entrepreneur,..."
2,Samuel Harris Altman,"April 22, 1985",American,"- OpenAI: Chief Executive Officer (2019), - Ok...",None reported,False,Samuel Harris Altman is an American entreprene...
3,Alain Berset,9 April 1972,Swiss,"- Swiss Federal Council: Member (2012), - Fede...",None reported,True,Alain Berset is a Swiss politician who served ...
4,Sal Khan,,,"- Khan Academy: Founder and CEO (Unknown), - M...",None reported,False,"Sal Khan, founder and CEO of Khan Academy, dis..."


# Process PDF / Video Files 

👉 PDF & video files work similarly, see: https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/document/overview (PDF / documents) & https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/video/overview (video)

# Write data to JSONL

In [9]:
import json

def _is_missing(value):
    """Return True for None, empty values, and the float NaN used for absent cells."""
    # NaN is truthy, so a plain `value or default` would let it through;
    # it is also the only float that is not equal to itself.
    if isinstance(value, float) and value != value:
        return True
    return not value


def _parse_affiliations(raw):
    """Convert a "- Company: Role (Year), - ..." string into a list of dicts.

    Entries without a ": " separator are skipped. A non-string input (for
    example a missing cell) yields an empty list instead of raising.
    """
    if not isinstance(raw, str):
        return []

    parsed = []
    for entry in raw.split(", -"):
        entry = entry.strip("- ")
        company, separator, remainder = entry.partition(": ")
        if not separator:
            continue

        role, year = remainder.strip(), "Unknown"
        # Take the year from the LAST parenthesised group so that parentheses
        # inside the company or role text do not corrupt the result.
        if role.endswith(")") and "(" in role:
            role, _paren, year_text = role[:-1].rpartition("(")
            role = role.rstrip()
            year = year_text.strip() or "Unknown"

        parsed.append({"company": company, "role": role, "year": year})
    return parsed


def _row_to_record(row):
    """Build the JSON-serialisable KYC record for one row of the results table."""
    legal_issues = row["Legal Issues"]
    summary = row["Summary"]
    return {
        "full_name": row["Full Name"],
        "birth_date": "Unknown" if _is_missing(row["DOB"]) else row["DOB"],
        "nationality": "Unknown" if _is_missing(row["Nationality"]) else row["Nationality"],
        "affiliations": _parse_affiliations(row["Affiliations"]),
        "legal_issues": (
            []
            if _is_missing(legal_issues) or legal_issues == "None reported"
            else [legal_issues]
        ),
        # str() first: the flag may be a real bool or the text "True"/"False".
        "political_exposure": str(row["PEP?"]).strip().lower() == "true",
        "summary": "" if _is_missing(summary) else summary,
    }


# One JSON object per line (JSON Lines), keeping non-ASCII characters readable.
with open("../../data/kyc_results.jsonl", "w", encoding="utf-8") as f:
    for _, row in df_results.iterrows():
        f.write(json.dumps(_row_to_record(row), ensure_ascii=False) + "\n")
