In [None]:
# 1. IMPORT LIBRARIES
import requests
import json
import csv
import pandas as pd

#### The portal (https://transparencia.sns.gov.pt) is an Opendatasoft Explore portal.
- Dataset1 intervencoes-cirurgicas (number of surgeries, by hospital) is hosted on the Transparência SNS portal under <domain> transparencia.sns.gov.pt 

In [None]:
# 2.1 USE Records API 1.0 TO RETRIEVE DATA
# dataset1: "intervencoes-cirurgicas"

BASE_URL = "https://transparencia.sns.gov.pt/api/records/1.0/search/"  
ROWS = 1000
start = 0
all_records = []

while True:
    params = {
        "dataset": "intervencoes-cirurgicas",  
        "rows": ROWS,                           
        "start": start                          
    }
    response = requests.get(BASE_URL, params=params)  
    response.raise_for_status()
    data = response.json()
    
    records = data.get("records", [])
    if not records:
        break
    
    all_records.extend(records)
    print(f"Fetched {len(records)} records, total so far: {len(all_records)}")
    
    start += ROWS

# 3.1 SAVE TO CSV
if all_records:
    fieldnames = all_records[0]["fields"].keys()  
    
    with open("intervencoes_cirurgicas_api.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for rec in all_records:
            writer.writerow(rec["fields"])

print("Data saved to intervencoes_cirurgicas_api.csv")


Fetched 1000 records, total so far: 1000
Fetched 1000 records, total so far: 2000
Fetched 1000 records, total so far: 3000
Fetched 1000 records, total so far: 4000
Fetched 1000 records, total so far: 5000
Fetched 1000 records, total so far: 6000
Fetched 999 records, total so far: 6999
Data saved to intervencoes_cirurgicas_api.csv


In [None]:
# 2.2 USE Records API 1.0 TO RETRIEVE DATA
# dataset2: "inscritos-em-lic-dentro-do-tmrg-180-dias@spms"  

BASE_URL = "https://transparencia.sns.gov.pt/api/records/1.0/search/"
DATASET_ID = "inscritos-em-lic-dentro-do-tmrg-180-dias@spms" 
ROWS = 1000  
start = 0
all_records = []

while True:
    params = {
        "dataset": DATASET_ID,
        "rows": ROWS,
        "start": start
    }
    response = requests.get(BASE_URL, params=params)
    response.raise_for_status()
    data = response.json()
    
    records = data.get("records", [])
    if not records:
        break
    
    all_records.extend(records)
    print(f"Fetched {len(records)} records, total so far: {len(all_records)}")
    
    start += ROWS

print("Total records fetched:", len(all_records))

# 3.2 SAVE TO CSV
if all_records:
    fieldnames = all_records[0]["fields"].keys()
    with open("inscritos_lic_tmrg_180_api.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for rec in all_records:
            writer.writerow(rec["fields"])

print("Saved to inscritos_lic_tmrg_180_api.csv")


Fetched 1000 records, total so far: 1000
Fetched 1000 records, total so far: 2000
Fetched 1000 records, total so far: 3000
Fetched 1000 records, total so far: 4000
Fetched 215 records, total so far: 4215
Total records fetched: 4215
Saved to inscritos_lic_tmrg_180_api.csv


In [None]:
# 4. LOADS THE CSV INTO A DATAFRAME
df_surgeries = pd.read_csv("intervencoes_cirurgicas_api.csv")

# 5.1 CHECK BASIC INFO about the dataset
print("=== Dataset Info ===")
print(df_surgeries.info())  

=== Dataset Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6999 entries, 0 to 6998
Data columns (total 8 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   no_intervencoes_cirurgicas_urgentes        6614 non-null   float64
 1   localizacao_geografica                     6999 non-null   object 
 2   regiao                                     6999 non-null   object 
 3   no_intervencoes_cirurgicas_convencionais   6727 non-null   float64
 4   instituicao                                6999 non-null   object 
 5   tempo                                      6999 non-null   object 
 6   no_intervencoes_cirurgicas_programadas     6999 non-null   int64  
 7   no_intervencoes_cirurgicas_de_ambulatorio  6999 non-null   int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 437.6+ KB
None


In [None]:
# 5.2 Check dataset shape 
print("\n=== Shape ===")
print(df_surgeries.shape)


=== Shape ===
(6999, 8)


In [7]:
# 5.3 Check column names
print("\n=== Column Names ===")
print(df_surgeries.columns.tolist())


=== Column Names ===
['no_intervencoes_cirurgicas_urgentes', 'localizacao_geografica', 'regiao', 'no_intervencoes_cirurgicas_convencionais', 'instituicao', 'tempo', 'no_intervencoes_cirurgicas_programadas', 'no_intervencoes_cirurgicas_de_ambulatorio']


In [8]:
# 5.4 Count each data type
print(df_surgeries.dtypes.value_counts())


object     4
float64    2
int64      2
Name: count, dtype: int64


In [9]:
# 5.5 Check data types of each column
print("\n=== Data Types ===")
print(df_surgeries.dtypes)


=== Data Types ===
no_intervencoes_cirurgicas_urgentes          float64
localizacao_geografica                        object
regiao                                        object
no_intervencoes_cirurgicas_convencionais     float64
instituicao                                   object
tempo                                         object
no_intervencoes_cirurgicas_programadas         int64
no_intervencoes_cirurgicas_de_ambulatorio      int64
dtype: object


In [10]:
# 5.6 Preview the first few rows
print("\n=== Head of Dataset ===")
print(df_surgeries.head())


=== Head of Dataset ===
   no_intervencoes_cirurgicas_urgentes    localizacao_geografica  \
0                                 87.0   [38.014123, -7.8721227]   
1                                110.0  [39.3003425, -7.4263845]   
2                                  NaN  [40.2162514, -8.4103814]   
3                                 99.0  [40.5309916, -7.2734793]   
4                                194.0  [38.6658713, -9.1822554]   

                        regiao  no_intervencoes_cirurgicas_convencionais  \
0  Região de Saúde do Alentejo                                     212.0   
1  Região de Saúde do Alentejo                                     235.0   
2    Região de Saúde do Centro                                     293.0   
3    Região de Saúde do Centro                                     296.0   
4          Região de Saúde LVT                                     440.0   

                                       instituicao    tempo  \
0    Unidade Local de Saúde do Baixo Alentejo,

In [11]:
# 6.1 CHECK FOR MISSING VALUES
print("\n=== Missing Values ===")
print(df_surgeries.isnull().sum())


=== Missing Values ===
no_intervencoes_cirurgicas_urgentes          385
localizacao_geografica                         0
regiao                                         0
no_intervencoes_cirurgicas_convencionais     272
instituicao                                    0
tempo                                          0
no_intervencoes_cirurgicas_programadas         0
no_intervencoes_cirurgicas_de_ambulatorio      0
dtype: int64


In [None]:
# 6.2 Filter rows where the var "no_intervencoes_cirurgicas_urgentes" is null
missing_df = df_surgeries[df_surgeries["no_intervencoes_cirurgicas_urgentes"].isna()]  

# Show the unique hospitals with missing values
missing_instituicoes = missing_df["instituicao"].unique()

print("Hospitals with missing 'no_intervencoes_cirurgicas_urgentes':")
print(missing_instituicoes)


Hospitals with missing 'no_intervencoes_cirurgicas_urgentes':
['Instituto Português Oncologia de Coimbra, EPE'
 'Hospital Arcebispo João Crisóstomo' 'Instituto Gama Pinto'
 'Hospital José Luciano de Castro' 'Hospital Dr. Francisco Zagalo']


In [None]:
# 6.3 Filter rows where the var "no_intervencoes_cirurgicas_convencionais" is null
missing_df = df_surgeries[df_surgeries["no_intervencoes_cirurgicas_convencionais"].isna()]  

# Show the unique hospitals with missing values
missing_instituicoes = missing_df["instituicao"].unique()

print("Hospitals with missing 'no_intervencoes_cirurgicas_convencionais':")
print(missing_instituicoes)

Hospitals with missing 'no_intervencoes_cirurgicas_convencionais':
['Hospital Arcebispo João Crisóstomo' 'Instituto Gama Pinto'
 'Hospital José Luciano de Castro']


In [None]:
# 6.4 Filter: remove hospitals that do not offer the full range of surgeries 

# List of hospitals to remove
excluded_hospitals = [
    "Hospital Dr. Francisco Zagalo",
    "Instituto Gama Pinto",
    "Hospital Arcebispo João Crisóstomo",
    "Hospital José Luciano de Castro",
    'Instituto Português Oncologia de Coimbra, EPE',
]

# Drop rows: keep only the rows where "instituicao" is NOT in the list hospitais:
df_filtered = df_surgeries[~df_surgeries["instituicao"].isin(excluded_hospitals)]

In [15]:
# 6.5 After cleaning, there are no missing values
print("\n=== Missing Values ===")
print(df_filtered.isnull().sum())


=== Missing Values ===
no_intervencoes_cirurgicas_urgentes          0
localizacao_geografica                       0
regiao                                       0
no_intervencoes_cirurgicas_convencionais     0
instituicao                                  0
tempo                                        0
no_intervencoes_cirurgicas_programadas       0
no_intervencoes_cirurgicas_de_ambulatorio    0
dtype: int64


In [16]:
# 6.6 Verify cleaned dataset shape (rows, columns)
print("\n=== Shape ===")
print(df_filtered.shape)


=== Shape ===
(6447, 8)


In [None]:
# 7. DATA TYPE CONVERSION 
df_filtered.loc[:,"no_intervencoes_cirurgicas_convencionais"] = df_filtered["no_intervencoes_cirurgicas_convencionais"].astype('Int64')
df_filtered.loc[:,"no_intervencoes_cirurgicas_urgentes"] = df_filtered["no_intervencoes_cirurgicas_urgentes"].astype('Int64')

# Verify
print(df_filtered.dtypes)

no_intervencoes_cirurgicas_urgentes           Int64
localizacao_geografica                       object
regiao                                       object
no_intervencoes_cirurgicas_convencionais      Int64
instituicao                                  object
tempo                                        object
no_intervencoes_cirurgicas_programadas        int64
no_intervencoes_cirurgicas_de_ambulatorio     int64
dtype: object


In [None]:
# 8.1 CHECK UNIQUE VALUES

# DEFINE CATEGORICAL VAR WITH A LIST
categorical_vars = df_filtered.select_dtypes(include=['object']).columns.tolist()
# Count unique values in each categorical var
print("\n=== Unique Values per Column ===")
print(f"{categorical_vars}: {df_filtered[categorical_vars].nunique()} unique values")


=== Unique Values per Column ===
['localizacao_geografica', 'regiao', 'instituicao', 'tempo']: localizacao_geografica     85
regiao                      5
instituicao                83
tempo                     153
dtype: int64 unique values


In [None]:
# 8.2 Check unique values in each categorical var
for col in categorical_vars:
    print(col, "→", df_filtered[col].unique())

localizacao_geografica → ['[38.014123, -7.8721227]' '[39.3003425, -7.4263845]'
 '[40.5309916, -7.2734793]' '[38.6658713, -9.1822554]'
 '[41.1831056, -8.6010558]' '[41.1472498, -8.6172617]'
 '[41.56785, -8.398982]' '[38.5684861, -7.9032848]'
 '[40.6362453, -8.6543716]' '[40.117897, -8.8586755]'
 '[38.708454, -9.216985]' '[39.4054629, -9.1258493]'
 '[41.4387173, -8.3086907]' '[41.1094138, -8.5972617]'
 '[41.8056532, -6.7888197]' '[37.0238973, -7.9281554]'
 '[40.2804158, -7.4922407]' '[38.7422257, -9.2264834]'
 '[41.1785642, -8.6062881]' '[41.1794456, -8.6745115]'
 '[38.6554005, -9.0592392]' '[38.7166948, -9.1371248]'
 '[38.5283754, -8.8818638]' '[38.729006, -9.418267]'
 '[41.2494561, -8.264901]' '[41.7056054, -8.8252713]'
 '[38.7500861, -9.1616116]' '[38.821455, -9.176296]'
 '[38.977235, -8.984506]' '[41.3828711, -8.7589945]'
 '[38.0162314, -8.6956768]' '[40.516364, -8.0827433]'
 '[39.2410946, -8.6937515]' '[41.3423984, -8.4802503]'
 '[41.3031784, -7.7515252]' '[40.9293884, -8.5458794]'


In [None]:
# 9. Save updated dataset
df_filtered.to_csv("intervencoes_cirurgicas_cleaned.csv", index=False)
df_filtered.to_excel("intervencoes_cirurgicas_cleaned.xlsx", index=False)

In [None]:
# 10. FREQUENCIES
# List of surgery columns
surgery_cols = [
    "no_intervencoes_cirurgicas_convencionais",
    "no_intervencoes_cirurgicas_urgentes",
    "no_intervencoes_cirurgicas_programadas",
    "no_intervencoes_cirurgicas_de_ambulatorio"
]

# Group by region and sum
region_summary = df_filtered.groupby("regiao")[surgery_cols].sum()
region_summary["total_surgeries"] = region_summary.sum(axis=1)

# Total surgeries across all regions
totals = region_summary.sum()

# Add percentage columns
for col in surgery_cols:
    region_summary[col + "_pct"] = (region_summary[col] / totals[col] * 100).round(1)
region_summary["total_pct"] = (region_summary["total_surgeries"] / totals["total_surgeries"] * 100).round(1)

# Add Total row
total_row = pd.DataFrame(region_summary.sum()).T
total_row.index = ["Total"]
for col in surgery_cols:
    total_row[col + "_pct"] = 100.0
total_row["total_pct"] = 100.0
region_summary = pd.concat([region_summary, total_row])

# Sort by total surgeries (Total row at bottom)
sorted_regions = region_summary.drop("Total").sort_values("total_surgeries", ascending=False).index
region_summary = region_summary.loc[list(sorted_regions) + ["Total"]]

# Format numbers and percentages
formatted = region_summary.copy()

# Format numbers with thousands separator
for col in surgery_cols + ["total_surgeries"]:
    formatted[col] = formatted[col].apply(lambda x: f"{int(x):,}")

# Format percentages with 1 decimal + %
pct_cols = [c + "_pct" for c in surgery_cols] + ["total_pct"]
for col in pct_cols:
    formatted[col] = formatted[col].apply(lambda x: f"{x:.1f}%")

# Move region column first
formatted.insert(0, "Região", formatted.index)

# Optional: reorder columns for better presentation
display_cols = ["Região"]
for col in surgery_cols:
    display_cols.extend([col, col + "_pct"])
display_cols.extend(["total_surgeries", "total_pct"])
formatted = formatted[display_cols]

# Display table
print(formatted)


                                                  Região  \
Região de Saúde Norte              Região de Saúde Norte   
Região de Saúde LVT                  Região de Saúde LVT   
Região de Saúde do Centro      Região de Saúde do Centro   
Região de Saúde do Alentejo  Região de Saúde do Alentejo   
Região de Saúde do Algarve    Região de Saúde do Algarve   
Total                                              Total   

                            no_intervencoes_cirurgicas_convencionais  \
Região de Saúde Norte                                      7,538,517   
Região de Saúde LVT                                        5,476,855   
Região de Saúde do Centro                                  3,020,210   
Região de Saúde do Alentejo                                  733,192   
Região de Saúde do Algarve                                   390,457   
Total                                                     17,159,231   

                            no_intervencoes_cirurgicas_convencionais_pct  