In [264]:
import os
import pandas as pd
import numpy as np
import zipfile

In [265]:
data_path = f"Data"
raw_data = os.path.join(data_path, f"Raw Data")
preprocess_path = os.path.join(data_path, f"Preprocessed Data")
analysis_path = os.path.join(data_path, f"Analysis Data") 
results_path = f"Results"

# Q1 Knowledge Graph

In [266]:
aqi_data = pd.read_csv(os.path.join(preprocess_path, "aqi_data.csv"))

In [267]:
aqi_data

Unnamed: 0,FIPS,State,County,State Code,County Code,Year,RUCC,Population,Max AQI,90th Percentile AQI,Median AQI
0,6021,California,Glenn,6,21,2018,6.0,27899,230,69,36
1,6023,California,Humboldt,6,23,2018,5.0,136502,163,53,34
2,6025,California,Imperial,6,25,2018,3.0,181062,286,108,62
3,6029,California,Kern,6,29,2018,2.0,893618,190,142,84
4,6033,California,Lake,6,33,2018,4.0,64394,208,43,33
...,...,...,...,...,...,...,...,...,...,...,...
128,42111,Pennsylvania,Somerset,42,111,2023,6.0,72197,93,58,41
129,42117,Pennsylvania,Tioga,42,117,2023,8.0,40840,100,74,44
130,48113,Texas,Dallas,48,113,2023,1.0,2606358,172,74,47
131,48221,Texas,Hood,48,221,2023,4.0,67774,105,58,38


In [271]:
''' Subject: <County_Year> 
Predictes and Objects: 
1. hasState: State name
2. hasRUCC: RUCC
3. hasPopulation: Population Estimate
4. hasMaxAQI: Max AQI
 '''
from rdflib import Graph, Literal, RDF, URIRef, Namespace

EX = Namespace("http://example.org/ke#")
g = Graph()

for index, row in aqi_data.iterrows():
    county_year = EX[f"{row['County'].replace(' ', '_')}_{row['Year']}"]
    state = EX[f"State#{row['State'].replace(' ', '_')}"]
    county = EX[f"County#{row['County'].replace(' ', '_')}"]
    
    rucc = EX[f"RUCC#{row['RUCC']}"]
    population = EX[f"Population#{row['Population']}"]
    aqi = EX[f"MaxAQI#{row['Max AQI']}"]

    g.add((state, EX.hasState, county))
    g.add((county, EX.hasRUCC, rucc))
    g.add((county_year, EX.hasCounty, county))
    g.add((county_year, EX.hasPopulation, population))
    g.add((county_year, EX.hasMedianAQI, aqi))

# Serialize the RDF graph to Turtle format
# print(g.serialize(format='turtle'))

In [269]:
from pyvis.network import Network
net = Network(height='750px', 
              width='100%', 
              directed=False, 
              neighborhood_highlight=True)

# Add nodes and edges to the network
for subject, predicate, object in g:
    subject_label = str(subject).split("#")[-1]
    predicate_label = str(predicate).split("#")[-1]
    object_label = str(object).split("#")[-1]

    subject_type = str(subject).split("#")[-2]
    predicate_type = str(predicate).split("#")[-2]
    object_type = str(object).split("#")[-2]

    
        
    if subject_label not in net.node_ids:
        if "ke" in subject_type:
            subject_title = subject_label
        else:
            subject_title = subject_type + ": " + subject_label
            
        net.add_node(subject_label, 
                     label=subject_label, 
                     title=subject_title, 
                     labelHighlightBold=True)
        
    if object_label not in net.node_ids:
        if "ke" in object_type:
            object_title = object_label
        else:
            object_title = object_type + ": " + object_label
            
        net.add_node(object_label, 
                     label=object_label, 
                     title=object_title, 
                     labelHighlightBold=True)

    edge_color = {
        'hasState': 'red',
        'hasCounty': 'orange',
        'hasPopulation': 'green',
        'hasMedianAQI': 'blue',
        'hasRUCC': 'black'
    }.get(predicate_label, 'grey')

    
    net.add_edge(subject_label, 
                     object_label, 
                     title=predicate_label, 
                     label=predicate_label, 
                     color=edge_color)
    
# # Show the network
net.save_graph(os.path.join(results_path, "Q1.html"))
net.show("Q1.html", local=False, notebook=False)

Q1.html


# Q3 Knowledge Graph

In [273]:
mobility_data = pd.read_csv(os.path.join(preprocess_path, "merged_pop_mobility.csv"))

In [274]:
''' Subject: <County_URI> 
Predictes and Objects: 
1. hasState: State name
2. hasRUCC: RUCC
3. hasPopulation: Population Estimate
4. hasMaxAQI: Max AQI
5. hasVMT: Vehicle Miles Traveled
 '''
from rdflib import Graph, Literal, RDF, URIRef, Namespace

EX = Namespace("http://example.org/ke#")
g = Graph()

for index, row in mobility_data.iterrows():
    county_year = EX[f"{row['County'].replace(' ', '_')}_{row['Year']}"]
    state = EX[f"State#{row['State'].replace(' ', '_')}"]
    county = EX[f"County#{row['County'].replace(' ', '_')}"]
    
    rucc = EX[f"RUCC#{row['RUCC']}"]
    population = EX[f"Population#{row['Population']}"]
    aqi = EX[f"MaxAQI#{row['Max AQI']}"]

    vmt = EX[f"VMT#{row['vmt']}"]

    g.add((county, EX.hasState, state))
    g.add((county, EX.hasRUCC, rucc))
    g.add((county_year, EX.hasCounty, county))
    g.add((county_year, EX.hasPopulation, population))
    g.add((county_year, EX.hasMaxAQI, aqi))
    g.add((county_year, EX.hasVMT, vmt))

# Serialize the RDF graph to Turtle format
print(g.serialize(format='turtle'))

@prefix ns1: <http://example.org/ke#> .

ns1:Dallas_2018 ns1:hasCounty <http://example.org/ke#County#Dallas> ;
    ns1:hasMaxAQI <http://example.org/ke#MaxAQI#161> ;
    ns1:hasPopulation <http://example.org/ke#Population#2629764> ;
    ns1:hasVMT <http://example.org/ke#VMT#11826623292.0> .

ns1:Dallas_2019 ns1:hasCounty <http://example.org/ke#County#Dallas> ;
    ns1:hasMaxAQI <http://example.org/ke#MaxAQI#143> ;
    ns1:hasPopulation <http://example.org/ke#Population#2635603> ;
    ns1:hasVMT <http://example.org/ke#VMT#11523452066.0> .

ns1:Dallas_2020 ns1:hasCounty <http://example.org/ke#County#Dallas> ;
    ns1:hasMaxAQI <http://example.org/ke#MaxAQI#135> ;
    ns1:hasPopulation <http://example.org/ke#Population#2610112> ;
    ns1:hasVMT <http://example.org/ke#VMT#10764144411.0> .

ns1:Hood_2018 ns1:hasCounty <http://example.org/ke#County#Hood> ;
    ns1:hasMaxAQI <http://example.org/ke#MaxAQI#133> ;
    ns1:hasPopulation <http://example.org/ke#Population#60249> ;
    ns1:hasVMT <h

In [99]:
from pyvis.network import Network
net = Network(height='750px', width='100%', directed=True, neighborhood_highlight=True)

# Add nodes and edges to the network
for subject, predicate, object in g:
    subject_label = str(subject).split("#")[-1]
    predicate_label = str(predicate).split("#")[-1]
    object_label = str(object).split("#")[-1]

    subject_type = str(subject).split("#")[-2]
    predicate_type = str(predicate).split("#")[-2]
    object_type = str(object).split("#")[-2]

    
        
    if subject_label not in net.node_ids:
        if "ke" in subject_type:
            subject_title = subject_label
        else:
            subject_title = subject_type + ": " + subject_label
        net.add_node(subject_label, 
                     label=subject_label, 
                     title=subject_title, 
                     labelHighlightBold=True)
        
    if object_label not in net.node_ids:
        if "ke" in object_type:
            object_title = object_label
        else:
            object_title = object_type + ": " + object_label
            
        net.add_node(object_label, 
                     label=object_label, 
                     title=object_title, 
                     labelHighlightBold=True)

    edge_color = {
        'hasState': 'red',
        'hasCounty': 'orange',
        'hasPopulation': 'green',
        'hasMaxAQI': 'blue',
        'hasRUCC': 'black',
        'hasVMT': 'pink',
    }.get(predicate_label, 'grey')

    
    net.add_edge(subject_label, 
                     object_label, 
                     title=predicate_label, 
                     label=predicate_label, 
                     color=edge_color)
    
# # Show the network
net.save_graph(os.path.join(results_path, "Q3.html"))
net.show("Q3.html", local=False, notebook=False)

Q3.html


# Q2 Knowledge Graph

In [275]:
pollutant_data = pd.read_csv(os.path.join(preprocess_path, "pollutant_population_aqi.csv"))

In [276]:
pollutant_data.head()

Unnamed: 0,Year,State,County,County Code,State Code,RUCC,FIPS,Pollutant concentration (PPM),State Name Code,County Full Name,Population,Pollutant,Max AQI
0,2018,California,Alameda,1,6,1.0,6001,0.616945,CA,Alameda County,1666596,CO,150.0
1,2018,California,Butte,7,6,3.0,6007,0.468219,CA,Butte County,230330,CO,150.0
2,2018,California,Contra Costa,13,6,1.0,6013,0.428311,CA,Contra Costa County,1150840,CO,150.0
3,2018,California,Fresno,19,6,1.0,6019,0.379499,CA,Fresno County,991298,CO,150.0
4,2018,California,Humboldt,23,6,5.0,6023,0.232778,CA,Humboldt County,136502,CO,150.0


In [280]:
''' Subject: <County_URI> 
Predictes and Objects: 
1. hasState: State name
2. hasRUCC: RUCC
3. hasPopulation: Population Estimate
4. hasMedianAQI: Median AQI
 '''
from rdflib import Graph, Literal, RDF, URIRef, Namespace

EX = Namespace("http://example.org/ke#")
g = Graph()

for index, row in pollutant_data.iterrows():
    county_year = EX[f"{row['County'].replace(' ', '_')}_{row['Year']}"]
    state = EX[f"State#{row['State'].replace(' ', '_')}"]
    county = EX[f"County#{row['County'].replace(' ', '_')}"]
    
    rucc = EX[f"RUCC#{row['RUCC']}"]
    aqi = EX[f"AQI#{row['Max AQI']}"]
    population = EX[f"Population#{row['Population']}"]
    pollutant_county = EX[f"Pollutant#{row['County'].replace(' ', '_')}_{row['Year']}_{row['Pollutant']}"]
    conc = EX[f"PollutantConcentration#{row['Pollutant concentration (PPM)']}"]
    
    g.add((state, EX.hasState, county))
    g.add((county, EX.hasRUCC, rucc))
    g.add((county_year, EX.hasCounty, county))
    g.add((county_year, EX.hasPopulation, population))
    g.add((county_year, EX.hasMaxAQI, aqi))
    g.add((county_year, EX.hasPollutant, pollutant_county))
    g.add((pollutant_county, EX.hasConcentration, conc))

print(g.serialize(format='turtle'))

@prefix ns1: <http://example.org/ke#> .

ns1:Adams_2018 ns1:hasCounty <http://example.org/ke#County#Adams> ;
    ns1:hasMaxAQI <http://example.org/ke#AQI#150.0> ;
    ns1:hasPollutant <http://example.org/ke#Pollutant#Adams_2018_CO>,
        <http://example.org/ke#Pollutant#Adams_2018_NO2>,
        <http://example.org/ke#Pollutant#Adams_2018_PM2.5> ;
    ns1:hasPopulation <http://example.org/ke#Population#103035> .

ns1:Adams_2019 ns1:hasCounty <http://example.org/ke#County#Adams> ;
    ns1:hasMaxAQI <http://example.org/ke#AQI#85.0> ;
    ns1:hasPollutant <http://example.org/ke#Pollutant#Adams_2019_CO>,
        <http://example.org/ke#Pollutant#Adams_2019_NO2>,
        <http://example.org/ke#Pollutant#Adams_2019_PM2.5> ;
    ns1:hasPopulation <http://example.org/ke#Population#102776> .

ns1:Adams_2020 ns1:hasCounty <http://example.org/ke#County#Adams> ;
    ns1:hasMaxAQI <http://example.org/ke#AQI#86.0> ;
    ns1:hasPollutant <http://example.org/ke#Pollutant#Adams_2020_CO>,
        <http

In [278]:
from pyvis.network import Network
net = Network(height='750px', width='100%', directed=True, neighborhood_highlight=True)

# Add nodes and edges to the network
for subject, predicate, object in g:
    subject_label = str(subject).split("#")[-1]
    predicate_label = str(predicate).split("#")[-1]
    object_label = str(object).split("#")[-1]
    
    subject_type = str(subject).split("#")[-2]
    object_type = str(object).split("#")[-2]
    print(subject_label, predicate_label, object_label)    
    if subject_label not in net.node_ids:
        if "ke" in subject_type:
            subject_title = subject_label
        else:
            subject_title = subject_type + ": " + subject_label
        net.add_node(subject_label, 
                     label=subject_label, 
                     title=subject_title, 
                     labelHighlightBold=True)
        
    if object_label not in net.node_ids:
        if "ke" in object_type:
            object_title = object_label
        else:
            object_title = object_type + ": " + object_label
            
        net.add_node(object_label, 
                     label=object_label, 
                     title=object_title, 
                     labelHighlightBold=True)

    edge_color = {
        'hasState': 'red',
        'hasRUCC': 'black',
        'hasCounty': 'orange',
        'hasPopulation': 'green',
        'hasMaxAQI': 'blue',
        'hasPollutant': 'pink',
        'hasConcentration': 'purple',
    }.get(predicate_label, 'grey')
        
    net.add_edge(subject_label, 
                     object_label, 
                     title=predicate_label, 
                     label=predicate_label, 
                     color=edge_color)
    
# # Show the network
net.save_graph(os.path.join(results_path, "Q2.html"))
net.show("Q2.html", local=False, notebook=False)

Contra_Costa_2023 hasPollutant Contra_Costa_2023_NO2
Alameda_2022 hasPollutant Alameda_2022_NO2
Dallas_2020 hasMaxAQI 135.0
Bexar_2021_NO2 hasConcentration 10.794186046511628
Fresno_2021_PM2.5 hasConcentration 12.830191404839294
Cambria_2018 hasPopulation 131478
Fresno_2023_NO2 hasConcentration 17.91797806390081
Harris_2020 hasPopulation 4734792
Lackawanna_2021 hasMaxAQI 137.0
Humboldt_2019 hasCounty Humboldt
Kern_2019_CO hasConcentration 0.3266483516483516
Fresno_2022 hasCounty Fresno
Butte_2022 hasPollutant Butte_2022_NO2
Cambria_2022_NO2 hasConcentration 13.659604519774012
York_2021_PM2.5 hasConcentration 10.304755043227663
Lackawanna_2022 hasPollutant Lackawanna_2022_PM2.5
Texas hasState Bexar
Alameda_2018_NO2 hasConcentration 22.769899183869416
El_Paso_2023 hasPopulation 869880
Cambria_2023 hasPollutant Cambria_2023_PM2.5
Harrison_2019 hasPollutant Harrison_2019_NO2
Jefferson_2020 hasCounty Jefferson
Imperial_2022 hasPollutant Imperial_2022_NO2
Contra_Costa_2020 hasPollutant Contr

# Correlations

In [281]:
pop_pol_corr = pd.read_csv(os.path.join(analysis_path, "RUCC_Population_Pollutant_Correlation.csv"), dtype={'RUCC': int, 'Year': int})
vmt_aqi_corr = pd.read_csv(os.path.join(analysis_path, "RUCC_VMT_AQI_Correlation.csv"), dtype={'RUCC': int, 'Year': int})
pop_aqi_corr = pd.read_csv(os.path.join(analysis_path, "RUCC_Population_AQI.csv"), dtype={'RUCC': int, 'Year': int})

In [282]:
df1 = vmt_aqi_corr[['RUCC', 'Year', 'VMT_Correlation']]
df1 = df1.fillna(0)

df2 = pop_pol_corr[['RUCC', 'Year', 'Pollutant', 'Correlation']]
df2 = df2.rename(columns={'Correlation': 'Pollutant_Correlation'})
df2 = df2.fillna(0)

df3 = pop_pol_corr[['RUCC', 'Year', 'Correlation']]
df3 = df3.rename(columns={'Correlation': 'AQI_Correlation'})
df3 = df3.fillna(0)

## Correlation - VMT vs AQI (Question 3)

In [254]:
''' Correlation 1 - VMT vs AQI'''
from rdflib import Graph, Literal, RDF, URIRef, Namespace
from rdflib.namespace import XSD
EX = Namespace("http://example.org/ke#")
g = Graph()

for index, row in df1.iterrows():
    rucc_year = EX[f"{row['RUCC']}_{row['Year']}"]
    rucc = Literal(row['RUCC'], datatype=XSD.integer)
    year = Literal(row['Year'], datatype=XSD.integer)
    vmt_corr = Literal(row['VMT_Correlation'], datatype=XSD.float)#EX[f"VMT_Correlation#{row['VMT_Correlation']}"]
    
    g.add((rucc_year, EX.hasRUCC, rucc))
    g.add((rucc_year, EX.ofYear, year))
    g.add((rucc_year, EX.hasVMTCorrelation, vmt_corr))
    

In [255]:
for subj, pred, obj in g:
    print(subj, pred, obj)

http://example.org/ke#9.0_2018.0 http://example.org/ke#ofYear 2018.0
http://example.org/ke#1.0_2018.0 http://example.org/ke#hasRUCC 1.0
http://example.org/ke#1.0_2019.0 http://example.org/ke#hasRUCC 1.0
http://example.org/ke#6.0_2018.0 http://example.org/ke#hasVMTCorrelation -0.2484526199789098
http://example.org/ke#2.0_2019.0 http://example.org/ke#hasRUCC 2.0
http://example.org/ke#5.0_2019.0 http://example.org/ke#hasVMTCorrelation 0.0
http://example.org/ke#6.0_2019.0 http://example.org/ke#ofYear 2019.0
http://example.org/ke#6.0_2018.0 http://example.org/ke#ofYear 2018.0
http://example.org/ke#2.0_2020.0 http://example.org/ke#ofYear 2020.0
http://example.org/ke#2.0_2018.0 http://example.org/ke#hasRUCC 2.0
http://example.org/ke#3.0_2020.0 http://example.org/ke#hasRUCC 3.0
http://example.org/ke#9.0_2018.0 http://example.org/ke#hasRUCC 9.0
http://example.org/ke#3.0_2018.0 http://example.org/ke#ofYear 2018.0
http://example.org/ke#7.0_2019.0 http://example.org/ke#hasRUCC 7.0
http://example.o

In [257]:
from rdflib.plugins.sparql import prepareQuery

# Define the query
query_str = """
PREFIX ex: <http://example.org/ke#>

SELECT ?rucc ?year ?vmt_corr
WHERE {
  ?rucc_year ex:hasRUCC ?rucc ;
             ex:ofYear ?year ;
             ex:hasVMTCorrelation ?vmt_corr .
}
"""

# Prepare the query and execute it
query = prepareQuery(query_str, initNs={"ex": EX})
results = g.query(query)

# Print the results
for row in results:
    rucc = row['rucc']
    year = row['year']
    vmt_corr = row['vmt_corr']
    print(f"RUCC: {rucc}, Year: {year}, VMT Correlation: {vmt_corr}")


RUCC: 6.0, Year: 2018.0, VMT Correlation: -0.2484526199789098
RUCC: 6.0, Year: 2019.0, VMT Correlation: -0.1012235226054889
RUCC: 6.0, Year: 2020.0, VMT Correlation: 0.0682661834648491
RUCC: 5.0, Year: 2018.0, VMT Correlation: 0.0
RUCC: 5.0, Year: 2019.0, VMT Correlation: 0.0
RUCC: 5.0, Year: 2020.0, VMT Correlation: 0.0
RUCC: 3.0, Year: 2018.0, VMT Correlation: 0.9787294868637744
RUCC: 3.0, Year: 2019.0, VMT Correlation: 0.8598203210503591
RUCC: 3.0, Year: 2020.0, VMT Correlation: 0.4851206239795099
RUCC: 2.0, Year: 2018.0, VMT Correlation: 1.0
RUCC: 2.0, Year: 2019.0, VMT Correlation: 1.0
RUCC: 2.0, Year: 2020.0, VMT Correlation: 0.9999999999999998
RUCC: 4.0, Year: 2018.0, VMT Correlation: -0.4525427657343702
RUCC: 4.0, Year: 2019.0, VMT Correlation: -0.8350733790997766
RUCC: 4.0, Year: 2020.0, VMT Correlation: 0.8474233320136187
RUCC: 1.0, Year: 2018.0, VMT Correlation: 0.999662997450859
RUCC: 1.0, Year: 2019.0, VMT Correlation: 0.953222554724765
RUCC: 1.0, Year: 2020.0, VMT Correla

In [258]:
from pyvis.network import Network
net = Network(height='750px', width='100%', directed=True, neighborhood_highlight=True, filter_menu=True)

# Add nodes and edges to the network
for subject, predicate, object in g:
    subject_label = str(subject).split("#")[-1]
    predicate_label = str(predicate).split("#")[-1]
    object_label = str(object).split("#")[-1]
    
    # subject_type = str(subject).split("#")[-2]
    # object_type = str(object).split("#")[-2]
    if subject_label not in net.node_ids:
        subject_title = subject_label
        # else:
            # subject_title = subject_type + ": " + subject_label
        net.add_node(subject_label, 
                     label=subject_label, 
                     title=subject_title, 
                     labelHighlightBold=True)
        
    if object_label not in net.node_ids:
        object_title = object_label
            
        net.add_node(object_label, 
                     label=object_label, 
                     title=object_title, 
                     labelHighlightBold=True)

    edge_color = {
        'hasRUCC': 'red',
        'ofYear': 'green',
        'hasVMTCorrelation': 'purple',
    }.get(predicate_label, 'grey')
        
    net.add_edge(subject_label, 
                     object_label, 
                     title=predicate_label, 
                     label=predicate_label, 
                     color=edge_color)
    
# # Show the network
net.save_graph(os.path.join(results_path, "kg_corr_1.html"))
net.show("kg_corr_1.html", local=False, notebook=False)

kg_corr_1.html


## Correlation - Population vs Pollutant (Question 2)

In [259]:
''' Correlation 2 - Population vs Pollutant'''
from rdflib import Graph, Literal, RDF, URIRef, Namespace

EX = Namespace("http://example.org/ke#")
g = Graph()

for index, row in df2.iterrows():
    rucc_pollutant_year = EX[f"{row['RUCC']}_{row['Pollutant']}_{row['Year']}"]
    rucc = Literal(row['RUCC'], datatype=XSD.integer)
    year = Literal(row['Year'], datatype=XSD.integer)
    pollutant = Literal(row['Pollutant'])
    pollutant_corr = Literal(row['Pollutant_Correlation'])#EX[f"Pollutant_Correlation#{row['Pollutant_Correlation']}"]
    
    g.add((rucc_pollutant_year, EX.hasRUCC, rucc))
    g.add((rucc_pollutant_year, EX.ofYear, year))
    g.add((rucc_pollutant_year, EX.forPollutant, pollutant))
    g.add((rucc_pollutant_year, EX.hasCorrelation, pollutant_corr))
    

In [260]:
for subj, pred, obj in g:
    print(subj, pred, obj)

http://example.org/ke#3_PM2.5_2023 http://example.org/ke#hasRUCC 3
http://example.org/ke#3_PM2.5_2019 http://example.org/ke#forPollutant PM2.5
http://example.org/ke#2_NO2_2019 http://example.org/ke#ofYear 2019
http://example.org/ke#2_NO2_2023 http://example.org/ke#hasCorrelation 0.7167596491833258
http://example.org/ke#5_PM2.5_2020 http://example.org/ke#hasRUCC 5
http://example.org/ke#1_NO2_2022 http://example.org/ke#hasCorrelation 0.6236360276525605
http://example.org/ke#3_PM2.5_2019 http://example.org/ke#ofYear 2019
http://example.org/ke#3_PM2.5_2022 http://example.org/ke#forPollutant PM2.5
http://example.org/ke#3_PM2.5_2020 http://example.org/ke#ofYear 2020
http://example.org/ke#1_PM2.5_2023 http://example.org/ke#hasRUCC 1
http://example.org/ke#2_NO2_2022 http://example.org/ke#hasCorrelation 0.7394944700457898
http://example.org/ke#3_NO2_2023 http://example.org/ke#hasCorrelation 0.5955145012189761
http://example.org/ke#2_NO2_2021 http://example.org/ke#hasCorrelation 0.75077191546002

In [261]:
query = """
    PREFIX ex: <http://example.org/ke#>
    SELECT ?rucc ?pollutant ?pollutant_corr
    WHERE {
        ?rucc_pollutant_year ex:hasRUCC ?rucc ;
                              ex:forPollutant ?pollutant ;
                              ex:hasCorrelation ?pollutant_corr .
                              
    }
"""
# Execute the query
results = g.query(query)

# Print the results
for row in results:
    print(f"RUCC: {row['rucc']}, Pollutant: {row['pollutant']}, Correlation: {row['pollutant_corr']}")


RUCC: 1, Pollutant: NO2, Correlation: 0.6622303286995342
RUCC: 1, Pollutant: NO2, Correlation: 0.6387733574895565
RUCC: 1, Pollutant: NO2, Correlation: 0.562449544148006
RUCC: 1, Pollutant: NO2, Correlation: 0.6182243254885154
RUCC: 1, Pollutant: NO2, Correlation: 0.6236360276525605
RUCC: 1, Pollutant: NO2, Correlation: 0.5272055249260106
RUCC: 2, Pollutant: NO2, Correlation: 0.8263575459112502
RUCC: 2, Pollutant: NO2, Correlation: 0.7779677575821417
RUCC: 2, Pollutant: NO2, Correlation: 0.7511202886052164
RUCC: 2, Pollutant: NO2, Correlation: 0.7507719154600286
RUCC: 2, Pollutant: NO2, Correlation: 0.7394944700457898
RUCC: 2, Pollutant: NO2, Correlation: 0.7167596491833258
RUCC: 3, Pollutant: NO2, Correlation: 0.6620087902054008
RUCC: 3, Pollutant: NO2, Correlation: 0.6633151602373258
RUCC: 3, Pollutant: NO2, Correlation: 0.6259972538756795
RUCC: 3, Pollutant: NO2, Correlation: 0.5275963879319333
RUCC: 3, Pollutant: NO2, Correlation: 0.5530474408998353
RUCC: 3, Pollutant: NO2, Correla

In [262]:
from pyvis.network import Network
net = Network(height='750px', width='100%', directed=True, filter_menu=True, neighborhood_highlight=True)

# Add nodes and edges to the network
for subject, predicate, object in g:
    subject_label = str(subject).split("#")[-1]
    predicate_label = str(predicate).split("#")[-1]
    object_label = str(object).split("#")[-1]
    
    # subject_type = str(subject).split("#")[-2]
    # object_type = str(object).split("#")[-2]
    if subject_label not in net.node_ids:
        # if "ke" in subject_type:
        subject_title = subject_label
        # else:
            # subject_title = subject_type + ": " + subject_label
        net.add_node(subject_label, 
                     label=subject_label, 
                     title=subject_title, 
                     labelHighlightBold=True)
        
    if object_label not in net.node_ids:
        # if "ke" in object_type:
        object_title = object_label
        # else:
            # object_title = object_type + ": " + object_label
            
        net.add_node(object_label, 
                     label=object_label, 
                     title=object_title, 
                     labelHighlightBold=True)

    edge_color = {
        'hasRUCC': 'red',
        'ofYear': 'green',
        'belongsTo': 'black',
        'forPollutant': 'pink',
        'hasPollutantCorrelation': 'purple',
        
    }.get(predicate_label, 'grey')
        
    net.add_edge(subject_label, 
                     object_label, 
                     title=predicate_label, 
                     label=predicate_label, 
                     color=edge_color)
    
# # Show the network
net.save_graph(os.path.join(results_path, "kg_corr_2.html"))
net.show("kg_corr_2.html", local=False, notebook=False)

kg_corr_2.html


## Correlation - Population vs AQI (Question 1)

In [283]:
df3

Unnamed: 0,RUCC,Year,AQI_Correlation
0,1,2018,0.662230
1,1,2019,0.638773
2,1,2020,0.562450
3,1,2021,0.618224
4,1,2022,0.623636
...,...,...,...
67,5,2019,0.000000
68,5,2020,0.000000
69,5,2021,0.000000
70,5,2022,0.000000


In [288]:
''' Correlation 3 - Population vs AQI'''
from rdflib import Graph, Literal, RDF, URIRef, Namespace

EX = Namespace("http://example.org/ke#")
g = Graph()

for index, row in df3.iterrows():
    rucc_year = EX[f"{row['RUCC']}_{row['Year']}"]
    rucc = Literal(row['RUCC'], datatype=XSD.integer)
    year = Literal(row['Year'], datatype=XSD.integer)
    aqi_corr = Literal(row['AQI_Correlation'])
    
    g.add((rucc_year, EX.hasRUCC, rucc))
    g.add((rucc_year, EX.ofYear, year))
    g.add((rucc_year, EX.hasCorrelation, aqi_corr))
    

In [289]:
query = """
    PREFIX ex: <http://example.org/ke#>
    SELECT ?rucc ?year ?aqi_corr
    WHERE {
        ?rucc_year ex:hasRUCC ?rucc ;
                ex:ofYear ?year ;
              ex:hasCorrelation ?aqi_corr .                
    }
"""
# Execute the query
results = g.query(query)

# Print the results
for row in results:
    print(f"RUCC: {row['rucc']}, Pollutant: {row['year']}, Correlation: {row['aqi_corr']}")


RUCC: 1.0, Pollutant: 2018.0, Correlation: 0.6622303286995342
RUCC: 1.0, Pollutant: 2019.0, Correlation: 0.6387733574895565
RUCC: 1.0, Pollutant: 2020.0, Correlation: 0.562449544148006
RUCC: 1.0, Pollutant: 2021.0, Correlation: 0.6182243254885154
RUCC: 1.0, Pollutant: 2022.0, Correlation: 0.6236360276525605
RUCC: 1.0, Pollutant: 2023.0, Correlation: 0.5272055249260106
RUCC: 2.0, Pollutant: 2018.0, Correlation: 0.8263575459112502
RUCC: 2.0, Pollutant: 2019.0, Correlation: 0.7779677575821417
RUCC: 2.0, Pollutant: 2020.0, Correlation: 0.7511202886052164
RUCC: 2.0, Pollutant: 2021.0, Correlation: 0.7507719154600286
RUCC: 2.0, Pollutant: 2022.0, Correlation: 0.7394944700457898
RUCC: 2.0, Pollutant: 2023.0, Correlation: 0.7167596491833258
RUCC: 3.0, Pollutant: 2018.0, Correlation: 0.6620087902054008
RUCC: 3.0, Pollutant: 2019.0, Correlation: 0.6633151602373258
RUCC: 3.0, Pollutant: 2020.0, Correlation: 0.6259972538756795
RUCC: 3.0, Pollutant: 2021.0, Correlation: 0.5275963879319333
RUCC: 3.0

In [290]:
from pyvis.network import Network
net = Network(height='750px', width='100%', directed=True, neighborhood_highlight=True, filter_menu=True)

# Add nodes and edges to the network
for subject, predicate, object in g:
    subject_label = str(subject).split("#")[-1]
    predicate_label = str(predicate).split("#")[-1]
    object_label = str(object).split("#")[-1]
    
    # subject_type = str(subject).split("#")[-2]
    # object_type = str(object).split("#")[-2]
    if subject_label not in net.node_ids:
        subject_title = subject_label
        # else:
            # subject_title = subject_type + ": " + subject_label
        net.add_node(subject_label, 
                     label=subject_label, 
                     title=subject_title, 
                     labelHighlightBold=True)
        
    if object_label not in net.node_ids:
        object_title = object_label
            
        net.add_node(object_label, 
                     label=object_label, 
                     title=object_title, 
                     labelHighlightBold=True)

    edge_color = {
        'hasRUCC': 'red',
        'ofYear': 'green',
        'hasCorrelation': 'purple',
    }.get(predicate_label, 'grey')
        
    net.add_edge(subject_label, 
                     object_label, 
                     title=predicate_label, 
                     label=predicate_label, 
                     color=edge_color)
    
# # Show the network
net.save_graph(os.path.join(results_path, "kg_corr_3.html"))
net.show("kg_corr_3.html", local=False, notebook=False)

kg_corr_3.html
