In [55]:
import sys
sys.path.append("..")
%reload_ext autoreload
%autoreload 2
import os
import pandas as pd
import pyreadstat as pyr
import json
import numpy as np
from spss_import import read_sav 
# Notebook-wide pandas configuration (set through the options attribute API).
pd.options.display.max_rows = 2500         # display up to 2500 rows before truncating
pd.options.display.max_columns = None      # None = no limit on displayed columns
pd.options.mode.chained_assignment = None  # None disables the chained-assignment warning

In [56]:
#help(pyr.read_sav)

In [57]:
# Location of the example SPSS file, relative to the notebook's working directory.
spssfile = "files/SPSS_Example2.sav"

# read_sav comes from the project module spss_import; it returns the data
# together with a metadata object that the cells below read from.
df, df_meta = read_sav(spssfile)

# Show the first rows as the cell output.
df.head()

Unnamed: 0,RID,MARST,PWT,testvar
0,10000001,3,537,1
1,10000002,1,231,1
2,10000003,2,599,1
3,10000004,1,4003,7
4,10000005,4,598,8


In [58]:
# Short aliases for the metadata attributes.
# NOTE(review): df_meta looks like a pyreadstat metadata container, in which
# case each attribute is a dict keyed by variable name -- confirm against
# spss_import.read_sav.
Label, Values, Missing, Format, Measure = (
    df_meta.column_names_to_labels,   # variable -> label
    df_meta.variable_value_labels,    # variable -> value labels
    df_meta.missing_ranges,           # variable -> declared missing ranges
    df_meta.original_variable_types,  # variable -> original format/type
    df_meta.variable_measure,         # variable -> measurement level
)

In [59]:
from lxml import etree
from xml_functions import remove_empty_elements, add_cdi_element, add_identifier, add_ddiref

# Namespace prefix -> URI map for the DDI-CDI vocabulary. Kept as-is because
# other cells may read it.
nsmap = {'cdi': 'http://ddialliance.org/Specification/DDI-CDI/1.0/XMLSchema/'}

# XML Schema instance namespace, needed for the schemaLocation attribute.
XSI_NAMESPACE = 'http://www.w3.org/2001/XMLSchema-instance'

# Create the root element <cdi:DDICDIModels>. The xsi prefix is declared here
# as well; without it lxml invents a prefix (ns0) when serialising the
# schemaLocation attribute set below.
root = etree.Element(
    etree.QName(nsmap['cdi'], 'DDICDIModels'),
    nsmap={**nsmap, 'xsi': XSI_NAMESPACE},
)

# schemaLocation value is "<namespace URI> <schema URL>".
root.set(
    f'{{{XSI_NAMESPACE}}}schemaLocation',
    'http://ddialliance.org/Specification/DDI-CDI/1.0/XMLSchema/ https://ddi-cdi-resources.bitbucket.io/2023-11-12/encoding/xml-schema/ddi-cdi.xsd',
)

# Agency identifier passed to add_ddiref by the generator functions.
agency = 'int.esseric'

In [60]:
# DataStore
def generate_DataStore(df_meta):
    """Add a ``DataStore`` element to the notebook-level ``root``.

    Parameters
    ----------
    df_meta
        Metadata object; only ``df_meta.number_rows`` is read and passed on
        as the ``recordCount`` value.

    Notes
    -----
    Relies on the notebook globals ``root`` and ``agency`` and on the project
    helpers ``add_cdi_element``, ``add_identifier`` and ``add_ddiref``.
    Returns nothing.
    """
    data_store = add_cdi_element(root, 'DataStore')
    add_cdi_element(data_store, 'allowsDuplicates', "false")
    add_identifier(data_store, "#DataStore")
    add_cdi_element(data_store, 'recordCount', df_meta.number_rows)

    # Link to the logical record; "#LogicalRecord" is the same identifier
    # string that generate_LogicalRecord passes to add_identifier.
    record_link = add_cdi_element(data_store, 'DataStore_has_LogicalRecord')
    add_ddiref(record_link, "#LogicalRecord", agency, "LogicalRecord")

In [61]:
# logicalRecord
def generate_LogicalRecord(df_meta):
    """Add a ``LogicalRecord`` element to the notebook-level ``root``.

    The record gets one ``LogicalRecord_organizes_DataSet`` child pointing at
    ``#WideDataSet`` and one ``LogicalRecord_has_InstanceVariable`` child per
    entry of ``df_meta.column_names``.

    Parameters
    ----------
    df_meta
        Metadata object; only ``df_meta.column_names`` is read.

    Notes
    -----
    Relies on the notebook globals ``root`` and ``agency``. Returns nothing.
    """
    record = add_cdi_element(root, 'LogicalRecord')
    add_identifier(record, "#LogicalRecord")

    dataset_link = add_cdi_element(record, 'LogicalRecord_organizes_DataSet')
    add_ddiref(dataset_link, "#WideDataSet", agency, "WideDataSet")

    # One reference per column, in column order.
    for column_name in df_meta.column_names:
        variable_link = add_cdi_element(record, 'LogicalRecord_has_InstanceVariable')
        add_ddiref(variable_link, f"#InstanceVariable-{column_name}", agency, "InstanceVariable")

In [62]:
# WideDataSet
def generate_WideDataSet(df_meta):
    """Add a ``WideDataSet`` element to the notebook-level ``root``.

    The element is identified as ``#WideDataSet`` and carries a
    ``DataSet_isStructuredBy_DataStructure`` child that points at
    ``#WideDataStructure``.

    Parameters
    ----------
    df_meta
        Not read by this function; kept so that all ``generate_*`` functions
        share the same call signature.

    Notes
    -----
    Relies on the notebook globals ``root`` and ``agency``. Returns nothing.
    """
    element = add_cdi_element(root, 'WideDataSet')
    add_identifier(element, "#WideDataSet")
    structure_link = add_cdi_element(element, 'DataSet_isStructuredBy_DataStructure')
    add_ddiref(structure_link, "#WideDataStructure", agency, "WideDataStructure")


In [63]:
def generate_InstanceVariable_xml(df_meta):
    """Add one ``InstanceVariable`` element per column to the notebook-level ``root``.

    For every name in ``df_meta.column_names`` the element receives, in order:
    an optional ``displayLabel``, an identifier ``#InstanceVariable-<name>``,
    a ``name``, a ``hasIntendedDataType`` taken from
    ``df_meta.original_variable_types``, an optional reference to the
    variable's sentinel value domain, and a reference to its substantive
    value domain.

    Parameters
    ----------
    df_meta
        Metadata object. Reads ``column_names``, ``column_labels``,
        ``original_variable_types``, ``missing_ranges`` and
        ``missing_user_values``.

    Notes
    -----
    Relies on the notebook globals ``root`` and ``agency``. Returns nothing.
    """
    for idx, variable in enumerate(df_meta.column_names):
        element = add_cdi_element(root, 'InstanceVariable')

        # A column without a label has None here; formatting that into the
        # f-string would write the literal text "None" as the display label,
        # so the displayLabel element is left out in that case.
        label = df_meta.column_labels[idx]
        if label is not None:
            displayLabel = add_cdi_element(element, 'displayLabel')
            languageSpecificString = add_cdi_element(displayLabel, 'languageSpecificString')
            add_cdi_element(languageSpecificString, 'content', f"{label}")

        add_identifier(element, f"#InstanceVariable-{variable}")
        name = add_cdi_element(element, 'name')
        add_cdi_element(name, 'name', f"{variable}")
        hasIntendedDataType = add_cdi_element(element, 'hasIntendedDataType')
        add_cdi_element(hasIntendedDataType, 'name', f"{df_meta.original_variable_types[variable]}")

        # The variable has sentinel (missing) values when it appears in
        # missing_ranges, or - only if missing_ranges is empty altogether -
        # when it appears in missing_user_values.
        has_sentinel_values = variable in df_meta.missing_ranges or (
            len(df_meta.missing_ranges) == 0 and variable in df_meta.missing_user_values
        )
        if has_sentinel_values:
            sentinel_link = add_cdi_element(element, 'RepresentedVariable_takesSentinelValuesFrom_SentinelValueDomain')
            add_ddiref(sentinel_link, f"#SentinelValueDomain-{variable}", agency, "SentinelValueDomain")

        # NOTE(review): this id starts with a lowercase "s" while every other
        # id in this notebook is capitalised - confirm it matches the
        # identifier given to the SubstantiveValueDomain element.
        substantive_link = add_cdi_element(element, 'RepresentedVariable_takesSubstantiveValuesFrom_SubstantiveValueDomain')
        add_ddiref(substantive_link, f"#substantiveValueDomain-{variable}", agency, 'SubstantiveValueDomain')

In [64]:
# Build the document by running each generator once, in this order.
# Every generator passes the shared `root` to add_cdi_element, so re-running
# this cell without re-creating `root` presumably adds the elements again.
for build_section in (
    generate_DataStore,
    generate_LogicalRecord,
    generate_WideDataSet,
    generate_InstanceVariable_xml,
):
    build_section(df_meta)

In [65]:
# Serialise the tree to bytes, including the XML declaration.
xml_string = etree.tostring(
    root,
    xml_declaration=True,
    encoding='UTF-8',
    pretty_print=True,
)

# Insert a version comment on its own line directly after the declaration:
# only the first "?>" (the end of the declaration) is replaced.
version_comment = b'<!-- CDI, version 1, 2024.01.20 -->'
xml_string_with_comment = xml_string.replace(b'?>', b'?>\n' + version_comment, 1)

# Write the bytes unchanged (binary mode, so no newline/encoding translation).
with open('files/CDI.xml', 'wb') as xml_file:
    xml_file.write(xml_string_with_comment)

##### 