# Development of the DataDictionary class

## Set up a filepath to an .rtf data dictionary

In [54]:
# Filepath of the household-level .rtf data dictionary used throughout this notebook.
# NOTE: this is an absolute, machine-specific path; change it to match your own copy of the data.
fp=r'C:\Users\cvskf\OneDrive - Loughborough University\_Data\United_Kingdom_Time_Use_Survey_2014-2015' + \
   r'\UKDA-8128-tab\mrdoc\allissue\uktus15_household_ukda_data_dictionary.rtf'

## Set up DataDictionary class

In [55]:
class DataDictionary:
    """Reader for a UK Data Service .rtf data dictionary file.
    """

    def __init__(self, fp=None):
        """Creates the instance and, when a filepath is supplied, reads it straight away.

        Arguments:
            fp (str): a filepath to a UK Data Service .rtf data dictionary file.
                If omitted (or otherwise falsy) nothing is read.

        """
        # Guard clause: without a usable filepath there is nothing to load.
        if not fp:
            return
        self.read_rtf(fp)
        

In [56]:
# No filepath is passed, so the constructor does not read any file.
dd=DataDictionary()

## get_variable_list

Returns a list of dictionaries which contain the information in the rtf file.

In [74]:
from pprint import pprint

def get_variable_list(self):
    """Returns a list which contains the information in a UK Data Service .rtf data dictionary file.
    
    The text held in ``self.rtf`` is split on the 'Pos. = ' marker. Everything before
    the first marker is discarded and each remaining chunk is parsed as one variable.
    
    Returns:
        - (list): a list of dictionaries, one per variable, in file order. 
          Each dictionary has the following items: 
             {'pos': ... ,
              'variable': ... ,
              'variable_label': ... ,
              'variable_type': ... ,
              'SPSS_measurement_level': ... ,
              'SPSS_user_missing_values': ... ,
              'value_labels': ... }
          'variable_type', 'SPSS_measurement_level' and 'SPSS_user_missing_values' are 
          empty strings when their marker text is absent from the chunk.
          'value_labels' is an empty string when the chunk has no 'Value = ' entries, 
          otherwise a dictionary {value: label}. A value is stored as a float when it 
          can be converted, and as the original string when it cannot.
          
    Raises:
        IndexError: if a chunk has no 'Variable = ' or 'Variable label = ' marker, 
            or a 'Value = ' entry has no 'Label = ' marker.
    
    """
    
    def text_after(text,marker):
        "Returns the text between the first and second occurrence of marker (or up to the end of text)"
        return text.split(marker)[1]
    
    def from_space_to(text,terminator):
        "Returns the text from just after the first space up to the first terminator, both located from the start of text"
        return text[text.find(' ')+1:text.find(terminator)]
    
    def from_space_to_next(text,terminator):
        "Returns the text from just after the first space up to the first terminator found after that space"
        start=text.find(' ')+1
        return text[start:start+text[start:].find(terminator)]
    
    def find_pos(chunk):
        return from_space_to(chunk,'\t')
    
    def find_variable_name(chunk):
        return from_space_to(text_after(chunk,'Variable = '),'\t')
    
    def find_variable_label(chunk):
        return from_space_to(text_after(chunk,'Variable label = '),'\\par')
    
    def find_variable_type(chunk):
        marker='This variable is  '
        if marker not in chunk: return ''
        return from_space_to_next(text_after(chunk,marker),'}')
    
    def find_SPSS_measurement_level(chunk):
        marker='the SPSS measurement level is '
        if marker not in chunk: return ''
        return from_space_to_next(text_after(chunk,marker),'\\par')
    
    def find_SPSS_user_missing_values(chunk):
        marker='SPSS user missing values = '
        if marker not in chunk: return ''
        tokens=from_space_to_next(text_after(chunk,marker),'\\par').split(' ')
        # Keep the 1st, 3rd and 5th space-separated tokens; the tokens in between are
        # presumably rtf formatting groups - TODO confirm against more files.
        # The slice (rather than indexing [0], [2], [4]) gives the same result when five
        # or more tokens are present and avoids an IndexError when there are fewer,
        # e.g. when only a single missing value is listed.
        return ' '.join(tokens[0:5:2])
    
    def to_value_key(raw_value):
        "Returns raw_value as a float when it is numeric, otherwise unchanged"
        try:
            return float(raw_value)
        except ValueError:
            # A non-numeric value would otherwise abort the parsing of the whole file.
            return raw_value
    
    def find_value_labels(chunk):
        marker='Value = '
        if marker not in chunk: return ''
        labels={}
        for entry in chunk.split(marker)[1:]:
            value=to_value_key(from_space_to(entry,'\t'))
            labels[value]=from_space_to(text_after(entry,'Label = '),'\\par')
        return labels
    
    result=[]
    # The text before the first 'Pos. = ' marker is not part of any variable, so it is skipped.
    for variable_text in self.rtf.split('Pos. = ')[1:]:
        result.append({'pos':find_pos(variable_text),
                       'variable':find_variable_name(variable_text),
                       'variable_label':find_variable_label(variable_text),
                       'variable_type':find_variable_type(variable_text),
                       'SPSS_measurement_level':find_SPSS_measurement_level(variable_text),
                       'SPSS_user_missing_values':find_SPSS_user_missing_values(variable_text),
                       'value_labels':find_value_labels(variable_text)
                       })
        
    return result
    
# Attach the function to the class so that it can be called as a method on DataDictionary instances.
DataDictionary.get_variable_list=get_variable_list

In [75]:
dd=DataDictionary()
# Load the raw file text by hand and store it on the instance, because
# get_variable_list reads its input from the 'rtf' attribute.
with open (fp, "r") as myfile:
        dd.rtf=myfile.read()
# Show only the first four parsed variables.
dd.get_variable_list()[:4]

[{'pos': '1',
  'variable': 'serial',
  'variable_label': 'Household number',
  'variable_type': 'numeric',
  'SPSS_measurement_level': 'SCALE',
  'SPSS_user_missing_values': '',
  'value_labels': ''},
 {'pos': '2',
  'variable': 'strata',
  'variable_label': 'Strata',
  'variable_type': 'numeric',
  'SPSS_measurement_level': 'SCALE',
  'SPSS_user_missing_values': '',
  'value_labels': {-2.0: 'Schedule not applicable'}},
 {'pos': '3',
  'variable': 'psu',
  'variable_label': 'Primary sampling unit',
  'variable_type': 'numeric',
  'SPSS_measurement_level': 'SCALE',
  'SPSS_user_missing_values': '',
  'value_labels': {-2.0: 'Schedule not applicable'}},
 {'pos': '4',
  'variable': 'HhOut',
  'variable_label': 'Final outcome - household',
  'variable_type': 'numeric',
  'SPSS_measurement_level': 'SCALE',
  'SPSS_user_missing_values': '',
  'value_labels': {0.0: 'Outstanding',
   640.0: 'Unknown whether address is residential: No contact after 6+ calls',
   214.0: 'Productive : Household q

## read_rtf

Read in a .rtf data dictionary file

In [76]:
def read_rtf(self, fp):
    """Loads a UK Data Service .rtf data dictionary file into this instance.

    The complete file text is stored in the 'rtf' attribute, and the result of
    calling get_variable_list() is stored in the 'variable_list' attribute.

    Arguments:
        fp (str): a filepath to a UK Data Service .rtf data dictionary file

    """
    with open(fp, "r") as rtf_handle:
        file_text = rtf_handle.read()
    self.rtf = file_text

    # Parsing relies on self.rtf, so it has to happen after the text is stored.
    parsed_variables = self.get_variable_list()
    self.variable_list = parsed_variables
    
# Attach the function to the class so that it can be called as a method on DataDictionary instances.
DataDictionary.read_rtf=read_rtf

Opens the rtf file and shows the first 1000 characters of its contents

In [77]:
dd=DataDictionary()
# Read the file; this sets both dd.rtf and dd.variable_list.
dd.read_rtf(fp)
# Show only the first 1000 characters of the raw file text.
dd.rtf[:1000]

'{\\rtf1\\ansi\\deff0\\deftab1200{\\fonttbl{\\f0\\fswiss MS Sans Serif;}{\\f1\\froman\\fcharset2 Symbol;}{\\f2\\fswiss Arial;}{\\f3\\fswiss\\fprq2 Arial;}{\\f4\\fmodern\\fprq5 Courier New;}{\\f5\\fswiss Arial;}}{\\colortbl;\\red0\\green0\\blue0;\\red255\\green0\\blue0;\\red100\\green100\\blue100;\\red0\\green0\\blue255;\\red10\\green10\\blue160;}\\deflang2057\\pard\\plain\\f2\\fs20\\cf1\\par {\\fs28\\b\\ul UK Data Archive Data Dictionary\\par\\par }{\\b\\f2\\fs20\\cf1\\ File-level information:\\par\\par }{\\f2\\fs20\\cf1 File Name = \t\t\\f2\\fs20\\cf5uktus15_household\\par }{\\f2\\fs20\\cf1 Number of variables = \t\\f2\\fs20\\cf5 335\\par }{\\f2\\fs20\\cf1 Number of cases = \t\\f2\\fs20\\cf5 4733\\par\\par\\par }{\\f2\\fs20\\cf1\\b Variable-level information:\\par }{\\cf1\\b\\par Pos. = }{\\f2\\fs20\\cf4 1\t}{\\b\\cf1 Variable = }{\\f2\\fs20\\cf4 serial\t}{\\b\\cf1 Variable label = }{\\cf4 Household number\\par }{\\cf3 This variable is  }{\\cf5\\i numeric}{\\cf3, the SPSS measurement 

In [78]:
# First four entries of the list stored by read_rtf.
dd.variable_list[:4]

[{'pos': '1',
  'variable': 'serial',
  'variable_label': 'Household number',
  'variable_type': 'numeric',
  'SPSS_measurement_level': 'SCALE',
  'SPSS_user_missing_values': '',
  'value_labels': ''},
 {'pos': '2',
  'variable': 'strata',
  'variable_label': 'Strata',
  'variable_type': 'numeric',
  'SPSS_measurement_level': 'SCALE',
  'SPSS_user_missing_values': '',
  'value_labels': {-2.0: 'Schedule not applicable'}},
 {'pos': '3',
  'variable': 'psu',
  'variable_label': 'Primary sampling unit',
  'variable_type': 'numeric',
  'SPSS_measurement_level': 'SCALE',
  'SPSS_user_missing_values': '',
  'value_labels': {-2.0: 'Schedule not applicable'}},
 {'pos': '4',
  'variable': 'HhOut',
  'variable_label': 'Final outcome - household',
  'variable_type': 'numeric',
  'SPSS_measurement_level': 'SCALE',
  'SPSS_user_missing_values': '',
  'value_labels': {0.0: 'Outstanding',
   640.0: 'Unknown whether address is residential: No contact after 6+ calls',
   214.0: 'Productive : Household q

## get_variable_dict

In [79]:
def get_variable_dict(self,variable):
    """Returns the dictionary for a variable
    
    The entries of self.variable_list are searched in order and the first one 
    whose 'variable' item equals the given name is returned.
    
    Arguments:
        - variable (str): the name of the variable
        
    Returns:
        - (dict): A dictionary with the following items: 
             {'pos': ... ,
              'variable': ... ,
              'variable_label': ... ,
              'variable_type': ... ,
              'SPSS_measurement_level': ... ,
              'SPSS_user_missing_values': ... ,
              'value_labels': ... }
              
    Raises:
        IndexError: if no entry has the given variable name.
    
    """
    for variable_dict in self.variable_list:
        if variable_dict['variable']==variable:
            # Stop at the first match instead of building a list of all matches.
            return variable_dict
    # IndexError is kept as the exception type for unknown names; only the message is new.
    raise IndexError('no variable named %r in the data dictionary' % (variable,))
    
# Attach the function to the class so that it can be called as a method on DataDictionary instances.
DataDictionary.get_variable_dict=get_variable_dict

In [80]:
# Look up the entry whose 'variable' item is 'psu'.
dd.get_variable_dict('psu')

{'pos': '3',
 'variable': 'psu',
 'variable_label': 'Primary sampling unit',
 'variable_type': 'numeric',
 'SPSS_measurement_level': 'SCALE',
 'SPSS_user_missing_values': '',
 'value_labels': {-2.0: 'Schedule not applicable'}}

## get_variable_names

In [81]:
def get_variable_names(self):
    """Collects the name of every variable in the data dictionary.

    Returns:
        - (list): the 'variable' item of each entry in self.variable_list,
          in the same order as that list

    """
    names = []
    for entry in self.variable_list:
        names.append(entry['variable'])
    return names
    
# Attach the function to the class so that it can be called as a method on DataDictionary instances.
DataDictionary.get_variable_names=get_variable_names

In [82]:
# List every variable name, in the order of dd.variable_list.
dd.get_variable_names()

['serial',
 'strata',
 'psu',
 'HhOut',
 'hh_wt',
 'IMonth',
 'IYear',
 'DM014',
 'DM016',
 'DM510',
 'DM1115',
 'DM1619',
 'NumAdult',
 'NumChild',
 'NumSSex',
 'NumCPart',
 'NumMPart',
 'NumCivP',
 'DVHsize',
 'Relsize',
 'SelPer',
 'CCPersNo',
 'Accom',
 'Hhldr1',
 'Hhldr2',
 'Hhldr3',
 'Hhldr4',
 'Hhldr5',
 'Hhldr6',
 'Hhldr7',
 'Hhldr8',
 'Hhldr9',
 'Hhldr10',
 'HiHNum',
 'Tenure',
 'NumRooms',
 'TVSet',
 'TVSetNum',
 'Cable',
 'CableNum',
 'Games',
 'GamesNum',
 'Land',
 'LandNum',
 'Mob',
 'MobPerm',
 'Comp',
 'CompNum',
 'Microwav',
 'Dishwash',
 'WashMach',
 'Tumble',
 'Freezer',
 'HmIntnet',
 'IntAcc1',
 'IntAcc2',
 'IntAcc3',
 'IntAcc4',
 'IntAcc5',
 'IntAcc6',
 'IntPurch',
 'VehOwn',
 'VehNum',
 'Repairs',
 'Wages',
 'SelfEmp',
 'Pension',
 'UnempBen',
 'BenOth',
 'Invest',
 'IncOth',
 'Income',
 'IncCat',
 'Help1',
 'Help2',
 'Help3',
 'Help4',
 'Help5',
 'Help6',
 'Help7',
 'Help8',
 'Help9',
 'Help10',
 'Help11',
 'Help12',
 'Help13',
 'Help14',
 'Help15',
 'Help16',
 'H