In [3]:
import pandas as pd
import numpy as np

import re
import csv
from io import StringIO
import itertools

In [10]:
class PxFieldDialect(csv.Dialect):
    strict = True
    skipinitialspace = True
    quoting = csv.QUOTE_ALL
    delimiter = ','
    quotechar = '"'
    lineterminator = '\n'
    
class PxDataDialect(csv.Dialect):
    strict = True
    skipinitialspace = True
    quoting = csv.QUOTE_ALL
    delimiter = ' '
    quotechar = '"'
    lineterminator = '\n'
    

def parse_px_field(field, _type=None):
    # _type: "field", "data"
    dialects = {
        "field": PxFieldDialect,
        "data": PxDataDialect,
    }
    
    r = csv.reader(StringIO(field), dialects.get(_type, dialects["field"])())
    
    out = []
    for line in r:
        for item in line:
            # Skip empty strings caused by linebreaks
            if item:
                out.append(item)
        
    return out

In [31]:
filename = "C:/Users/billy/Downloads/20181125051710546145CJQ02.px"
filename2 = "C:/Users/billy/Downloads/2018112521210546594PEA06.px"
filename3 = "C:/Users/billy/Downloads/201811252494610546741LRM07.px"

In [32]:
with open(filename3) as f:
    data = f.read()

## Separate fields

In [33]:
lines = data.split(";")
fields = [l.strip().split("=", 1) for l in lines if l.strip()]
field_dict = dict(fields)

field_dict

{'CHARSET': '"ANSI"',
 'AXIS-VERSION': '"2006"',
 'LANGUAGE': '"en"',
 'CREATION-DATE': '"20181125 02:49"',
 'DECIMALS': '0',
 'SHOWDECIMALS': '0',
 'MATRIX': '"LRM07"',
 'SUBJECT-AREA': '"Census of Population"',
 'SUBJECT-CODE': '"A01"',
 'TITLE': '"Persons on Live Register (Number) by Sex, Age Group, Social Welfare"\n"Office and Month"',
 'CONTENTS': '"Persons on Live Register (Number)"',
 'UNITS': '"Number"',
 'STUB': '"Sex","Age Group","Social Welfare Office"',
 'HEADING': '"Month"',
 'VALUES("Sex")': '"Male","Female"',
 'VALUES("Age Group")': '"Under 25 years","25 years and over"',
 'VALUES("Social Welfare Office")': '"Carlow County","Muine Bheag (Bagenalstown)","Carlow","Tullow","Cavan County",\n"Bailieboro","Ballyconnell","Belturbet","Cavan","Clare County","Ennis","Ennistymon","Kilrush","Tulla","Cork County",\n"Bandon","Bantry","Bantry (SWLO)","Carrigaline","Castletownbere","Clonakilty","Cobh","Cork City","Cork City (Abbeycourt)",\n"Cork City (Hanover)","Dunmanway","Fermoy","Kin

## Parse Headings

In [34]:
head = field_dict["HEADING"].strip("\"")
headers = parse_px_field(field_dict["VALUES(\"{}\")".format(head)])

headers[:5]

['2002M03', '2002M04', '2002M05', '2002M06', '2002M07']

## Find & Parse MultiIndex levels

* Find headers

In [35]:
# value_keys = [k for k in field_dict.keys() if k.startswith("VALUES")]
# keys = [re.match(r'VALUES\(\"(.*)\"\)', k).group(1) for k in value_keys]
# levels = [k for k in keys if k != head]

levels = parse_px_field(field_dict["STUB"])
levels

['Sex', 'Age Group', 'Social Welfare Office']

* Parse values

In [36]:
level_values = []
for lev in levels:
    level_values.append(parse_px_field(field_dict["VALUES(\"{}\")".format(lev)]))

levels_dict = dict(zip(levels, level_values))

levels_dict

{'Sex': ['Male', 'Female'],
 'Age Group': ['Under 25 years', '25 years and over'],
 'Social Welfare Office': ['Carlow County',
  'Muine Bheag (Bagenalstown)',
  'Carlow',
  'Tullow',
  'Cavan County',
  'Bailieboro',
  'Ballyconnell',
  'Belturbet',
  'Cavan',
  'Clare County',
  'Ennis',
  'Ennistymon',
  'Kilrush',
  'Tulla',
  'Cork County',
  'Bandon',
  'Bantry',
  'Bantry (SWLO)',
  'Carrigaline',
  'Castletownbere',
  'Clonakilty',
  'Cobh',
  'Cork City',
  'Cork City (Abbeycourt)',
  'Cork City (Hanover)',
  'Dunmanway',
  'Fermoy',
  'Kinsale',
  'Macroom',
  'Mallow',
  'Midleton',
  'Newmarket',
  'Passage West',
  'Skibbereen',
  'Youghal',
  'Donegal County',
  'Ballybofey',
  'Ballyshannon',
  'Buncrana',
  'Donegal',
  'Donegal Control',
  'Dunfanaghy',
  'Dungloe',
  'Killybegs',
  'Letterkenny',
  'Dublin County',
  'Apollo House (Tara Street)',
  'Balbriggan',
  'Ballyfermot',
  'Ballymun',
  'Bishop Square',
  'Blanchardstown',
  'Clondalkin',
  'Coolock',
  'Cork S

## Parse Data

* Split into stream of cells

In [37]:
data = parse_px_field(field_dict["DATA"], _type="data")

* Convert into table

In [38]:
lines = [data[i:i+len(headers)] for i in range(0, len(data), len(headers))]

lines

[['248',
  '246',
  '225',
  '244',
  '242',
  '247',
  '215',
  '188',
  '220',
  '245',
  '246',
  '260',
  '260',
  '255',
  '250',
  '261',
  '276',
  '283',
  '261',
  '223',
  '233',
  '248',
  '288',
  '289',
  '278',
  '251',
  '235',
  '243',
  '269',
  '290',
  '259',
  '235',
  '238',
  '261',
  '297',
  '286',
  '280',
  '259',
  '261',
  '268',
  '279',
  '274',
  '261',
  '236',
  '251',
  '258',
  '287',
  '283',
  '268',
  '257',
  '269',
  '261',
  '250',
  '261',
  '248',
  '236',
  '231',
  '222',
  '240',
  '240',
  '235',
  '248',
  '241',
  '263',
  '292',
  '307',
  '292',
  '271',
  '302',
  '330',
  '384',
  '419',
  '431',
  '446',
  '477',
  '494',
  '550',
  '577',
  '565',
  '558',
  '592',
  '656',
  '736',
  '807',
  '841',
  '842',
  '866',
  '949',
  '967',
  '962',
  '885',
  '860',
  '881',
  '924',
  '918',
  '930',
  '954',
  '944',
  '983',
  '1045',
  '1035',
  '1014',
  '953',
  '889',
  '887',
  '903',
  '886',
  '875',
  '896',
  '851',
  '880'

* Create MutliIndex from levels

In [39]:
ind = pd.MultiIndex.from_product(list(levels_dict.values()), names=levels_dict.keys())

* Construct DataFrame

In [40]:
df = pd.DataFrame(lines, columns=headers, index=ind)

* Flatten MultiIndex
* Convert all cell dtypes

In [47]:
df2 = df.reset_index().convert_objects()

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  """Entry point for launching an IPython kernel.


In [48]:
df2

Unnamed: 0,Sex,Age Group,Social Welfare Office,2002M03,2002M04,2002M05,2002M06,2002M07,2002M08,2002M09,...,2018M01,2018M02,2018M03,2018M04,2018M05,2018M06,2018M07,2018M08,2018M09,2018M10
0,Male,Under 25 years,Carlow County,248,246,225,244,242,247,215,...,286,297,287,284,290,296,315,312,279,269
1,Male,Under 25 years,Muine Bheag (Bagenalstown),52,49,47,56,58,57,47,...,49,49,43,39,39,42,44,42,41,36
2,Male,Under 25 years,Carlow,153,156,134,143,136,144,125,...,193,206,201,203,204,208,217,214,191,187
3,Male,Under 25 years,Tullow,43,41,44,45,48,46,43,...,44,42,43,42,47,46,54,56,47,46
4,Male,Under 25 years,Cavan County,211,198,199,197,202,209,184,...,268,281,254,236,245,241,252,250,223,214
5,Male,Under 25 years,Bailieboro,..,..,..,..,..,..,..,...,..,..,..,..,..,..,..,..,..,..
6,Male,Under 25 years,Ballyconnell,..,..,..,..,..,..,..,...,46,47,43,41,44,45,48,46,39,39
7,Male,Under 25 years,Belturbet,33,29,32,29,31,32,27,...,..,..,..,..,..,..,..,..,..,..
8,Male,Under 25 years,Cavan,178,169,167,168,171,177,157,...,222,234,211,195,201,196,204,204,184,175
9,Male,Under 25 years,Clare County,399,348,324,328,346,351,328,...,389,385,357,330,329,332,337,320,294,296
