In [1]:
# Import libraries that we're going to use
import pandas as pd
import shapefile
import glob
import os

In [2]:
# Build the list of postcode shapefiles (pc*.shp) that the later cells will process.
# glob.glob() returns matches in arbitrary, filesystem-dependent order, so the
# result is sorted to make the processing order deterministic across runs.
files = sorted(glob.glob(r"F:\pc_by_year\pc_by_year\test\pc*.shp"))

In [3]:
# Display the matched shapefile paths to confirm the glob pattern picked up the expected files:
files

['F:\\pc_by_year\\pc_by_year\\test\\pc1994_810111_intersects.shp',
 'F:\\pc_by_year\\pc_by_year\\test\\pc1995_810111_intersects.shp',
 'F:\\pc_by_year\\pc_by_year\\test\\pc1996_810111_intersects.shp',
 'F:\\pc_by_year\\pc_by_year\\test\\pc1997_810111_intersects.shp',
 'F:\\pc_by_year\\pc_by_year\\test\\pc1998_810111_intersects.shp',
 'F:\\pc_by_year\\pc_by_year\\test\\pc1999_810111_intersects.shp',
 'F:\\pc_by_year\\pc_by_year\\test\\pc2000_810111_intersects.shp',
 'F:\\pc_by_year\\pc_by_year\\test\\pc2001_810111_intersects.shp',
 'F:\\pc_by_year\\pc_by_year\\test\\pc2002_810111_intersects.shp',
 'F:\\pc_by_year\\pc_by_year\\test\\pc2003_810111_intersects.shp',
 'F:\\pc_by_year\\pc_by_year\\test\\pc2004_810111_intersects.shp',
 'F:\\pc_by_year\\pc_by_year\\test\\pc2005_810111_intersects.shp',
 'F:\\pc_by_year\\pc_by_year\\test\\pc2006_810111_intersects.shp',
 'F:\\pc_by_year\\pc_by_year\\test\\pc2007_810111_intersects.shp',
 'F:\\pc_by_year\\pc_by_year\\test\\pc2008_810111_intersects.s

In [4]:
# MSOA data
# Read the master MSOA shapefile into a DataFrame (attribute table plus point coordinates).
shapefile_path = r"F:\pc_by_year\pc_by_year\test\msoa_master.shp"

# Open the reader as a context manager so its underlying file handles are closed
# as soon as the data has been loaded into memory (requires pyshp >= 2.0).
with shapefile.Reader(shapefile_path) as sf:
    # Grab the shapefile's field names (omit the first pseudo field)
    fields = [x[0] for x in sf.fields][1:]
    # Attribute rows, one per shape
    records = sf.records()
    # Coordinate list of every shape, in the same order as the records
    shps = [s.points for s in sf.shapes()]

# Write the records into a dataframe, one column per field name
shapefile_dataframe = pd.DataFrame(columns=fields, data=records)

# Add the coordinate data to a column called "coords"
msoa = shapefile_dataframe.assign(coords=shps)

In [5]:
# Reduce the MSOA table to its code column only. Selecting with a list of labels
# keeps the result a one-column DataFrame rather than a Series.
msoa = msoa.loc[:, ['msoa11']]

In [6]:
# Display the MSOA subset to check that only the 'msoa11' code column remains
msoa

Unnamed: 0,msoa11
0,S02002057
1,E02004809
2,E02000992
3,S02002353
4,E02006418
5,S02001805
6,E02003203
7,E02006294
8,S02002053
9,S02001536


In [8]:
# Loop over each postcode shapefile in the file list and, for each one:
#   1. read its attribute records and shape coordinates into a DataFrame,
#   2. inner-join it with `msoa` so only postcodes in the listed MSOAs remain,
#   3. sort the result by MSOA code and write it to a CSV file.

for file in files:
    # Report which file is being processed *before* the work starts, so progress is
    # visible and a failure can be traced to the file that caused it
    print('Processing {fname}'.format(fname=file))

    # The entry from the file list is already the full path, so use it as-is
    shapefilepath = file

    # Read in the shapefile. The context manager closes the reader's file handles
    # at the end of every iteration instead of leaving one set open per file
    # (requires pyshp >= 2.0).
    with shapefile.Reader(shapefilepath) as sf:
        # Grab the shapefile's field names (omit the first pseudo field)
        fields = [x[0] for x in sf.fields][1:]
        records = sf.records()
        shps = [s.points for s in sf.shapes()]

    # Write the records into a dataframe
    shapefile_dataframe = pd.DataFrame(columns=fields, data=records)

    # Add the coordinate data to a column called "coords"
    postcode = shapefile_dataframe.assign(coords=shps)

    # Inner-join on the shared 'msoa11' column to obtain only the postcodes located
    # within the listed MSOAs
    merged = msoa.merge(postcode, how='inner', on='msoa11')

    # Sort values by MSOA
    sorted_MSOA = merged.sort_values('msoa11')

    # Save as CSV next to the shapefile; the name is the full shapefile path with
    # ".csv" appended (so it ends in ".shp.csv")
    sorted_MSOA.to_csv(r'{fname}.csv'.format(fname=file))

Processing F:\pc_by_year\pc_by_year\test\pc1994_810111_intersects.shp
Processing F:\pc_by_year\pc_by_year\test\pc1995_810111_intersects.shp
Processing F:\pc_by_year\pc_by_year\test\pc1996_810111_intersects.shp
Processing F:\pc_by_year\pc_by_year\test\pc1997_810111_intersects.shp
Processing F:\pc_by_year\pc_by_year\test\pc1998_810111_intersects.shp
Processing F:\pc_by_year\pc_by_year\test\pc1999_810111_intersects.shp
Processing F:\pc_by_year\pc_by_year\test\pc2000_810111_intersects.shp
Processing F:\pc_by_year\pc_by_year\test\pc2001_810111_intersects.shp
Processing F:\pc_by_year\pc_by_year\test\pc2002_810111_intersects.shp
Processing F:\pc_by_year\pc_by_year\test\pc2003_810111_intersects.shp
Processing F:\pc_by_year\pc_by_year\test\pc2004_810111_intersects.shp
Processing F:\pc_by_year\pc_by_year\test\pc2005_810111_intersects.shp
Processing F:\pc_by_year\pc_by_year\test\pc2006_810111_intersects.shp
Processing F:\pc_by_year\pc_by_year\test\pc2007_810111_intersects.shp
Processing F:\pc_by_

In [None]:
# Other code... 

In [13]:
# A dataframe can be reduced to a chosen set of columns by listing their labels.
# The result is a new DataFrame holding just those columns, in the order given.
# NOTE(review): 'year_x' is not part of this selection although 'year_y' is — confirm this is intentional.

blah2 = blah.loc[:, [
    'msoa11', 'frequency',
    'sum_x_pc_p', 'sum_y_pc_p',
    'count_post', 'first_msoa', 'sum_popula',
    'fid_buffer', 'oid_', 'name', 'folderpath', 'symbolid',
    'altmode', 'base', 'clamped', 'extruded', 'snippet', 'popupinfo',
    'shape_leng', 'shape_area',
    'x_centorid', 'y_centroid',
    'buff_dist', 'orig_fid', '_merge', 'flag_polyg',
    'postcode_x', 'xcoordinat_x', 'ycoordinat_x',
    'coa11_x', 'lsoa11_x', 'msoa11_1', 'lad11_x', 'coords_x',
    'gid', 'year_y', 'postcode_y',
]]

In [9]:
# Show every column heading of the dataframe as a plain Python list
list(blah.columns)

['fid1',
 'fid_msoa_p',
 'fid_',
 'msoa11',
 'frequency',
 'sum_x_pc_p',
 'sum_y_pc_p',
 'count_post',
 'first_msoa',
 'sum_popula',
 'fid_buffer',
 'oid_',
 'name',
 'folderpath',
 'symbolid',
 'altmode',
 'base',
 'clamped',
 'extruded',
 'snippet',
 'popupinfo',
 'shape_leng',
 'shape_area',
 'x_centorid',
 'y_centroid',
 'buff_dist',
 'orig_fid',
 '_merge',
 'flag_polyg',
 'year_x',
 'postcode_x',
 'xcoordinat_x',
 'ycoordinat_x',
 'coa11_x',
 'lsoa11_x',
 'msoa11_1',
 'lad11_x',
 'coords_x',
 'gid',
 'year_y',
 'postcode_y',
 'xcoordinat_y',
 'ycoordinat_y',
 'coa11_y',
 'lsoa11_y',
 'lad11_y',
 'ed81',
 'coa01',
 'lsoa01',
 'ladua01',
 'coords_y']