In [3]:
import pandas as pd
import os.path
import pickle
from pathlib import Path

class MpcSpreadsheet(object):
    """Parent class for all ipums.metadata spreadsheet child classes.

    All spreadsheet classes in the ipums.metadata namespace inherit from
    the MpcSpreadsheet parent class. MpcSpreadsheet provides generic
    methods for accessing spreadsheet information by parsing Excel files
    and storing them in a Pandas data frame.
    """

    def __init__(self, xlpath, projdir):
        """Initialize parent class."""
        self.xlpath = xlpath
        self.projdir = projdir
        self.pklpath = self.get_pickle_path()
        fresh = self.fresh_pickle()
        if fresh:
            self.ws = pickle.load(open(pklpath, 'rb'))
        else:
            if os.path.exists(xlpath):
                self.ws = pd.read_excel(xlpath)
                utilities.pickle_dataframe(self.ws, pklpath)
            else:
                raise FileNotFoundError(
                    "File not found: " + xlpath)

        # as a convention, uppercase all columns
        # and strip trailing whitespace
        self.ws.columns = map(str.upper, self.ws.columns)
        self.ws.columns = map(str.rstrip, self.ws.columns)
    
    def get_pickle_path(self):
        """Return path to a pickle file based on project and path to xls file.

        get_pickle_path will take a project and a path to a spreadsheet and
        return the correct path to its associated pickle file, whether that
        pickle file exists or not.
        Since these pickle files are Pandas data frames,
        they are in pandas version-specific directories.
        Example:
            utilities.get_pickle_path('ipumsi', 'metadata/samples.xlsx')
            returns:
            /pkg/ipums/ipumsi/metadata/.pickle/0.19.1/metadata/samples.pkl"""


        xlPath = Path(xlpath)
        relpath = xlPath.relative_to(self.projdir)
        root = str(relpath.parent)
        pklpath = '/'.join(['.pickle', root, relpath.stem, '.pkl'])
        # stringify Path object makes return string cross-platform
        return str(Path(pklpath))

        
    def fresh_pickle(self):
        """Return a boolean as to whether the pickle file is fresh.

        A pickled spreadsheet is only valid if it:
            1. exists
            2. is newer than the last updated time for the spreadsheet.
        fresh_pickle() identifies whether the pickled file should be used.
        """
        if os.path.isfile(self.pklpath):
            if os.path.getmtime(self.pklpath) > os.path.getmtime(self.xlpath):
                return True
        return False

    def pickle_dataframe(self):
        """Pickle data frame to disk.

        pickle_dataframe() writes the data frame self.ws to disk.
        """
        try:
            print("pickling to:", self.pklpath)
            picklepath = Path(self.pklpath)
            pickle_dir = str(picklepath.parent)
            if not os.path.exists(pickle_dir):
                os.makedirs(str(pickle_dir))
            f = picklepath.open(mode='wb')
            pickle.dump(self.ws, f)
            f.close()
            return True
        except:
            return False



In [7]:

%%timeit
v  = MpcSpreadsheet('./LoughranMcDonald_MasterDictionary_2020.xlsx', '/whiskywithmlbackend/semanticAnalysis/files')

NameError: name 'xlpath' is not defined

In [None]:
%%timeit
for v in all_vars[0:1000]:
    sheet = MpcSpreadsheet('variables/' + v + '.xls', '/my/work/dir')