## LOST
* https://github.com/camelot-dev/camelot
* https://stackoverflow.com/questions/62044535/how-to-extract-tables-from-pdf-using-camelot
* https://camelot-py.readthedocs.io/en/master/user/quickstart.html#specify-page-numbers

* Install camelot before running this notebook: `pip install "camelot-py[base]"`

In [1]:
import camelot
import pandas as pd
from calitp_data_analysis.sql import to_snakecase

In [2]:
# Notebook display settings: show up to 100 columns, print floats with
# two decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
def open_pdf(file_name: str, pages: str):
    """
    Return every table camelot extracts from a local PDF, stacked
    row-wise into one dataframe.

    Requires: pip install "camelot-py[base]"

    file_name (str): Name of the PDF without the leading "./" and
    without the ".pdf" extension; both are added here, so the file
    is looked up relative to the current working directory.
    Ex: "alameda_b_2000" reads "./alameda_b_2000.pdf".

    pages (str): Comma-separated page numbers the tables are on.
    Ex: "5,6,7" or "41,42".

    Rows keep the index labels of the table they came from (the index
    is not reset), so labels may repeat across tables. An empty
    dataframe is returned when camelot finds no tables.
    """
    # NOTE(review): strip_text removes every "." and newline from the
    # cell text, which also removes decimal points from numbers
    # (e.g. "$19.00" would come out as "$1900") - confirm this is intended.
    opened_pdf = camelot.read_pdf(
        f"./{file_name}.pdf",
        pages=pages,
        flavor="stream",
        strip_text=".\n",
        edge_tol=500,
    )

    # Collect each table's dataframe and concatenate once, rather than
    # growing a dataframe inside the loop.
    # https://stackoverflow.com/questions/62044535/how-to-extract-tables-from-pdf-using-camelot
    tables = [pdf_table.df for pdf_table in opened_pdf]

    # pd.concat raises on an empty list, so keep returning an empty frame.
    if not tables:
        return pd.DataFrame()

    return pd.concat(tables, axis=0)

In [4]:
# Tables on pages 41 and 42 of ./alameda_b_2000.pdf
alameda_b_2000 = open_pdf(file_name="alameda_b_2000", pages="41,42")

In [14]:
# Tables on page 3 of ./monterey_x_2016.pdf
monterey_x_2016 = open_pdf(file_name="monterey_x_2016", pages="3")

In [6]:
# Tables on page 12 of ./madera_t_2006.pdf
madera_t_2006 = open_pdf(file_name="madera_t_2006", pages="12")

In [7]:
def clean_pdf(df, row_to_keep:int, 
              file_name:str,
              notes: str):
    """
    Returns a cleaner copy of a dataframe extracted from a PDF.

    df: the raw dataframe, whose column names are still sitting in
    one of its rows.

    row_to_keep (int): position of the row that holds the column
    names. Every row above it is discarded and the rows below it
    become the data. EX: 6, 13, 20.

    file_name (str): name such as "alameda_b_2000". It is title-cased
    to fill the "program" column ("LOST Alameda B 2000") and its
    first "_"-separated piece fills the "county" column ("Alameda").

    notes (str): text written to the "notes" column of every row.

    The returned dataframe has the snakecased column names plus
    "program", "county" and "notes".
    """
    # Get rid of the unnecessary header info above the column-name row
    df = df.iloc[row_to_keep:].reset_index(drop=True)

    # The first remaining row contains the column names. Assign them as
    # a plain list so the row's label (0) does not become the name of
    # the column axis.
    df.columns = df.iloc[0].tolist()

    # Drop that first row now that it has been promoted to the header
    df = df.iloc[1:].reset_index(drop=True)

    # Snakecase
    df = to_snakecase(df)

    # Keep only rows with at least 2 non-missing values. This runs
    # before the constant columns are added: those are never missing,
    # so counting them would let every row meet the threshold.
    # NOTE(review): only NaN/None count as missing here; empty strings
    # do not - confirm what the extracted cells actually contain.
    df = df.dropna(axis=0, thresh=2)

    # Program, county and notes are the same for every row. assign()
    # returns a new dataframe, so nothing is written into the dropna result.
    program_name = file_name.title().replace('_', ' ')
    county = file_name.split("_")[0].replace("./", "").title()

    return df.assign(
        program=f"LOST {program_name}",
        county=county,
        notes=notes,
    )

In [8]:
# The row at position 5 of the raw extract holds the column names.
alameda_b_2000 = clean_pdf(
    df=alameda_b_2000,
    row_to_keep=5,
    file_name="alameda_b_2000",
    notes="",
)

In [11]:
# Spot-check one random row of the cleaned table
alameda_b_2000.sample(n=1)

Unnamed: 0,project,sponsor,cost__$m_,sales_tax__$m_,program,county,notes
50,San Pablo Corridor Improvements,AC Transit,$1900,$1900,LOST Alameda B 2000,Alameda,


In [10]:
# The very first row of the raw extract holds the column names.
madera_t_2006 = clean_pdf(
    df=madera_t_2006,
    row_to_keep=0,
    file_name="madera_t_2006",
    notes="",
)

In [13]:
# Spot-check five random rows of the cleaned table
madera_t_2006.sample(n=5)

Unnamed: 0,map_#route,limits,description,cost*1,_at_least_20%_*2,funds_*3,"$164,354,000",program,county,notes
29,2G Schnoor,Trevor to Sunset,lanes,"$830,000","$830,000",$0,$0,LOST Madera T 2006,Madera,
10,1IGateway (SR 145),Yosemite to SR 99,Reconstruct/widen from 2 to 4 lanes,"$2,800,000","$560,000","$2,240,000","$60,736,168",LOST Madera T 2006,Madera,
45,,,,"$76,512,593","$66,032,593","$10,480,000",,LOST Madera T 2006,Madera,
30,,,Pavement rehab & restripe to 4,,,,,LOST Madera T 2006,Madera,
43,2R Fig Tree Overpass *12 Over SR 99,,Overpass,"$10,800,000","$10,800,000",$0,$0,LOST Madera T 2006,Madera,


In [15]:
# Display the raw dataframe returned by open_pdf for page 3
monterey_x_2016

Unnamed: 0,0,1
0,,Our Plan
1,ROADS &,
2,POTHOLES,
3,,"Regional Safety, Mobility & Walkability Projects - $240 million (est)"
4,,• Constructs regionally significant improvements selected based on input from key
5,,"community leaders, transportation planners and engineers, and your elected"
6,,representatives
7,,"• Includes traffic flow improvements on Highways 68, 101 and 1"
8,,• Includes safety improvements at the top collision locations and corridors
9,,within the county
