# Data Extraction from Semi-Structured Files

## Install Libraries

In [1]:
!pip3 install -U openpyxl pandas tabula-py



## Extract Semi-Structured Data - Spreadsheet

In [28]:
import pandas as pd
import numpy as np
import datetime
import glob
from IPython.display import display
# Raise the column display limit so wide frames are rendered without truncation.
pd.set_option("display.max_columns", 10000)

In [29]:
# Path of the Excel workbook read by the spreadsheet extraction cell below.
SEMI_STRUCTURED_EXCEL = "test_data/semi_structured_spreadsheet.xlsx"
# Name of the worksheet within that workbook that is passed to pd.read_excel.
SHEET_NAME = "data"

In [30]:
def is_day(s):
    """Return True when ``s`` can be converted to an integer with ``int()``.

    Note that only convertibility is checked; no range check (e.g. 1-31)
    is performed.

    Args:
        s: Candidate value, typically a string such as ``"12"``.

    Returns:
        bool: True if ``int(s)`` succeeds, False otherwise. Inputs that
        ``int()`` rejects with ``TypeError`` (e.g. ``None``) or
        ``OverflowError`` (e.g. ``float("inf")``) yield False instead of
        propagating the exception.
    """
    try:
        int(s)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type such as None; OverflowError: infinities.
        return False
    return True
    
def is_valid_date(dt_str):
    """Return True when ``dt_str`` parses as a date in ``%Y-%m-%d`` format.

    Args:
        dt_str: Candidate date string, e.g. ``"2021-02-28"``.

    Returns:
        bool: True if ``datetime.datetime.strptime(dt_str, '%Y-%m-%d')``
        succeeds, False if the text does not match the format, names an
        impossible calendar date, or is not a string at all (e.g. ``None``).
    """
    try:
        datetime.datetime.strptime(dt_str, '%Y-%m-%d')
    except (TypeError, ValueError):
        # ValueError: wrong format / impossible date; TypeError: non-string input.
        return False
    return True
    
def get_all_sheets(file_name):
    """Return the names of all worksheets in an Excel workbook.

    Args:
        file_name: Path (or file-like object) of the workbook, opened with
            the ``openpyxl`` engine.

    Returns:
        list: The workbook's sheet names as reported by
        ``pandas.ExcelFile.sheet_names``.
    """
    # The context manager closes the underlying file handle once the names
    # have been read, instead of leaving it open until garbage collection.
    with pd.ExcelFile(file_name, engine="openpyxl") as excel:
        return excel.sheet_names

In [31]:
# Read the excel sheet: skip the 3 leading rows, then use the next 2 rows as a
# two-level column header. The raw frame is kept under its own name so that
# re-running the transformation below never operates on already-modified data.
sales_raw = pd.read_excel(
    SEMI_STRUCTURED_EXCEL,
    engine="openpyxl",
    sheet_name=SHEET_NAME,
    header=[0, 1],
    skiprows=3,
)

display(sales_raw.head(10))

# Drop the summary fields (matched on the top header level)
df = sales_raw.drop(columns=["Total Sum of Units", "Total Sum of Order Amount"], level=0)

# Flatten the two header levels into a single "<level 0> <level 1>" name
df.columns = [' '.join(col).strip() for col in df.columns.values]

# Rename the columns whose top header level was empty in the sheet
df = df.rename(columns={"Unnamed: 0_level_0 Country": "Country", "Unnamed: 1_level_0 Salesperson": "Salesperson"})

# Forward fill for Country and Salesperson (blank cells inherit the value above)
df.loc[:, ["Country", "Salesperson"]] = df.loc[:, ["Country", "Salesperson"]].ffill()

# Drop the result (subtotal) rows; na=False keeps the mask boolean even if a
# Country cell is still missing after the forward fill
df = df[~df["Country"].str.contains("Result", na=False)]

# Fill NaN with 0 (non-inplace: the frame above is a filtered slice)
df = df.fillna(0)

# Convert columns to rows
df = df.melt(
    id_vars=["Country", "Salesperson"],
    var_name="Type",
    value_name="Value",
)

# Create new column for year and type: the flattened name is "<YYYY> <measure>"
df['Year'] = df['Type'].str.slice(start=0, stop=4)
df['Type'] = df['Type'].str.slice(start=5)

# Change the description. Assign the result back instead of calling
# replace(inplace=True) on the column selection, which is chained assignment
# and does not reliably update the parent frame.
df['Type'] = df['Type'].replace({"Sum of Units": "Units", "Sum of Order Amount": "Order Amount"})

# Pivot it: one row per (Country, Salesperson, Year), one column per measure
df = df.pivot(index=["Country", "Salesperson", "Year"], columns=["Type"], values="Value")

# Reset the index so the key fields become ordinary columns again
df = df.reset_index()

display(df.head(20))

# Save to CSV
df.to_csv(path_or_buf="test_data/structured_data.csv", index=False)

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,2011,2011,2012,2012,2013,2013,Total Sum of Units,Total Sum of Order Amount
Unnamed: 0_level_1,Country,Salesperson,Sum of Units,Sum of Order Amount,Sum of Units,Sum of Order Amount,Sum of Units,Sum of Order Amount,Unnamed: 8_level_1,Unnamed: 9_level_1
0,UK,Bromley,232.0,24756.89,228.0,40396.64,73.0,9894.51,533.0,75048.04
1,,Coghill,81.0,4029.25,39.0,4657.11,,,120.0,8686.36
2,,Farnham,170.0,14055.87,44.0,5892.65,17.0,2560.4,231.0,22508.92
3,,Gillingham,397.0,40826.37,276.0,17181.58,202.0,14519.68,875.0,72527.63
4,,Gloucester,209.0,31433.16,143.0,19691.89,135.0,17667.2,487.0,68792.25
5,,Rayleigh,422.0,59827.19,268.0,41903.64,131.0,15232.16,821.0,116962.99
6,UK Result,,1511.0,174928.73,998.0,129723.51,558.0,59873.95,3067.0,364526.19
7,USA,Bromley,58.0,7553.95,27.0,3654.0,7.0,1101.2,92.0,12309.15
8,,Callahan,623.0,49400.07,337.0,43263.95,200.0,18059.5,1160.0,110723.52
9,,Coghill,885.0,120626.31,520.0,46505.9,405.0,49945.11,1810.0,217077.32


Type,Country,Salesperson,Year,Order Amount,Units
0,UK,Bromley,2011,24756.89,232.0
1,UK,Bromley,2012,40396.64,228.0
2,UK,Bromley,2013,9894.51,73.0
3,UK,Coghill,2011,4029.25,81.0
4,UK,Coghill,2012,4657.11,39.0
5,UK,Coghill,2013,0.0,0.0
6,UK,Farnham,2011,14055.87,170.0
7,UK,Farnham,2012,5892.65,44.0
8,UK,Farnham,2013,2560.4,17.0
9,UK,Gillingham,2011,40826.37,397.0


## Extract Semi-Structured Data - PDF

In [6]:
# Path of the PDF document whose table is extracted in the next cell.
SEMI_STRUCTURED_PDF = "test_data/semi_structured_pdf.pdf"

In [26]:
import tabula

# Extract tables from every page of the PDF; the result is indexed as a list
# and only its first entry is used below.
dfs = tabula.read_pdf(SEMI_STRUCTURED_PDF, pages='all')
if not dfs:
    # Fail with an explicit message rather than a bare IndexError on dfs[0].
    raise ValueError(f"No tables were detected in {SEMI_STRUCTURED_PDF!r}")
df = dfs[0]

# Convert first 2 rows to columns: they hold the two header levels, so promote
# them to a MultiIndex and keep only the remaining data rows.
df.columns = pd.MultiIndex.from_arrays(df.iloc[0:2].values)
df = df.iloc[2:]

df.head(10)

Unnamed: 0_level_0,NaN,NaN,2011,NaN,NaN,2012,NaN,NaN,2013,NaN,NaN,Total Sum of Units,Total Sum of Order Amount
Unnamed: 0_level_1,Country,Salesperson,Sum of Units,NaN,Sum of Order Amount,Sum of Units,NaN.1,Sum of Order Amount.1,Sum of Units,NaN.2,Sum of Order Amount.2,NaN,NaN
2,UK,Bromley,,232.0,24756.89,228,,40396.64,,73.0,9894.51,533,75048.04
3,,Coghill,,81.0,4029.25,39,,4657.11,,,,120,8686.36
4,,Farnham,,170.0,14055.87,44,,5892.65,,17.0,2560.4,231,22508.92
5,,Gillingham,,397.0,40826.37,276,,17181.58,,202.0,14519.68,875,72527.63
6,,Gloucester,,209.0,31433.16,143,,19691.89,,135.0,17667.2,487,68792.25
7,,Rayleigh,,422.0,59827.19,268,,41903.64,,131.0,15232.16,821,116962.99
8,UK Result,,,1511.0,174928.73,998,,129723.51,,558.0,59873.95,3067,364526.19
9,USA,Bromley,,58.0,7553.95,27,,3654.0,,7.0,1101.2,92,12309.15
10,,Callahan,,623.0,49400.07,337,,43263.95,,200.0,18059.5,1160,110723.52
11,,Coghill,,885.0,120626.31,520,,46505.9,,405.0,49945.11,1810,217077.32
