In [1]:
import pandas as pd
import numpy as np
import pickle
from datetime import datetime

class DataExtractor:
    def __init__(self, invoices_file, expired_invoices_file):
        self.invoices_file = invoices_file
        self.expired_invoices_file = expired_invoices_file

    def load_data(self):
        try:
            self.invoices_data = pd.read_pickle(self.invoices_file)
            self.expired_invoices = pd.read_csv(self.expired_invoices_file, header=None)[0].tolist()
            return True
        except FileNotFoundError:
            print("File not found. Please provide correct file paths.")
            return False

    def extract_data(self):
        if not hasattr(self, 'invoices_data') or not hasattr(self, 'expired_invoices'):
            print("Data not loaded. Please load the data first.")
            return None
        
        flat_data = []
        for invoice in self.invoices_data:
            invoice_id = int(invoice['id'])
            created_on = pd.to_datetime(invoice['created_on'])

            for item in invoice['items']:
                item_id = item['item']['id']
                item_name = item['item']['name']
                unit_price = item['item']['unit_price']
                quantity = item['quantity']
                item_type = {0: 'Material', 1: 'Equipment', 2: 'Service', 3: 'Other'}.get(item['item']['type'], 'Unknown')
                total_price = unit_price * quantity
                percentage_in_invoice = total_price / sum([i['item']['unit_price'] * i['quantity'] for i in invoice['items']])
                is_expired = invoice_id in self.expired_invoices

                flat_data.append([invoice_id, created_on, item_id, item_name, item_type, unit_price, total_price, percentage_in_invoice, is_expired])

        columns = ['invoice_id', 'created_on', 'invoiceitem_id', 'invoiceitem_name', 'type', 'unit_price', 'total_price', 'percentage_in_invoice', 'is_expired']
        df = pd.DataFrame(flat_data, columns=columns)
        df.sort_values(by=['invoice_id', 'invoiceitem_id'], inplace=True)
        return df

# Usage example
extractor = DataExtractor("invoices_new.pkl", "expired_invoices.txt")
if extractor.load_data():
    result_df = extractor.extract_data()
    print(result_df.head())  # Print first few rows of the resulting dataframe
    result_df.to_csv("resulting_dataframe.csv", index=False)  # Save resulting dataframe to CSV


ValueError: invalid literal for int() with base 10: '365371O'