## ADA: Wedge Process
This repository builds on our Wedge Exploration exercise. This exercise will help you carry out the Wedge project at an A level.

You'll write code that carries out the following steps:

Create an empty data frame called wedge_summary with the following columns: file_name, num_rows, num_cards, num_dates
Iterate over the zip files that hold the Wedge transaction files
Unzip each file one at a time (so this will be part of a for loop)
Use the CSV sniffer to determine the delimiter and whether or not there is a header row.
Read, or attempt to read, the file into a Pandas dataframe, using the delimiter and handling headers correctly.
For each file, store a row in wedge_summary that holds the values listed above. num_cards should be the number of unique card numbers in the file and num_dates should be the number of unique dates.

In [1]:
import os
import zipfile
import pandas as pd
import csv

In [None]:
# Folder scanned by the extraction loop for zipped 'transArchive_' files
data_dir = 'data/WedgeZipOfZips_small/'

The following block of code unzips each zipped file and saves its contents in the 'extracted' folder under the parent data directory.

In [None]:
# Unpack every transaction archive found in data_dir into the 'extracted' folder
for file in os.listdir(data_dir):
    file_path = os.path.join(data_dir, file)  # Full path to the candidate archive

    # Skip directories and anything whose path lacks the archive marker
    if not os.path.isfile(file_path) or 'transArchive_' not in file_path:
        continue

    with zipfile.ZipFile(file_path) as my_zip:
        # extractall() writes every member the archive lists
        my_zip.extractall(path='data/WedgeZipOfZips_small/extracted')

In [2]:
# Detect the delimiter and header presence of a delimited text file with csv.Sniffer
def sniff_csv(file_path):
    """Inspect the beginning of a delimited text file.

    Reads up to 4096 characters from ``file_path`` and passes that sample to
    ``csv.Sniffer``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        A ``(delimiter, has_header)`` tuple: the delimiter character the
        sniffer detected and whether it believes the first row is a header.

    Raises:
        csv.Error: Propagated from the sniffer when it cannot work out the
            dialect of the sample.
    """
    # newline='' hands the text to the csv module with line endings untouched
    with open(file_path, 'r', newline='') as handle:
        head = handle.read(4096)

    detector = csv.Sniffer()
    header_present = detector.has_header(head)
    return detector.sniff(head).delimiter, header_present


In [None]:
# Unify NULL markers: rewrite each extracted file so that the literal tokens
# \\N and \N (backslash forms) both read as NULL.
extracted_dir = 'data/WedgeZipOfZips_small/extracted'
for file in os.listdir(extracted_dir):
    file_path = os.path.join(extracted_dir, file)

    # os.path.isfile has to be called with the path; the bare function object
    # is always truthy and would let sub-directories through to open().
    if not os.path.isfile(file_path):
        continue

    with open(file_path, 'r') as infile:
        # Read the entire content of the file
        content = infile.read()

    # Replace the longer token first: '\\N' contains '\N', so doing the short
    # replacement first would turn '\\N' into '\NULL' and leave a stray backslash.
    modified_content = content.replace(r'\\N', 'NULL').replace(r'\N', 'NULL')

    # Only rewrite files that actually contained a marker
    if modified_content != content:
        with open(file_path, 'w') as outfile:
            outfile.write(modified_content)

In [3]:
# Start with a zero-row summary table: one column per per-file metric
wedge_summary = pd.DataFrame(
    columns=[
        'file_name',
        'num_rows',
        'num_cards',
        'num_dates',
    ],
)

In [4]:
# Folder of extracted transaction files that the summary loop reads.
# NOTE(review): the extraction cell writes to 'data/WedgeZipOfZips_small/extracted',
# while this path omits '_small' -- confirm which data set is intended here.
data_dir = 'data/WedgeZipOfZips/extracted/'

In [5]:
# Column names assigned to transaction files that are read without a header
# row (50 fields, in file order).
col_names = [
    'datetime', 'register_no', 'emp_no', 'trans_no', 'upc',
    'description', 'trans_type', 'trans_subtype', 'trans_status', 'department',
    'quantity', 'Scale', 'cost', 'unitPrice', 'total',
    'regPrice', 'altPrice', 'tax', 'taxexempt', 'foodstamp',
    'wicable', 'discount', 'memDiscount', 'discountable', 'discounttype',
    'voided', 'percentDiscount', 'ItemQtty', 'volDiscType', 'volume',
    'VolSpecial', 'mixMatch', 'matched', 'memType', 'staff',
    'numflag', 'itemstatus', 'tenderstatus', 'charflag', 'varflag',
    'batchHeaderID', 'local', 'organic', 'display', 'receipt',
    'card_no', 'store', 'branch', 'match_id', 'trans_id',
]

This next chunk opens each CSV in the extracted folder, adds its metadata to the summary data frame, and then exports the table as summary.csv.

In [6]:
summary_data = []  # One dict of summary metrics per processed file

# Backslash-N markers that some files use for missing values. Passing them as
# na_values makes read_csv parse them as NaN (in addition to its default NA
# strings), so they are not counted as a card number or a date.
null_tokens = [r'\N', r'\\N']

# Column order of the summary table, fixed so an empty run keeps the schema
summary_columns = ['file_name', 'num_rows', 'num_cards', 'num_dates']

# sorted() keeps the row order of the summary stable; os.listdir order is arbitrary
for file in sorted(os.listdir(data_dir)):
    file_path = os.path.join(data_dir, file)
    if not os.path.isfile(file_path):
        continue

    # Sniff the CSV file for its delimiter and for a header row
    delimiter, has_header = sniff_csv(file_path)

    if has_header:
        df = pd.read_csv(file_path, delimiter=delimiter, header=0,
                         na_values=null_tokens)
    else:
        # Header-less files get the standard column names
        df = pd.read_csv(file_path, delimiter=delimiter, header=None,
                         names=col_names, na_values=null_tokens)

    # Clean column names: remove any leading/trailing spaces
    df.columns = df.columns.str.strip()

    # Extract the metrics of interest
    num_rows = df.shape[0]
    num_cards = df['card_no'].nunique() if 'card_no' in df.columns else 0

    # Convert only when the column exists; indexing it unconditionally would
    # raise KeyError before the fallback to 0 could apply.
    if 'datetime' in df.columns:
        df['datetime'] = pd.to_datetime(df['datetime'])
        num_dates = df['datetime'].dt.date.nunique()
    else:
        num_dates = 0

    # Add the file summary to the summary list
    summary_data.append({
        'file_name': file,
        'num_rows': num_rows,
        'num_cards': num_cards,
        'num_dates': num_dates,
    })

# Create a DataFrame from the summary list
wedge_summary = pd.DataFrame(summary_data, columns=summary_columns)

# Save the summary to CSV
wedge_summary.to_csv('data/summary.csv', index=False)



  df = pd.read_csv(file_path, delimiter=delimiter, header=0)
  df = pd.read_csv(file_path, delimiter=delimiter, header=0)
  df = pd.read_csv(file_path, delimiter=delimiter, header=0)
  df = pd.read_csv(file_path, delimiter=delimiter, header=0)
  df = pd.read_csv(file_path, delimiter=delimiter, header=0)
  df = pd.read_csv(file_path, delimiter=delimiter, header=0)
  df = pd.read_csv(file_path, delimiter=delimiter, header=0)
  df = pd.read_csv(file_path, delimiter=delimiter, header=None, names=col_names)
  df = pd.read_csv(file_path, delimiter=delimiter, header=None, names=col_names)
  df = pd.read_csv(file_path, delimiter=delimiter, header=0)
  df = pd.read_csv(file_path, delimiter=delimiter, header=0)
  df = pd.read_csv(file_path, delimiter=delimiter, header=0)
  df = pd.read_csv(file_path, delimiter=delimiter, header=0)
  df = pd.read_csv(file_path, delimiter=delimiter, header=0)
  df = pd.read_csv(file_path, delimiter=delimiter, header=0)
  df = pd.read_csv(file_path, delimiter=delim