In [5]:
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import tkinter as tk
from tkinter import filedialog
import time



# Create a root Tkinter window (hidden)
root = tk.Tk()
root.withdraw()

# Ask user to select a file
print("Select a CSV file to load into a Pandas DataFrame")
file_path = filedialog.askopenfilename(filetypes=[('CSV files', '*.csv')])

# Load selected file into Pandas DataFrame
df = pd.read_csv(file_path)

# Convert the Payment Date column to a Pandas DatetimeIndex
df['Payment Date'] = pd.to_datetime(df['Payment Date'])
df['Payment Total'] = pd.to_numeric(df['Payment Total'].str.replace(',', ''), errors='coerce')



# Check for missing or null values in the Payment Total column
print("Number of missing or null values in Payment Total column:", df['Payment Total'].isna().sum())

# print total number of rows and columns
print("Total number of rows:", len(df))
print("Total number of columns:", len(df.columns))
#print a table of columns and data types
print(df.dtypes)

# Filter data for transactions over 1000
filtered_df = df[df['Payment Total'] > 1000]
#print number of rows 
print("Number of rows after filtering:", len(filtered_df))

#choose the first 10000 rows
filtered_df = filtered_df.head(10000)



Select a CSV file to load into a Pandas DataFrame


  df = pd.read_csv(file_path)


Number of missing or null values in Payment Total column: 0
Total number of rows: 510999
Total number of columns: 23
Supplier Name                      object
Supplier Number                     int64
Payment Number                      int64
Payment Date               datetime64[ns]
Invoice Number                     object
Invoice Date                       object
Payment Total                     float64
Po Number                         float64
Receipt Number                    float64
Department                         object
Account Description                object
Fund Description                   object
Cost Center Description            object
Minority Owned                     object
Small Business                     object
Women Owned                        object
City                               object
State                              object
Zip                                object
Country                            object
Vendor Id                           int64
D

Load file, convert payment date and total to data types
Check total rows and columns and datatypes

In [6]:
def within_1_percent(df):
    """
    Filter data within 1% of each other in the Payment Total column and within 5 days of each other.
    """
    if len(df) >= 2:
        mean_payment = df['Payment Total'].mean()
        upper_limit = mean_payment * 1.01
        lower_limit = mean_payment * 0.99
        max_date = df['Payment Date'].max()
        min_date = df['Payment Date'].min()
        if (max_date - min_date).days <= 5:
            return df[(df['Payment Total'] <= upper_limit) & (df['Payment Total'] >= lower_limit)]
        else:
            return pd.DataFrame()
    else:
        return pd.DataFrame()
        
# Filter data for transactions over 1000
filtered_df = df[df['Payment Total'] > 1000]

#choose the first 10000 rows
#filtered_df = filtered_df.head(10000)

# Group by Supplier Name and filter for transactions within 1% of each other in the Payment Total column
grouped_df = filtered_df.groupby('Supplier Name').apply(within_1_percent).reset_index(drop=True)

# Exclude results with less than two rows per vendor
grouped_df = grouped_df.groupby('Supplier Name').filter(lambda x: len(x) >= 2)

# print total number of rows 
print("Total number of rows:", len(grouped_df))

# Remove empty dataframes from the grouped data
grouped_df = grouped_df[grouped_df['Payment Total'].notna()]



Total number of rows: 375


Filter for 1000
Filter for Same Supplier, +-5d Payment Date, and 1% payment total 

In [7]:
# Save the Excel file
print("Saving Excel file...")


# Create a Tkinter file save dialog box with default extension '.xlsx'
file_path = filedialog.asksaveasfilename(defaultextension='.xlsx', filetypes=[('Excel files', '*.xlsx')])

# Create an Excel writer object
writer = pd.ExcelWriter(file_path, engine='xlsxwriter')

# Write the filtered data to a tab
filtered_df.to_excel(writer, sheet_name='Filtered Data', index=False)

# Write the grouped data to a tab
grouped_df.to_excel(writer, sheet_name='Grouped Data', index=False)




print("Excel file saved successfully!")

# Close the Excel writer object
writer.close()

Saving Excel file...
Excel file saved successfully!


In [8]:
#print first 50 rows of the grouped data just name amount date
print(grouped_df[['Supplier Name', 'Payment Total', 'Payment Date']].head(50))

                               Supplier Name  Payment Total Payment Date
1                    ABB Instrumentation Inc        5895.43   2012-04-18
2                    ABB Instrumentation Inc        5895.43   2012-04-18
3                    AC Valve & Control Corp        1301.85   2014-05-20
4                    AC Valve & Control Corp        1301.85   2014-05-20
5              AMTEC Less-Lethal Systems Inc        1750.80   2017-04-10
6              AMTEC Less-Lethal Systems Inc        1750.80   2017-04-10
7                      AdVnt Biotechnologies        2010.15   2013-05-31
8                      AdVnt Biotechnologies        2010.15   2013-05-31
9                 Adams Fertilizer Equipment        3490.00   2020-02-19
10                Adams Fertilizer Equipment        3490.00   2020-02-19
11                  Advanced Engineered Pump       22379.99   2015-12-07
12                  Advanced Engineered Pump       22379.99   2015-12-07
13                               Agilest LLC       