In [1]:
import pandas as pd
import numpy as np
from string import punctuation

In [2]:
# Load the raw customer stock-transaction records from the CSV file
df = pd.read_csv(filepath_or_buffer='transactions.csv')

In [3]:
# Take the spaces out of the multi-word column titles, and rename 'Status'
# to 'TransactionStatus'
column_renames = {
    'Account ID': 'AccountID',
    'Transaction ID': 'TransactionID',
    'Account Type': 'AccountType',
    'Trade Date': 'TradeDate',
    'Settlement Date': 'SettlementDate',
    'Price Per Share': 'PricePerShare',
    'Status': 'TransactionStatus',
}
df = df.rename(columns=column_renames)

In [4]:
# Upper-case the text columns so every entry uses the same letter case.
# Each value is cast to str first, so a missing value becomes the literal
# string 'NAN' (these are turned back into nulls near the end of the notebook).
text_columns = ['Name', 'Asset', 'Ticker', 'AccountType', 'PricePerShare', 'TransactionStatus', 'Notes']
df[text_columns] = df[text_columns].apply(lambda col: col.astype(str).str.upper())

In [5]:
# Remove every row that is an exact copy (same value in every column) of an
# earlier row, keeping the first occurrence
df = df.drop_duplicates(keep='first')

In [6]:
# 'TransactionID' should be unique to a single row, so show any rows that are
# not exact duplicates but still repeat a 'TransactionID' seen in an earlier row
repeated_id_mask = df['TransactionID'].duplicated()
df[repeated_id_mask]

Unnamed: 0,Name,AccountID,TransactionID,AccountType,Asset,Ticker,Quantity,TradeDate,SettlementDate,PricePerShare,Currency,TransactionStatus,Notes
1121,NAN,,,NAN,NAN,NAN,,,,NAN,,NAN,NAN


In [7]:
# The only repeated 'TransactionID' was NaN, so look at all the rows whose ID is missing
missing_id_mask = df['TransactionID'].isna()
df[missing_id_mask]

Unnamed: 0,Name,AccountID,TransactionID,AccountType,Asset,Ticker,Quantity,TradeDate,SettlementDate,PricePerShare,Currency,TransactionStatus,Notes
1120,TOTAL,,,NAN,NAN,NAN,,,,"$3,245,000",,NAN,NAN
1121,NAN,,,NAN,NAN,NAN,,,,NAN,,NAN,NAN


In [8]:
# The rows with a missing 'TransactionID' (index labels 1120 and 1121) are just garbage
# rows, and the row beneath them (1122) is a garbage row filled with '*', so delete all three
garbage_row_labels = [1120, 1121, 1122]
df = df.drop(index=garbage_row_labels)

In [9]:
# Remove any units from 'PricePerShare'; the 'Currency' column is where the units belong.
# 1. trim punctuation characters from both ends of the text
price_text = df['PricePerShare'].str.strip(punctuation)
# 2. keep only what comes before the first 'U' (presumably a trailing 'USD' - confirm against the data)
price_before_units = price_text.str.split('U').str[0]
# 3. trim the whitespace left behind
df['PricePerShare'] = price_before_units.str.strip()

In [10]:
# Swap the spelled-out 'ten' for the digits '10' wherever it occurs in 'Quantity'
df['Quantity'] = df['Quantity'].str.replace(pat='ten', repl='10')

In [11]:
# Keep only the numeric part of 'AccountID': split on the hyphens and join the second
# and third pieces, dropping the leading 'ACC' piece and the hyphens themselves
account_id_pieces = df['AccountID'].str.split('-')
df['AccountID'] = account_id_pieces.str[1] + account_id_pieces.str[2]

In [12]:
# Repair ticker symbols whose last character was left off.
# Only rows whose ticker is not already one of the known symbols are touched, and the
# replacement is applied to the 'Ticker' column alone. (Calling DataFrame.replace on the
# whole row would also rewrite any other column whose value happened to equal e.g. 'MET'.)
valid_tickers = ['AAPL', 'NFLX', 'GOOGL', 'GOOG', 'MSFT', 'NVDA', 'AMZN', 'TSLA', 'META', 'BRK.A', 'JNJ']

# Exact-match mapping from each truncated symbol to the full symbol
truncated_ticker_fixes = {
    'NVD': 'NVDA',
    'MET': 'META',
    'AAP': 'AAPL',
    'BRK.': 'BRK.A',
    'JN': 'JNJ',
    'AMZ': 'AMZN',
    'MSF': 'MSFT',
    'NFL': 'NFLX',
    'TSL': 'TSLA',
}

has_unknown_ticker = ~df['Ticker'].isin(valid_tickers)
df.loc[has_unknown_ticker, 'Ticker'] = (
    df.loc[has_unknown_ticker, 'Ticker'].replace(truncated_ticker_fixes)
)

In [13]:
# Change all dates in the date columns to be in the format yyyy-mm-dd.
#
# All writes go through df.loc[mask, column]. The chained form df[column][mask] = ...
# assigns into a temporary object: it raises a chained-assignment warning and, with
# pandas Copy-on-Write enabled, silently leaves df unchanged.

# Three-letter month abbreviation -> two-digit month number
month_numbers = {
    'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
    'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
}

columns = ['TradeDate', 'SettlementDate']

for column in columns:

    # Step 1: in dates that contain a '-', swap any month abbreviation for its number
    # (e.g. 15-Jan-2024 -> 15-01-2024)
    has_dash = df[column].str.split('-').str[1].notna()
    dash_dates = df.loc[has_dash, column]
    for month_abbr, month_number in month_numbers.items():
        dash_dates = dash_dates.str.replace(month_abbr, month_number, regex=False)
    df.loc[has_dash, column] = dash_dates

    # Step 2: reorder day-month-year dates into year-month-day.
    # Dates whose first piece is already a four-digit year are skipped, so values that
    # are already yyyy-mm-dd are not flipped around (and re-running this cell is harmless).
    dash_parts = df[column].str.split('-')
    year_first = dash_parts.str[0].str.match(r'\s*\d{4}\s*$', na=False)
    day_first = dash_parts.str[1].notna() & ~year_first
    reordered_dash_dates = dash_parts.str[2] + '-' + dash_parts.str[1] + '-' + dash_parts.str[0]
    df.loc[day_first, column] = reordered_dash_dates[day_first]

    # Step 3: change the dates that look like mm/dd/yyyy to be yyyy-mm-dd
    slash_parts = df[column].str.split('/')
    has_slash = slash_parts.str[1].notna()
    reordered_slash_dates = slash_parts.str[2] + '-' + slash_parts.str[0] + '-' + slash_parts.str[1]
    df.loc[has_slash, column] = reordered_slash_dates[has_slash]

    # Step 4: since the placeholder 0 is missing from some of these dates, pad the month
    # and day to two digits. Any value that still has fewer than three '-'-separated
    # pieces at this point becomes NaN.
    date_parts = df[column].str.split('-')
    df[column] = date_parts.str[0] + '-' + date_parts.str[1].str.zfill(2) + '-' + date_parts.str[2].str.zfill(2)

In [14]:
# Remove a leading honorific such as 'MR.' or 'DR.' from the names.
# For every name that contains a period, take the text after the first period and keep
# the two words that follow it (e.g. 'MR. JOHN SMITH' -> 'JOHN SMITH').
#
# The write goes through df.loc[mask, 'Name'] rather than the chained
# df['Name'][mask] = ..., which raises a chained-assignment warning and is a silent
# no-op with pandas Copy-on-Write enabled.
name_has_period = df['Name'].str.split('.').str[1].notna()
formal_titles_df = df.loc[name_has_period, 'Name']

words_after_title = formal_titles_df.str.split('.').str[1].str.split(' ')
names_without_title = words_after_title.str[1] + ' ' + words_after_title.str[2]

# When the period is not followed by two words (e.g. a trailing 'JR.' or a middle
# initial) the expression above is NaN; keep the original name in that case instead of
# overwriting it with a null.
df.loc[name_has_period, 'Name'] = names_without_title.fillna(formal_titles_df)

In [15]:
# Break 'Name' apart on spaces: the first word becomes 'FirstName' and the second word
# becomes 'LastName'. The original 'Name' column is then removed.
name_words = df['Name'].str.split(' ')
df['FirstName'] = name_words.str[0]
df['LastName'] = name_words.str[1]
df = df.drop(columns='Name')

In [16]:
# Take the thousands-separator commas out of 'PricePerShare' so the text can be
# converted to a number later
df['PricePerShare'] = df['PricePerShare'].str.replace(pat=',', repl='')

In [17]:
# Trim the leading and trailing whitespace from every entry of every 'object'-dtype column
object_columns = df.select_dtypes(include='object').columns
for col in object_columns:
    df[col] = df[col].str.strip()

In [18]:
# Replace any pseudo-null values with None so that the data is easier to work with
# when converting to SQL.
# The mapping is given as a dict on purpose: with a list for to_replace and value=None,
# pandas versions before 1.4 ignore the explicit None and forward-fill from the
# previous row instead, which silently copies data between rows.
pseudo_nulls = {'NAN': None, 'NaN': None, np.nan: None}
df = df.replace(pseudo_nulls)

In [21]:
# Write the cleaned table to disk without the index column.
# NOTE(review): this is an absolute, machine-specific path - the notebook will only
# run end-to-end on a machine where this folder exists.
cleaned_csv_path = r'C:\Users\beanw\OneDrive\Desktop\finance_project\cleaned_transactions.csv'
df.to_csv(cleaned_csv_path, index=False)