# Customer transactions

## Imports

In [2]:
import pandas as pd
import numpy as np
import datetime
import re
from datetime import date
# from sqlalchemy import create_engine,VARCHAR, DATE

## Read the JSON file

In [3]:
# Load the raw customer transaction records from the JSON file into a DataFrame.
df_customer_transaction = pd.read_json(path_or_buf="customer_transaction_info.json")

# Profiling the Data


## Initial rows and columns

In [4]:
# Report the (rows, columns) dimensions of the raw frame.
print(df_customer_transaction.shape)

(130653, 5)


In [5]:
# Keep the total row count in `length`, then display that many rows from the top
# (i.e. the whole frame; the notebook renders it in truncated form).
length = df_customer_transaction.shape[0]
df_customer_transaction.iloc[:length]

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday
0,TXN-24546,2030-09-08,ORTIZ,EDUARDO,1990-07-08
1,TXN-14642,2026-05-26,NIENOW,LEA,2000-11-26
2,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
3,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
4,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
...,...,...,...,...,...
130648,TXN-65468,2012-06-16,Cummings,Henry,2005-08-14
130649,TXN-60822,2020-09-28,Feil,Jermey,2010-06-15
130650,TXN-60822,2020-09-28,Feil,Jermey,2010-06-15
130651,TXN-01784,2015-01-11,Schmidt,Emilie,1996-05-21


## Getting the key columns

In [6]:
# List the column labels of the raw frame; these are the keys used by every check below.
print(df_customer_transaction.columns)

Index(['txn_id', 'avail_date', 'last_name', 'first_name', 'birthday'], dtype='object')


## Initial Data Checking

## Number Of Unique Transactions

In [7]:
# Count distinct transaction ids (missing ids, if any, are not counted).
print(df_customer_transaction['txn_id'].nunique(dropna=True))

62354


## Total Number of Fully Duplicated Rows (Grounds for Deduplication)

In [30]:
# Count rows that repeat an earlier row across all columns
# (the first occurrence of each row is not counted as a duplicate).
print(df_customer_transaction.duplicated(keep='first').sum())

58298


## Null Checking for Transactions (For Removal)

In [8]:
# Collect rows with a missing transaction id; an empty result means none are missing.
filtered_txn = df_customer_transaction.loc[df_customer_transaction['txn_id'].isna()]
print(filtered_txn)

Empty DataFrame
Columns: [txn_id, avail_date, last_name, first_name, birthday]
Index: []


## Date Checking for Future Avail Dates 

In [9]:
# Parse avail_date into datetimes (the column is replaced in place on the raw frame),
# then list every row whose avail_date lies after the current moment.
df_customer_transaction['avail_date'] = pd.to_datetime(df_customer_transaction['avail_date'])
filtered_dates = df_customer_transaction[
    df_customer_transaction['avail_date'].gt(datetime.datetime.now())
]
print(filtered_dates)

           txn_id avail_date  last_name first_name    birthday
0       TXN-24546 2030-09-08      ORTIZ    EDUARDO  1990-07-08
1       TXN-14642 2026-05-26     NIENOW        LEA  2000-11-26
15      TXN-41934 2025-03-07       HANE     SYLVAN  2007-07-02
16      TXN-41934 2025-03-07       HANE     SYLVAN  2007-07-02
17      TXN-41934 2025-03-07       HANE     SYLVAN  2007-07-02
...           ...        ...        ...        ...         ...
130641  TXN-16095 2027-08-19  Bergstrom    Dedrick  2000-04-15
130642  TXN-16095 2027-08-19  Bergstrom    Dedrick  2000-04-15
130643  TXN-11897 2027-06-19    Reinger     Conrad  1996-12-03
130644  TXN-11897 2027-06-19    Reinger     Conrad  1996-12-03
130645  TXN-11897 2027-06-19    Reinger     Conrad  1996-12-03

[35539 rows x 5 columns]


## Date Checking for Future Birth Dates 

In [10]:
# Parse birthday into datetimes (the column is replaced in place on the raw frame),
# then list every row whose birthday lies after the current moment.
df_customer_transaction['birthday'] = pd.to_datetime(df_customer_transaction['birthday'])
filtered_dates = df_customer_transaction[
    df_customer_transaction['birthday'].gt(datetime.datetime.now())
]
print(filtered_dates)

Empty DataFrame
Columns: [txn_id, avail_date, last_name, first_name, birthday]
Index: []


## Date Checking for Avail Dates being earlier than Birth Dates

In [11]:
# List rows where the avail date falls before the customer's birthday.
# Both columns were parsed to datetimes by the two preceding cells.
filtered_dates = df_customer_transaction[
    df_customer_transaction['avail_date'].lt(df_customer_transaction['birthday'])
]
print(filtered_dates)

           txn_id avail_date last_name first_name   birthday
22      TXN-49423 2008-08-13     BERGE    RASHEED 2010-02-05
23      TXN-49423 2008-08-13     BERGE    RASHEED 2010-02-05
36      TXN-35244 2005-11-23    HAMMES     EUNICE 2006-06-13
37      TXN-35244 2005-11-23    HAMMES     EUNICE 2006-06-13
194     TXN-23898 2005-05-28  BOTSFORD     GIANNI 2009-03-30
...           ...        ...       ...        ...        ...
130512  TXN-53675 2005-03-29   Cormier  Adalberto 2008-11-06
130513  TXN-53675 2005-03-29   Cormier  Adalberto 2008-11-06
130595  TXN-30851 2006-03-21    Hessel     Kattie 2009-09-06
130596  TXN-30851 2006-03-21    Hessel     Kattie 2009-09-06
130597  TXN-30851 2006-03-21    Hessel     Kattie 2009-09-06

[4233 rows x 5 columns]


## Name Checking for Non-Alphanumeric Characters (For fixing)

In [12]:
# Flag rows whose first_name contains any character that is neither a word
# character (letter, digit, underscore) nor whitespace.
# `na=False` makes a missing name count as "no match" so the mask is purely
# boolean; without it a single null in the column would make the boolean
# indexing raise, and this cell runs before the null checks further down.
filtered_names = df_customer_transaction[
    df_customer_transaction['first_name'].str.contains(r'[^\w\s]', regex=True, na=False)
]
print(filtered_names)

          txn_id avail_date        last_name       first_name   birthday
60000  TXN-04581 2019-09-03     Will,,,,,,,,  Aurelio........ 1993-07-12
60001  TXN-17309 2015-01-04  Krajcik,,,,,,,,    Ebony........ 2003-11-18
60002  TXN-18201 2021-04-28  Gutmann,,,,,,,,  Agustin........ 2009-03-11
60003  TXN-60631 2012-03-13   Kemmer,,,,,,,,     Alda........ 2004-08-18
60004  TXN-60631 2012-03-13   Kemmer,,,,,,,,     Alda........ 2004-08-18
...          ...        ...              ...              ...        ...
79995  TXN-54391 2028-10-31    Johns,,,,,,,,   Turner........ 2010-04-16
79996  TXN-54391 2028-10-31    Johns,,,,,,,,   Turner........ 2010-04-16
79997  TXN-54391 2028-10-31    Johns,,,,,,,,   Turner........ 2010-04-16
79998  TXN-54391 2028-10-31    Johns,,,,,,,,   Turner........ 2010-04-16
79999  TXN-54391 2028-10-31    Johns,,,,,,,,   Turner........ 2010-04-16

[20000 rows x 5 columns]


In [13]:
# Flag rows whose last_name contains any character that is neither a word
# character (letter, digit, underscore) nor whitespace.
# `na=False` makes a missing name count as "no match" so the mask is purely
# boolean; without it a single null in the column would make the boolean
# indexing raise, and this cell runs before the null checks further down.
filtered_names = df_customer_transaction[
    df_customer_transaction['last_name'].str.contains(r'[^\w\s]', regex=True, na=False)
]
print(filtered_names)

          txn_id avail_date        last_name       first_name   birthday
60000  TXN-04581 2019-09-03     Will,,,,,,,,  Aurelio........ 1993-07-12
60001  TXN-17309 2015-01-04  Krajcik,,,,,,,,    Ebony........ 2003-11-18
60002  TXN-18201 2021-04-28  Gutmann,,,,,,,,  Agustin........ 2009-03-11
60003  TXN-60631 2012-03-13   Kemmer,,,,,,,,     Alda........ 2004-08-18
60004  TXN-60631 2012-03-13   Kemmer,,,,,,,,     Alda........ 2004-08-18
...          ...        ...              ...              ...        ...
79995  TXN-54391 2028-10-31    Johns,,,,,,,,   Turner........ 2010-04-16
79996  TXN-54391 2028-10-31    Johns,,,,,,,,   Turner........ 2010-04-16
79997  TXN-54391 2028-10-31    Johns,,,,,,,,   Turner........ 2010-04-16
79998  TXN-54391 2028-10-31    Johns,,,,,,,,   Turner........ 2010-04-16
79999  TXN-54391 2028-10-31    Johns,,,,,,,,   Turner........ 2010-04-16

[20000 rows x 5 columns]


## Name Checking for All Capitals (For fixing)

In [14]:
# Rows whose first_name is written entirely in upper case (per str.isupper).
filtered_names = df_customer_transaction.loc[
    df_customer_transaction['first_name'].str.isupper()
]
print(filtered_names)

          txn_id avail_date last_name first_name   birthday
0      TXN-24546 2030-09-08     ORTIZ    EDUARDO 1990-07-08
1      TXN-14642 2026-05-26    NIENOW        LEA 2000-11-26
2      TXN-60295 2006-09-25     LESCH      FLETA 1993-05-22
3      TXN-60295 2006-09-25     LESCH      FLETA 1993-05-22
4      TXN-60295 2006-09-25     LESCH      FLETA 1993-05-22
...          ...        ...       ...        ...        ...
19995  TXN-11721 2006-08-09     LESCH      VICKY 1999-11-16
19996  TXN-64348 2008-01-04    DURGAN     NANNIE 1995-04-22
19997  TXN-64348 2008-01-04    DURGAN     NANNIE 1995-04-22
19998  TXN-64348 2008-01-04    DURGAN     NANNIE 1995-04-22
19999  TXN-05451 2021-10-24  PROHASKA   BERNEICE 1993-01-11

[20000 rows x 5 columns]


In [15]:
# Rows whose last_name is written entirely in upper case (per str.isupper).
filtered_names = df_customer_transaction.loc[
    df_customer_transaction['last_name'].str.isupper()
]
print(filtered_names)

          txn_id avail_date last_name first_name   birthday
0      TXN-24546 2030-09-08     ORTIZ    EDUARDO 1990-07-08
1      TXN-14642 2026-05-26    NIENOW        LEA 2000-11-26
2      TXN-60295 2006-09-25     LESCH      FLETA 1993-05-22
3      TXN-60295 2006-09-25     LESCH      FLETA 1993-05-22
4      TXN-60295 2006-09-25     LESCH      FLETA 1993-05-22
...          ...        ...       ...        ...        ...
19995  TXN-11721 2006-08-09     LESCH      VICKY 1999-11-16
19996  TXN-64348 2008-01-04    DURGAN     NANNIE 1995-04-22
19997  TXN-64348 2008-01-04    DURGAN     NANNIE 1995-04-22
19998  TXN-64348 2008-01-04    DURGAN     NANNIE 1995-04-22
19999  TXN-05451 2021-10-24  PROHASKA   BERNEICE 1993-01-11

[20000 rows x 5 columns]


## Name Checking for All Lowercase (For fixing)

In [16]:
# Rows whose first_name is written entirely in lower case (per str.islower).
filtered_names = df_customer_transaction.loc[
    df_customer_transaction['first_name'].str.islower()
]
print(filtered_names)

          txn_id avail_date last_name first_name   birthday
20000  TXN-05451 2021-10-24  prohaska   berneice 1993-01-11
20001  TXN-05451 2021-10-24  prohaska   berneice 1993-01-11
20002  TXN-02255 2010-03-26      hane        toy 2009-06-30
20003  TXN-02255 2010-03-26      hane        toy 2009-06-30
20004  TXN-02255 2010-03-26      hane        toy 2009-06-30
...          ...        ...       ...        ...        ...
39995  TXN-45930 2016-05-05      mann   vivianne 2000-12-19
39996  TXN-03162 2015-12-07  jacobson     barney 2006-12-08
39997  TXN-03162 2015-12-07  jacobson     barney 2006-12-08
39998  TXN-16572 2006-05-31  turcotte     jazmyn 1993-06-20
39999  TXN-16572 2006-05-31  turcotte     jazmyn 1993-06-20

[20000 rows x 5 columns]


In [17]:
# Rows whose last_name is written entirely in lower case (per str.islower).
filtered_names = df_customer_transaction.loc[
    df_customer_transaction['last_name'].str.islower()
]
print(filtered_names)

          txn_id avail_date last_name first_name   birthday
20000  TXN-05451 2021-10-24  prohaska   berneice 1993-01-11
20001  TXN-05451 2021-10-24  prohaska   berneice 1993-01-11
20002  TXN-02255 2010-03-26      hane        toy 2009-06-30
20003  TXN-02255 2010-03-26      hane        toy 2009-06-30
20004  TXN-02255 2010-03-26      hane        toy 2009-06-30
...          ...        ...       ...        ...        ...
39995  TXN-45930 2016-05-05      mann   vivianne 2000-12-19
39996  TXN-03162 2015-12-07  jacobson     barney 2006-12-08
39997  TXN-03162 2015-12-07  jacobson     barney 2006-12-08
39998  TXN-16572 2006-05-31  turcotte     jazmyn 1993-06-20
39999  TXN-16572 2006-05-31  turcotte     jazmyn 1993-06-20

[20000 rows x 5 columns]


## Null Checking for Dates (For Removal)

In [18]:
# Rows with a missing birthday; an empty result means the column is fully populated.
filtered_dates = df_customer_transaction.loc[df_customer_transaction['birthday'].isna()]
print(filtered_dates)

Empty DataFrame
Columns: [txn_id, avail_date, last_name, first_name, birthday]
Index: []


In [19]:
# Rows with a missing avail_date; an empty result means the column is fully populated.
filtered_dates = df_customer_transaction.loc[df_customer_transaction['avail_date'].isna()]
print(filtered_dates)

Empty DataFrame
Columns: [txn_id, avail_date, last_name, first_name, birthday]
Index: []


## Null Checking for Names (For Removal)

In [20]:
# Rows with a missing first_name; an empty result means the column is fully populated.
filtered_names = df_customer_transaction.loc[df_customer_transaction['first_name'].isna()]
print(filtered_names)

Empty DataFrame
Columns: [txn_id, avail_date, last_name, first_name, birthday]
Index: []


In [21]:
# Rows with a missing last_name; an empty result means the column is fully populated.
filtered_names = df_customer_transaction.loc[df_customer_transaction['last_name'].isna()]
print(filtered_names)

Empty DataFrame
Columns: [txn_id, avail_date, last_name, first_name, birthday]
Index: []


# Data Cleaning

## Drop Inconsistent/Wrong Birth and Avail Dates

In [22]:
# Re-parse both date columns; errors='coerce' turns unparseable values into NaT
# instead of raising.
df_customer_transaction['birthday'] = pd.to_datetime(
    df_customer_transaction['birthday'], errors='coerce'
)
df_customer_transaction['avail_date'] = pd.to_datetime(
    df_customer_transaction['avail_date'], errors='coerce'
)

# Keep rows whose avail_date is today or earlier ...
df_filt = df_customer_transaction[
    df_customer_transaction['avail_date'].le(pd.to_datetime(date.today()))
]
# ... and whose birthday is today or earlier.
df_filt = df_filt[df_filt['birthday'].le(pd.to_datetime(date.today()))]

# Finally require the birthday to be strictly before the avail date.
df_bday_avail = df_filt[df_filt['birthday'].lt(df_filt['avail_date'])]

## Drop Duplicates

In [23]:
# Collapse rows that are identical across every column, keeping the first occurrence.
df_DropDuplicated_Customers = df_bday_avail.drop_duplicates(keep='first')

## Fix Non-Alphanumeric Characters in First and Last Names

In [24]:
# Normalise both name columns and then de-duplicate once more.
#
# * Every character outside a-z / 0-9 (case-insensitive) is removed, exactly as before.
# * Names are title-cased: the profiling section marks all-caps and all-lowercase
#   names as "For fixing", and the saved preview of the cleaned frame shows
#   title-cased names, so the cleaning has to normalise case as well.
# * drop_duplicates() runs again because rows that differed only by case or
#   trailing punctuation become identical after normalisation.
# * .assign() builds a new frame rather than writing columns into a frame that
#   was produced by boolean filtering, which avoids SettingWithCopyWarning.
#
# NOTE(review): the pattern also strips whitespace, underscores and non-ASCII
# letters, whereas the profiling check uses [^\w\s]; confirm that is intended.
df_DropDuplicated_Customers = (
    df_DropDuplicated_Customers
    .assign(
        last_name=lambda frame: (
            frame['last_name']
            .str.replace(r'[^a-z0-9]', '', regex=True, flags=re.IGNORECASE)
            .str.title()
        ),
        first_name=lambda frame: (
            frame['first_name']
            .str.replace(r'[^a-z0-9]', '', regex=True, flags=re.IGNORECASE)
            .str.title()
        ),
    )
    .drop_duplicates()
)

In [25]:
# Persist the cleaned frame as a Parquet file next to the notebook.
df_DropDuplicated_Customers.to_parquet(path='finalcustomerinfo.parquet')

In [26]:
# Show the first 30 rows of the cleaned frame.
df_DropDuplicated_Customers.iloc[:30]

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday
2,TXN-60295,2006-09-25,Lesch,Fleta,1993-05-22
6,TXN-40462,2021-08-21,Kuhn,Tod,2002-11-25
8,TXN-08102,2010-04-03,Johnson,Milton,2003-07-10
12,TXN-64262,2018-11-23,Hackett,Maci,2005-06-20
24,TXN-59988,2021-02-01,Bashirian,Brice,2004-07-20
25,TXN-08158,2023-08-05,Batz,Timmothy,2008-10-06
30,TXN-37609,2022-09-04,Kunde,Mariana,1996-10-29
31,TXN-31294,2018-01-04,Breitenberg,Madge,2006-06-22
38,TXN-21732,2017-03-16,Turcotte,Hellen,1996-03-25
39,TXN-31732,2013-04-28,Lockman,Chadd,2003-06-17


## Export the Clean Data to SQL 

In [28]:
# NOTE(review): this SQL export is disabled; every line below is commented out, as is
# the sqlalchemy import in the imports cell.
# NOTE(review): `nonAlphaNames` is not defined anywhere in the visible notebook;
# presumably the cleaned frame `df_DropDuplicated_Customers` is what should be
# exported. Confirm before re-enabling.
# NOTE(review): the first two lines assign columns taken from the unfiltered
# `df_customer_transaction`; verify this re-parse is still needed, since both
# columns are already parsed in the cleaning step.
# df_DropDuplicated_Customers['avail_date'] = pd.to_datetime(df_customer_transaction['avail_date'], errors='coerce')
# df_DropDuplicated_Customers['birthday'] = pd.to_datetime(df_customer_transaction['birthday'], errors='coerce')
# engine = create_engine('sqlite:///customer_transactions.db', echo=True)
# nonAlphaNames.to_sql('customer_transactions', con=engine, if_exists='replace', index=False,
#                               dtype={
#                                   "txn_id": VARCHAR(10),
#                                   "avail_date" : DATE,
#                                   "last_name" : VARCHAR(20),
#                                   "first_name" : VARCHAR(20),
#                                   "birthday" : DATE,
#                               })