In [89]:
import pandas as pd

# ImpexiumInsights is the membership data
df = pd.read_csv('membership.csv')

# Change clients to a more readable format.
# One mapping replaces every placeholder label in a single pass; any value
# not listed here is left untouched.
CLIENT_NAMES = {
    'Client_1': 'ICSC',
    'Client_2': 'SIOR',
    'Client_3': 'NAIOP',
}
df['Client'] = df['Client'].replace(CLIENT_NAMES)

# Only the last expression of a cell is displayed, so preview once, after the
# rename (an earlier mid-cell df.head(10) was never shown).
df.head(10)

Unnamed: 0,ContactId,ClientId,Client,ProductId,ProductType,EffectiveDate,ExpireDate,Total
0,0x566B63D73878FD95925F7B6716FAB095,0xCAB892266CB7872B20CED441410BFF5E,SIOR,0x9E8B160226C9FE22A910C782CE5076E2,Membership,9/1/21,8/31/22,104.49
1,0x77D707DC7112E8379C27E3A1AF0EC034,0xCAB892266CB7872B20CED441410BFF5E,SIOR,0x9E8B160226C9FE22A910C782CE5076E2,Membership,8/1/21,7/31/22,104.49
2,0xC36639462513FE7C2413AF7449B281BD,0xCAB892266CB7872B20CED441410BFF5E,SIOR,0xCF82E756F655A4E1518567DCDB80CCBF,Membership,6/27/21,6/26/22,36.45
3,0xC36639462513FE7C2413AF7449B281BD,0xCAB892266CB7872B20CED441410BFF5E,SIOR,0x9E8B160226C9FE22A910C782CE5076E2,Membership,6/19/21,6/18/22,104.49
4,0xBDF36B8927A18BC973A2AE91E7F5112F,0xCAB892266CB7872B20CED441410BFF5E,SIOR,0x9E8B160226C9FE22A910C782CE5076E2,Membership,6/27/21,6/26/22,104.49
5,0x18CC10100F771A69BFD13098384C7E46,0xCAB892266CB7872B20CED441410BFF5E,SIOR,0x9E8B160226C9FE22A910C782CE5076E2,Membership,8/2/21,8/1/22,104.49
6,0x0E49629AB8D42E50157CFDE15186FD8B,0xCAB892266CB7872B20CED441410BFF5E,SIOR,0x9E8B160226C9FE22A910C782CE5076E2,Membership,6/29/21,6/28/22,104.49
7,0xB584C9347DE96FFE23C5BD103626F03F,0xCAB892266CB7872B20CED441410BFF5E,SIOR,0x9E8B160226C9FE22A910C782CE5076E2,Membership,6/28/21,6/27/22,104.49
8,0x7181625DB00B71EAA5FEB7C1632F92EB,0xCAB892266CB7872B20CED441410BFF5E,SIOR,0x9E8B160226C9FE22A910C782CE5076E2,Membership,6/28/21,6/27/22,104.49
9,0xD35391A8B2400414D05B610DBC052102,0xCAB892266CB7872B20CED441410BFF5E,SIOR,0x9E8B160226C9FE22A910C782CE5076E2,Membership,1/1/20,12/31/20,104.49


In [90]:
# Removing all entries where EffectiveDate > ExpireDate
# Dates are parsed as month/day/two-digit-year (e.g. '9/1/21').
df['EffectiveDate'] = pd.to_datetime(df['EffectiveDate'], format='%m/%d/%y')
df['ExpireDate'] = pd.to_datetime(df['ExpireDate'], format='%m/%d/%y')

# .copy() makes the filtered frame independent of the original, so later
# column assignments on it cannot raise SettingWithCopyWarning.
valid_df = df[df['EffectiveDate'] <= df['ExpireDate']].copy()

# Check
# Look for offending rows in the *new* frame, and test emptiness explicitly:
# negating a bound method (e.g. `not(frame.to_xarray)`) is always False and
# therefore verifies nothing.
invalid_df = valid_df[valid_df['EffectiveDate'] > valid_df['ExpireDate']]
print("Are there any entries in the new data frame that are invalid for dates?", not invalid_df.empty)


# Check passed so modify data frame
df = valid_df

Are there any entries in the new data frame that are invalid for dates? False


In [91]:
# Can different people be members of the same ProductId at the same time?

# Per contact: number of distinct products each ContactId holds, and the
# contacts holding more than one. Kept under their original names.
unique_product_counts = df.groupby('ContactId')['ProductId'].nunique()
multiple_subscriptions = unique_product_counts[unique_product_counts > 1]

# Per product: number of distinct contacts holding each ProductId. This is
# the grouping that actually answers the question above.
contacts_per_product = df.groupby('ProductId')['ContactId'].nunique()
shared_products = contacts_per_product[contacts_per_product > 1]

# Check
# Test emptiness explicitly; wrapping a bound method in a set is always truthy
# and would print True regardless of the data.
# NOTE(review): this confirms that some ProductId is held by more than one
# contact; it does not verify that their date ranges overlap.
print("Can different people be members of the same ProductId at the same time?", not shared_products.empty)

Can different people be members of the same ProductId at the same time? True


In [92]:
# Dropping all duplicates that have the same ContactId, ProductId, EffectiveDate, and ExpireDate
# EffectiveDate / ExpireDate are already datetimes at this point (parsed in the
# date-validation cell), so they are not re-parsed here; re-assigning them on
# the filtered frame was redundant and could raise SettingWithCopyWarning.
dedup_keys = ['ContactId', 'ProductId', 'EffectiveDate', 'ExpireDate']
df_unique = df.drop_duplicates(subset=dedup_keys)

# Check
# keep=False flags every member of a duplicate group, so any True means the
# de-duplication missed something.
duplicates_check = df_unique.duplicated(subset=dedup_keys, keep=False)
no_duplicates = not duplicates_check.any()
print("There are no duplicates in the new data frame, based on the specified columns above?", no_duplicates)

# Check passed so modify data frame
df = df_unique

There are no duplicates in the new data frame, based on the specified columns above? True


In [93]:
# Identifying any null or missing values
# A row counts as missing if any cell is null/NaN/NaT or is an empty string.
# df.eq('') is the vectorised equivalent of an element-wise `x == ''` lambda
# (non-string columns simply compare as False).
has_null = df.isnull().any(axis=1)
has_blank = df.eq('').any(axis=1)
missing_entries = df[has_null | has_blank]

# Check
# Test emptiness explicitly: negating a bound method (e.g. `not(frame.to_xarray)`)
# is always False and would hide real missing values.
print("Are there any null or missing values in the data frame?", not missing_entries.empty)

Are there any null or missing values in the data frame? False


In [94]:
# Changing contact_ids (currently disabled): would relabel each unique ContactId as 'Member 1', 'Member 2', ...

# unique_contact_ids = df['ContactId'].unique()
# contact_id_mapping = {id_: f'Member {i+1}' for i, id_ in enumerate(unique_contact_ids)}
# df['ContactId'] = df['ContactId'].map(contact_id_mapping)
# df

In [95]:
# Persist the cleaned data frame to disk as a CSV file.
# Always run this at the end of your work session.
# index=False keeps the pandas row index out of the written file.
output_csv = 'modified_membership.csv'
df.to_csv(output_csv, index=False)

In [96]:
# Membership ranges (disabled draft): label each row Standard / Silver / Gold based on its Total.
# TODO: come back to this after cleaning.
# def assign_membership(total):
#     if total < 350:
#         return 'Standard'
#     elif 350 <= total < 700:
#         return 'Silver'
#     elif total >= 700:
#         return 'Gold'

# product_totals = df.copy()
# product_totals['Membership'] = product_totals['Total'].apply(assign_membership)
# product_id_to_membership = product_totals.set_index('ProductId')['Membership'].to_dict()


# df['ProductId'] = df['ProductId'].map(product_id_to_membership)