# Cleaning

**The following cleaning steps are applied to the dataset `subscription.csv`:**
- Remove rows with invalid dates (EffectiveDate later than ExpireDate)
- Drop duplicate rows
- Check for null or missing values
- Check for zero or negative price values
- Remove rows with dates in the future

**The following changes are made for improved readability:**
- Rename clients (e.g. `Client_3` → `NAIOP`)
- Rename ContactIds (each unique ContactId becomes `Member N`)

In [1]:
# Imports
import pandas as pd

# Load the subscription data and replace the placeholder client label with a
# more readable name while the frame is being built:
#   Client_3 -> NAIOP
# NOTE(review): an earlier comment described this data as "ImpexiumInsights"
# membership data -- confirm the source of subscription.csv.
df = pd.read_csv('subscription.csv').assign(
    Client=lambda frame: frame['Client'].replace('Client_3', 'NAIOP')
)

# Preview the first ten rows.
df.head(10)

Unnamed: 0,ContactId,ClientId,Client,ProductId,ProductType,EffectiveDate,ExpireDate,Total
0,0xD474CF42D6FEECADAC5AE0E44C19976B,0x69CB4E2FB587F20F7AF21D60E989ADAB,NAIOP,0x3D5A19AE11AFFDB22AD9FFDD548D2E9F,Subscription,6/23/21,12/22/21,518.4
1,0xFB44C5545DC4AE04868E12D73D862634,0x69CB4E2FB587F20F7AF21D60E989ADAB,NAIOP,0x3D5A19AE11AFFDB22AD9FFDD548D2E9F,Subscription,6/23/21,12/22/21,376.65
2,0xF495D2F04206745FD3BEDF273E2F4B56,0x69CB4E2FB587F20F7AF21D60E989ADAB,NAIOP,0x5DC2BF4201BAA480B8B9E34BD2E8CC2E,Subscription,3/10/21,9/9/21,149.85
3,0x8216681574001C438CC9FF22BC43C873,0x69CB4E2FB587F20F7AF21D60E989ADAB,NAIOP,0x3D86FFF3CCD71A7D8C33069E4FF64A97,Subscription,3/12/21,9/11/21,185.49
4,0xC7F1B0DF76E21A15DC9506D31F3B7170,0x69CB4E2FB587F20F7AF21D60E989ADAB,NAIOP,0x5DC2BF4201BAA480B8B9E34BD2E8CC2E,Subscription,2/28/21,8/27/21,109.35
5,0x36913F1B51F4F312B2C10FBC4334BC69,0x69CB4E2FB587F20F7AF21D60E989ADAB,NAIOP,0x3D86FFF3CCD71A7D8C33069E4FF64A97,Subscription,2/4/21,8/3/21,206.55
6,0xAAE00CF86BA9E1E636ABC433BB02F32A,0x69CB4E2FB587F20F7AF21D60E989ADAB,NAIOP,0x3D86FFF3CCD71A7D8C33069E4FF64A97,Subscription,10/2/20,4/1/21,206.55
7,0x215DA9F808D832EA1E0347464F3AF768,0x69CB4E2FB587F20F7AF21D60E989ADAB,NAIOP,0xC26C28C3E5CE081CD77B28FAD9884A8C,Subscription,12/4/20,6/3/21,265.68
8,0xF06E88279C9F90257664CB0C7BF1A284,0x69CB4E2FB587F20F7AF21D60E989ADAB,NAIOP,0xC26C28C3E5CE081CD77B28FAD9884A8C,Subscription,2/18/21,8/17/21,295.65
9,0xB2E83B0725AB3C956888FD4091CDB608,0x69CB4E2FB587F20F7AF21D60E989ADAB,NAIOP,0xC26C28C3E5CE081CD77B28FAD9884A8C,Subscription,6/18/21,12/17/21,265.68


In [2]:
# Removing all entries where EffectiveDate > ExpireDate.
# Both columns are parsed from m/d/yy strings into datetimes first so that
# they compare chronologically instead of as text.
df['EffectiveDate'] = pd.to_datetime(df['EffectiveDate'], format='%m/%d/%y')
df['ExpireDate'] = pd.to_datetime(df['ExpireDate'], format='%m/%d/%y')

# Gotcha: a row whose date is missing (NaT) fails this comparison and is
# therefore dropped here as well.
valid_df = df[df['EffectiveDate'] <= df['ExpireDate']]

# Check: re-apply the inverse condition to the filtered frame; it must not
# select any row.
invalid_df = valid_df[valid_df['EffectiveDate'] > valid_df['ExpireDate']]
# `.empty` is True when no row matched, so `not invalid_df.empty` answers the
# question as asked. (Negating the bound method `invalid_df.to_xarray` is
# always False because a method object is always truthy.)
print("Are there any entries in the new data frame that are invalid for dates?", not invalid_df.empty)


# Keep the filtered frame for the following steps
df = valid_df

Are there any entries in the new data frame that are invalid for dates? False


In [3]:
# Are there any prices that are 0 or negative?
# This cell only reports; it does not remove anything from df.
invalid_rows = df[df['Total'] <= 0]

# Check: `.empty` is True when no row matched, so `not invalid_rows.empty`
# is True exactly when at least one offending price exists. (Negating the
# bound method `invalid_rows.to_xarray` is always False, whatever the data.)
print("Are there any prices that are 0 or negative?", not invalid_rows.empty)

Are there any prices that are 0 or negative? False


In [4]:
# Dropping all duplicates that have the same ContactId, ProductId, EffectiveDate, and ExpireDate.
# The first occurrence of each combination is kept (drop_duplicates default).
# EffectiveDate and ExpireDate are already datetimes here: they were parsed in
# the date-validation cell, so they are not converted again. Re-assigning the
# columns on the filtered frame is redundant and can raise
# SettingWithCopyWarning.
key_columns = ['ContactId', 'ProductId', 'EffectiveDate', 'ExpireDate']
df_unique = df.drop_duplicates(subset=key_columns)

# Check: with keep=False every member of a duplicated group is flagged, so a
# single True means duplicates survived.
duplicates_check = df_unique.duplicated(subset=key_columns, keep=False)
no_duplicates = not duplicates_check.any()
print("There are no duplicates in the new data frame, based on the specified columns above?", no_duplicates)

# Keep the de-duplicated frame for the following steps
df = df_unique

There are no duplicates in the new data frame, based on the specified columns above? True


In [5]:
# Identifying any null or missing values.
# A row counts as missing if any cell is null/NaN/NaT or is an empty string.
# `df.eq('')` is the vectorised equivalent of the element-wise
# `applymap(lambda x: x == '')`, which is deprecated in recent pandas.
# This cell only reports; it does not remove anything from df.
has_null = df.isnull().any(axis=1)
has_empty_string = df.eq('').any(axis=1)
missing_entries = df[has_null | has_empty_string]

# Check: `.empty` is True when no row matched, so `not missing_entries.empty`
# is True exactly when at least one incomplete row exists. (Negating the bound
# method `missing_entries.to_xarray` is always False, whatever the data.)
print("Are there any null or missing values in the data frame?", not missing_entries.empty)

Are there any null or missing values in the data frame? False


In [6]:
# Cleaning: drop every row that has a date on or after the cutoff.

# Cutoff date is May 2024; rows are kept only when BOTH dates fall strictly
# before it.
cutoff_date = pd.Timestamp('2024-05-01')
before_cutoff = df[['EffectiveDate', 'ExpireDate']].lt(cutoff_date).all(axis=1)
df_filtered = df[before_cutoff]

# The latest remaining date in each column must still be before the cutoff.
eff_check = df_filtered['EffectiveDate'].max()
exp_check = df_filtered['ExpireDate'].max()
print("Checks:")
for latest_date in (eff_check, exp_check):
    print(latest_date < cutoff_date)

# Keep the filtered frame for the following steps
df = df_filtered

Checks:
True
True


In [7]:
# Replace every distinct ContactId with a readable "Member N" label.
# Labels are numbered from 1 in order of first appearance in the frame.
unique_ids = df['ContactId'].unique()
member_mapping = {
    contact_id: f"Member {position}"
    for position, contact_id in enumerate(unique_ids, start=1)
}

# Build the relabelled frame as a new object so df itself is left untouched
# until the preview below has been inspected.
temp_df = df.assign(ContactId=df['ContactId'].map(member_mapping))

# Check
print(temp_df.head(5))

# Keep the relabelled frame for the following steps
df = temp_df

  ContactId                            ClientId Client  \
0  Member 1  0x69CB4E2FB587F20F7AF21D60E989ADAB  NAIOP   
1  Member 2  0x69CB4E2FB587F20F7AF21D60E989ADAB  NAIOP   
2  Member 3  0x69CB4E2FB587F20F7AF21D60E989ADAB  NAIOP   
3  Member 4  0x69CB4E2FB587F20F7AF21D60E989ADAB  NAIOP   
4  Member 5  0x69CB4E2FB587F20F7AF21D60E989ADAB  NAIOP   

                            ProductId   ProductType EffectiveDate ExpireDate  \
0  0x3D5A19AE11AFFDB22AD9FFDD548D2E9F  Subscription    2021-06-23 2021-12-22   
1  0x3D5A19AE11AFFDB22AD9FFDD548D2E9F  Subscription    2021-06-23 2021-12-22   
2  0x5DC2BF4201BAA480B8B9E34BD2E8CC2E  Subscription    2021-03-10 2021-09-09   
3  0x3D86FFF3CCD71A7D8C33069E4FF64A97  Subscription    2021-03-12 2021-09-11   
4  0x5DC2BF4201BAA480B8B9E34BD2E8CC2E  Subscription    2021-02-28 2021-08-27   

    Total  
0  518.40  
1  376.65  
2  149.85  
3  185.49  
4  109.35  


In [8]:
# Persist the cleaned data frame to a new CSV file (the input file is not
# overwritten). The row index is left out of the file.
# Always run this at the end of your work session:
df.to_csv(path_or_buf='modified_subscription.csv', index=False)