In [1]:
#meta 12/26/2023 Original LGL export for EZText vs Custom `HowDidYouHear` comparisons

#env myMBpro

#history
#12/26/2023 COMPARE COUNTS BETWEEN TWO REPORTS
#     local: get csvs from 2023-12-25
#     Original LGL export for EZText vs Custom `How Did You Hear About UDS`
#     Significant count differences


#References

In [2]:
import sys
import pandas as pd
import numpy as np

In [3]:
# Record the interpreter and library versions this notebook was run with.
(
    sys.version_info,
    pd.__version__,
    np.__version__,
)

(sys.version_info(major=3, minor=10, micro=10, releaselevel='final', serial=0),
 '2.1.4',
 '1.26.3')

In [4]:
# VARS
# Input CSV paths, relative to the notebook's working directory.
FILE_IN = "data/text_message_phone_number_list_2023-12-26.csv"  # loaded into df_report
FILE_IN2 = "data/custom_HowDidYouHearAboutUDS_v2 2023-12-25.csv"  # loaded into df_report2

# Compare: Counts 
- Original LGL export for EZText 
- Custom `How did you hear about UDS?`

## 0. Load data 
from CSV  
$note: problem with duplicates -> need a decision + cleanup effort

In [5]:
# Load the first report (FILE_IN), then print its shape and column index.
df_report = pd.read_csv(FILE_IN)

print(df_report.shape, df_report.columns, sep="\n")
df_report.head();  # trailing ';' suppresses the rendered preview

(397, 5)
Index(['LGL Constituent ID', 'First Name', 'Last Name', 'Pref. Phone',
       'Date added'],
      dtype='object')


In [6]:
# Load the second report (FILE_IN2), then print its shape and column index.
df_report2 = pd.read_csv(FILE_IN2)

print(df_report2.shape, df_report2.columns, sep="\n")
df_report2.head();  # trailing ';' suppresses the rendered preview

(281, 11)
Index(['First Name', 'Last Name', 'Contact rpt. summary',
       'Contact rpt. description', 'Contact rpt. type', 'Contact rpt. date',
       'LGL Constituent ID', 'Home Phone', 'Home Email',
       'Contact rpt. team member', 'Last contact rpt.'],
      dtype='object')


- Important: unique Constituents count in `How did you hear ...`

In [7]:
# Rows per constituent in the custom report; the length of the result is the
# number of distinct constituent IDs.
df_report2.loc[:, 'LGL Constituent ID'].value_counts()

LGL Constituent ID
946637    5
946372    4
945782    4
946707    3
946377    3
         ..
946152    1
946132    1
947077    1
947072    1
947067    1
Name: count, Length: 204, dtype: int64

## 1. Data Prep

### 1.1 Look for duplicates in Original LGL export for EZText

duplicates by `First Name` + `Pref. Phone`

In [8]:
# Count (a) fully identical rows and (b) rows repeating an earlier
# (First Name, Pref. Phone) combination.
include_cols = ['First Name', 'Pref. Phone']
(
    df_report.duplicated().sum(),
    df_report.duplicated(subset=include_cols).sum(),
)

(0, 11)

In [9]:
# keep=False flags every member of a duplicate group, not just the repeats.
idx_duplicated = df_report.duplicated(subset=include_cols, keep=False)
print(idx_duplicated.sum())  # a boolean mask sums to its number of True entries
df_report[idx_duplicated].sort_values(by=include_cols);

21


duplicates by `Pref. Phone`


In [10]:
# Count (a) fully identical rows and (b) rows repeating an earlier Pref. Phone value.
include_cols = ['Pref. Phone']
(
    df_report.duplicated().sum(),
    df_report.duplicated(subset=include_cols).sum(),
)

(0, 19)

In [11]:
# keep=False flags every row whose Pref. Phone occurs more than once.
idx_duplicated = df_report.duplicated(subset=include_cols, keep=False)
print(idx_duplicated.sum())  # a boolean mask sums to its number of True entries
df_report[idx_duplicated].sort_values(by=include_cols);

36


Findings:  
duplicate records -> need to clean up to have accurate Constituents data

### 1.2 Look for duplicates in custom report `How did you hear ...` 

In [12]:
# Count (a) fully identical rows and (b) rows repeating an earlier
# name + home phone + contact-description combination.
include_cols = ['First Name', 'Last Name', 'Home Phone', 'Contact rpt. description']
(
    df_report2.duplicated().sum(),
    df_report2.duplicated(subset=include_cols).sum(),
)

(1, 4)

total duplicates

In [13]:
# Full-row duplicates: keep=False flags every copy, so the count includes the first occurrence.
idx_duplicated = df_report2.duplicated(keep=False)
print(idx_duplicated.sum())  # a boolean mask sums to its number of True entries
# Sorted by the include_cols list set in the preceding cell.
df_report2[idx_duplicated].sort_values(by=include_cols);

2


duplicates by `First Name`, `Last Name`, `Home Phone`, `Contact rpt. description`

In [14]:
# keep=False flags every row belonging to a group that shares the include_cols values.
idx_duplicated = df_report2.duplicated(subset=include_cols, keep=False)
print(idx_duplicated.sum())  # a boolean mask sums to its number of True entries
df_report2[idx_duplicated].sort_values(by=include_cols);

8


duplicates by `First Name`, `Last Name`, `Home Phone`  
$note: displaying partial list only

In [15]:
# Count (a) fully identical rows and (b) rows repeating an earlier
# name + home phone combination.
include_cols = ['First Name', 'Last Name', 'Home Phone']
(
    df_report2.duplicated().sum(),
    df_report2.duplicated(subset=include_cols).sum(),
)

(1, 80)

In [16]:
# keep=False flags every row belonging to a group that shares the include_cols values.
idx_duplicated = df_report2.duplicated(subset=include_cols, keep=False)
print(idx_duplicated.sum())  # a boolean mask sums to its number of True entries
df_report2[idx_duplicated].sort_values(by=include_cols);

145


Findings:  
duplicate records -> need to clean up to have accurate Constituents data

### 1.3 Tidy data
between the Original export  vs custom report `How did you hear ...` 

#### 1.3.1 Original tidy

$assumption: get rid of obvious duplicates in both reports, start with `First Name`, `Pref. Phone`


In [17]:
# Keep only the comparison columns, then drop rows identical across all of them
# (the last occurrence of each group is retained).
# NOTE(review): the recorded run shows 397 rows both before and after this step, so no
# rows were dropped -- presumably because 'LGL Constituent ID' is part of the compared
# columns. Confirm whether de-duplicating on First Name + Pref. Phone alone was intended.
include_cols = ['First Name', 'Pref. Phone', 'LGL Constituent ID']
df_report_tidy = df_report.loc[:, include_cols].drop_duplicates(keep='last')

print(df_report_tidy.shape, df_report_tidy.columns, sep="\n")
df_report_tidy.head();

(397, 3)
Index(['First Name', 'Pref. Phone', 'LGL Constituent ID'], dtype='object')


#### 1.3.2 `How did you hear ...` tidy

$assumption: get rid of obvious duplicates in both reports, start with complete dups


In [18]:
# Drop rows that are identical across every column, retaining the last occurrence.
# drop_duplicates returns a new frame, so df_report2 itself is left untouched.
df_report2_tidy = df_report2.drop_duplicates(keep='last')

print(df_report2_tidy.shape, df_report2_tidy.columns, sep="\n")
df_report2_tidy.head();

(280, 11)
Index(['First Name', 'Last Name', 'Contact rpt. summary',
       'Contact rpt. description', 'Contact rpt. type', 'Contact rpt. date',
       'LGL Constituent ID', 'Home Phone', 'Home Email',
       'Contact rpt. team member', 'Last contact rpt.'],
      dtype='object')


- `How did you hear ...` tidy

$assumption: get rid of obvious duplicates in both reports, start with duplicates by `First Name`, `Last Name`, `Home Phone`, `Contact rpt. description`

In [19]:
# Narrow the already de-duplicated frame to the comparison columns, then drop rows
# that are identical across those columns (last occurrence retained).
include_cols = ['First Name', 'Last Name', 'Home Phone', 'Contact rpt. description', 'LGL Constituent ID']
df_report2_tidy = df_report2_tidy.loc[:, include_cols].drop_duplicates(keep='last')

print(df_report2_tidy.shape, df_report2_tidy.columns, sep="\n")
df_report2_tidy.head();

(277, 5)
Index(['First Name', 'Last Name', 'Home Phone', 'Contact rpt. description',
       'LGL Constituent ID'],
      dtype='object')


## 2. Analysis: Compare
### 2.0 Count differences 

Original export has more Constituents than custom `How did you hear ...`.  
Question: Why?

In [20]:
# Inner join: only constituent IDs present in both tidy reports.
df_tidy_inner = df_report_tidy.merge(
    df_report2_tidy,
    how='inner',
    on='LGL Constituent ID',
)
print(df_tidy_inner.shape)
df_tidy_inner;

(274, 7)


In [21]:
# Left join: every row of the original export, with custom-report columns where the ID matches.
df_tidy_left = df_report_tidy.merge(
    df_report2_tidy,
    how='left',
    on='LGL Constituent ID',
)
print(df_tidy_left.shape)
df_tidy_left;

(469, 7)


In [22]:
# Right join: every row of the custom report, with original-export columns where the ID matches.
df_tidy_right = df_report_tidy.merge(
    df_report2_tidy,
    how='right',
    on='LGL Constituent ID',
)
print(df_tidy_right.shape)
df_tidy_right;

(277, 7)


Findings:  
- Original export lists all the Constituents (including duplicates) even when they don't have recorded contact activity  
- Custom `How did you hear ...` report shows all the Constituents that have a recorded contact activity.

Probably: new Constituents were originally entered into LGL daily, but there was no consistent process to record contact / task / note activities.

Question: Who are the Constituents with no follow-up contact?  Should they show up in the latest custom `How did you hear ...` report?

In [23]:
# Distinct constituent IDs in each tidy report (r1 = original export, r2 = custom report).
r1_constituents = df_report_tidy.loc[:, 'LGL Constituent ID'].unique()
r2_constituents = df_report2_tidy.loc[:, 'LGL Constituent ID'].unique()

(r1_constituents.shape, r2_constituents.shape)

((397,), (204,))

### 2.1 Constituents in Original LGL export but with no recorded Contact activity

In [24]:
# IDs present in the original export (r1) but absent from the custom report (r2).
only_in_r1 = set(r1_constituents) - set(r2_constituents)
len(only_in_r1), only_in_r1

(195,
 {945027,
  945032,
  945037,
  945042,
  945047,
  945052,
  945062,
  945067,
  945072,
  945077,
  945097,
  945117,
  945122,
  945127,
  945142,
  945152,
  945157,
  945162,
  945172,
  945177,
  945182,
  945187,
  945192,
  945202,
  945207,
  945212,
  945217,
  945222,
  945232,
  945237,
  945242,
  945247,
  945262,
  945267,
  945272,
  945277,
  945282,
  945287,
  945292,
  945302,
  945307,
  945312,
  945317,
  945322,
  945332,
  945342,
  945357,
  945362,
  945372,
  945377,
  945382,
  945387,
  945392,
  945407,
  945417,
  945422,
  945427,
  945432,
  945437,
  945442,
  945457,
  945462,
  945467,
  945472,
  945477,
  945482,
  945487,
  945492,
  945497,
  945507,
  945517,
  945522,
  945527,
  945537,
  945542,
  945547,
  945552,
  945557,
  945562,
  945567,
  945572,
  945582,
  945587,
  945592,
  945597,
  945602,
  945607,
  945617,
  945622,
  945627,
  945632,
  945637,
  945642,
  945647,
  945652,
  945657,
  945662,
  945667,
  945672,
  94

### 2.2 Constituents with recorded Contact activity but not in Original LGL export for EZText

In [25]:
# IDs present in the custom report (r2) but absent from the original export (r1).
only_in_r2 = set(r2_constituents) - set(r1_constituents)
len(only_in_r2), only_in_r2

(2, {945977, 946592})

In [26]:
# NOTE(review): `mystop` is not defined anywhere in the visible notebook, so this cell
# raises NameError. Presumably a deliberate stop so that "Run All" does not continue
# into the scratch section below -- confirm.
mystop

NameError: name 'mystop' is not defined

## Xtra

In [None]:
# Scratch reference: how to find the set difference between two NumPy arrays.
# refer to https://www.tutorialspoint.com/how-to-find-set-difference-between-two-numpy-arrays

array_1 = np.array([2, 4, 6, 8, 10, 12, 14])
array_2 = np.array([4, 8, 12])

print("Array 1: \n", array_1)
print("\nArray 2: \n", array_2)

# Values that occur in array_1 but not in array_2.
set_diff = np.setdiff1d(array_1, array_2)
print("\nThe set difference between array_1 and array_2 is:\n",set_diff)