![](../additional_materials/logos/darden_rice_logo_SM.png)

#### Handling Duplicates in Call Sheets

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

import datetime

**BELOW:** Import the raw master call sheet (still containing duplicate entries)

In [2]:
# Load the raw call sheet. The phone columns contain missing values, so pandas
# reads them as floats (visible in the head() output of the next cell).
df = pd.read_csv('../data/raw_call_sheets/women_call_sheet_wduplicates.csv')

In [3]:
# Quick look at the first rows to confirm the load and see the raw phone format
df.head(3)

Unnamed: 0,Voter File VANID,LastName,FirstName,MiddleName,Suffix,Cell Phone,Home Phone,Preferred Phone,WorkPhone,WorkPhoneExt,Group
0,45696,Reynolds,Mitzi,Reynolds,,,,,,,Women
1,49081,Hogan,Constance,Ann,,8506871000.0,7278272000.0,8506871000.0,,,Women
2,50047,Gall,Holly,Kay,,,,,,,Women


In [4]:
# Size of the raw sheet before any de-duplication
df.shape

(21911, 11)

In [5]:
# Missing values per column, to see how sparsely each phone field is filled
df.isnull().sum()

Voter File VANID        0
LastName                0
FirstName               0
MiddleName           1422
Suffix              21866
Cell Phone           8858
Home Phone           9398
Preferred Phone      3753
WorkPhone           21820
 WorkPhoneExt       21911
Group                   0
dtype: int64

In [6]:
# De-duplicate on VAN ID, keeping the last row listed for each ID, and
# renumber the index so it runs 0..n-1 again.
df = (
    df
    .drop_duplicates(subset='Voter File VANID', keep='last')
    .reset_index(drop=True)
)

In [7]:
# Row count after dropping repeated VAN IDs
df.shape

(21185, 11)

In [8]:
# Formatting phone numbers correctly: they are read in as floats (the columns
# contain NaNs) and are therefore displayed in scientific notation.
# Missing numbers are filled with 0 and every column is cast to a whole number.
# The dtype is spelled out as 64-bit because 10-digit phone numbers do not fit
# in a 32-bit integer, which is what a bare `int` can map to on some platforms.
for phone_col in ['Preferred Phone', 'Cell Phone', 'Home Phone', 'WorkPhone']:
    df[phone_col] = df[phone_col].fillna(0).astype('int64')

In [9]:
# Check the result of the integer conversion (0 now stands for a missing number)
df.head()

Unnamed: 0,Voter File VANID,LastName,FirstName,MiddleName,Suffix,Cell Phone,Home Phone,Preferred Phone,WorkPhone,WorkPhoneExt,Group
0,45696,Reynolds,Mitzi,Reynolds,,0,0,0,0,,Women
1,49081,Hogan,Constance,Ann,,8506870943,7278272227,8506870943,0,,Women
2,50047,Gall,Holly,Kay,,0,0,0,0,,Women
3,64579,Dogu,Amanda,Marie,,0,0,0,0,,Women
4,68775,Loper,Mary,Joan,,0,7272894102,7272894102,0,,Women


In [10]:
# The four phone columns that all get the same treatment
p_cols = ['Cell Phone', 'Home Phone', 'Preferred Phone', 'WorkPhone']

# Turn the integer phones into text and blank out the 0 placeholders in one
# pass, so missing numbers look empty as they did in the original sheet
df[p_cols] = df[p_cols].astype(str).replace(to_replace='0', value='')

In [11]:
# Check that missing phones are blank strings again
df.head()

Unnamed: 0,Voter File VANID,LastName,FirstName,MiddleName,Suffix,Cell Phone,Home Phone,Preferred Phone,WorkPhone,WorkPhoneExt,Group
0,45696,Reynolds,Mitzi,Reynolds,,,,,,,Women
1,49081,Hogan,Constance,Ann,,8506870943.0,7278272227.0,8506870943.0,,,Women
2,50047,Gall,Holly,Kay,,,,,,,Women
3,64579,Dogu,Amanda,Marie,,,,,,,Women
4,68775,Loper,Mary,Joan,,,7272894102.0,7272894102.0,,,Women


In [12]:
# How often each preferred number occurs; a count above 1 means several rows
# share that number (the blank string counts rows without a preferred phone)
df['Preferred Phone'].value_counts()

              3680
7278040730       3
7273476151       3
7278642237       3
7277683653       3
              ... 
7275760380       1
7277415489       1
7275043839       1
7276438503       1
8135800565       1
Name: Preferred Phone, Length: 17304, dtype: int64

In [13]:
# Set aside the rows that actually carry phone data, so that de-duplicating on
# the phone number is not polluted by blanks: a row qualifies when it has a
# preferred phone AND at least one of cell / home / work phone.
has_preferred = df['Preferred Phone'].ne('')
has_other_phone = df[['Cell Phone', 'Home Phone', 'WorkPhone']].ne('').any(axis=1)
non_nulls_df = df[has_preferred & has_other_phone]

In [14]:
# Repeat the frequency check on the filtered rows only
non_nulls_df['Preferred Phone'].value_counts()

7278642237    3
7274245791    3
7277683653    3
7273476151    3
7278040730    3
             ..
7275760380    1
7277415489    1
7275043839    1
7273453326    1
7273248701    1
Name: Preferred Phone, Length: 17303, dtype: int64

In [15]:
# Inspect one of the numbers that is listed three times
non_nulls_df[non_nulls_df['Preferred Phone'] == '7274245791']

Unnamed: 0,Voter File VANID,LastName,FirstName,MiddleName,Suffix,Cell Phone,Home Phone,Preferred Phone,WorkPhone,WorkPhoneExt,Group
11199,12237291,Waters,Keion,Antell,,,7274245791.0,7274245791,,,Supporter
12489,20900596,Miller,Karon,,,7274245791.0,,7274245791,,,Supporter
12646,21201898,Tomlinson,Jeffery,J,,7274245791.0,,7274245791,,,Supporter


In [16]:
# Inspect a second number that is listed three times
non_nulls_df[non_nulls_df['Preferred Phone'] == '7277683653']

Unnamed: 0,Voter File VANID,LastName,FirstName,MiddleName,Suffix,Cell Phone,Home Phone,Preferred Phone,WorkPhone,WorkPhoneExt,Group
436,1814573,Poirier,Joan,Ann,,7277683653,,7277683653,,,Women
2030,1921681,Poirier,Susan,Ann,,7277683653,,7277683653,,,Women
17826,37075796,Poirier,Jennifer,Elizabeth,,7277683653,,7277683653,,,Women


In [17]:
# Inspect the rows sharing another repeated preferred number
non_nulls_df[non_nulls_df['Preferred Phone'] == '7276470338']

Unnamed: 0,Voter File VANID,LastName,FirstName,MiddleName,Suffix,Cell Phone,Home Phone,Preferred Phone,WorkPhone,WorkPhoneExt,Group
5249,2138199,Updegraff,Elizabeth,Douglas,,7276470338,7278223937.0,7276470338,,,Women
14548,24885789,Updegraff,Hannah,Katherine,,7276470338,,7276470338,,,Women
17858,37111103,Updegraff,Julia,Marie,,7276470338,7278223937.0,7276470338,,,Women


In [18]:
# Keep a single row per preferred phone number (the last one listed), so each
# number appears only once. Note that this also collapses different voters who
# share a number, e.g. the three Poirier rows inspected above.
non_nulls_df = non_nulls_df.drop_duplicates('Preferred Phone', keep='last')

In [19]:
# Confirm that every remaining preferred number now occurs exactly once
non_nulls_df['Preferred Phone'].value_counts()

8133945339    1
7277438344    1
7276567951    1
7272386694    1
7277445348    1
             ..
5088641433    1
9013556364    1
7275198219    1
7274329786    1
7273248701    1
Name: Preferred Phone, Length: 17303, dtype: int64

In [20]:
# Breakdown of the remaining rows by Group
non_nulls_df['Group'].value_counts()

Women        16215
Supporter     1088
Name: Group, dtype: int64

In [21]:
# Renumber the rows 0..n-1 after the filtering and de-duplication above
non_nulls_df = non_nulls_df.reset_index(drop=True)

---
---

In [22]:
# Save the voters that have a listed phone number.
# index=False keeps the pandas row index out of the file, matching the other
# exports in this notebook (otherwise the CSV gains an extra unnamed column).
non_nulls_df.to_csv('../data/processed_call_sheets/only_listed_numbers.csv', index=False)

---
---

#### No Phone Listed

In [23]:
# Voters for whom all four phone fields are blank, renumbered from 0.
# NOTE(review): a row with a blank preferred phone but some other number listed
# is matched neither here nor by the non_nulls_df filter, so it ends up in
# neither frame -- confirm that dropping those rows is intended.
all_phone_cols = ['Preferred Phone', 'Home Phone', 'Cell Phone', 'WorkPhone']
no_phone_df = df[df[all_phone_cols].eq('').all(axis=1)].reset_index(drop=True)

In [24]:
# Inspect the voters with no phone number on file
no_phone_df

Unnamed: 0,Voter File VANID,LastName,FirstName,MiddleName,Suffix,Cell Phone,Home Phone,Preferred Phone,WorkPhone,WorkPhoneExt,Group
0,45696,Reynolds,Mitzi,Reynolds,,,,,,,Women
1,50047,Gall,Holly,Kay,,,,,,,Women
2,64579,Dogu,Amanda,Marie,,,,,,,Women
3,212507,Black,Allison,,,,,,,,Women
4,254871,Bispo,Elena,Marie,,,,,,,Women
...,...,...,...,...,...,...,...,...,...,...,...
3653,40429037,Pepe,Anne Marie,,,,,,,,Women
3654,40436630,Rykaczewski,Carole,A,,,,,,,Women
3655,40520034,Supertino,Michelle,Nicole,,,,,,,Women
3656,40520131,Crooks,Louise,Grubbs,,,,,,,Women


---
---

In [25]:
# Saving voters with no phone numbers listed
# (index=False: the row index is not written to the file)
no_phone_df.to_csv('../data/processed_call_sheets/no_phone_listed.csv', index=False)

---
---

In [26]:
# Stack the de-duplicated phone rows on top of the no-phone rows and give the
# combined sheet a fresh 0..n-1 index
master_df = pd.concat([non_nulls_df, no_phone_df], ignore_index=True)

In [27]:
# Total rows and columns in the combined sheet
master_df.shape

(20961, 11)

In [28]:
# Check column dtypes and non-null counts of the combined sheet
master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20961 entries, 0 to 20960
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Voter File VANID  20961 non-null  int64  
 1   LastName          20961 non-null  object 
 2   FirstName         20961 non-null  object 
 3   MiddleName        19590 non-null  object 
 4   Suffix            44 non-null     object 
 5   Cell Phone        20961 non-null  object 
 6   Home Phone        20961 non-null  object 
 7   Preferred Phone   20961 non-null  object 
 8   WorkPhone         20961 non-null  object 
 9    WorkPhoneExt     0 non-null      float64
 10  Group             20961 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 1.8+ MB


In [29]:
# Eyeball the combined sheet: phone rows first, no-phone rows at the end
master_df

Unnamed: 0,Voter File VANID,LastName,FirstName,MiddleName,Suffix,Cell Phone,Home Phone,Preferred Phone,WorkPhone,WorkPhoneExt,Group
0,49081,Hogan,Constance,Ann,,8506870943,7278272227,8506870943,,,Women
1,68775,Loper,Mary,Joan,,,7272894102,7272894102,,,Women
2,111764,Cartwright,Rebecca,Allison,,,3863077973,3863077973,,,Women
3,113907,Astle,Allison,Marie,,7275048924,,7275048924,,,Women
4,122847,Rouse,Margaret,T,,3864511725,,3864511725,,,Women
...,...,...,...,...,...,...,...,...,...,...,...
20956,40429037,Pepe,Anne Marie,,,,,,,,Women
20957,40436630,Rykaczewski,Carole,A,,,,,,,Women
20958,40520034,Supertino,Michelle,Nicole,,,,,,,Women
20959,40520131,Crooks,Louise,Grubbs,,,,,,,Women


---
---

In [30]:
# Write the combined, de-duplicated call sheet without the index column
master_df.to_csv(
    '../data/processed_call_sheets/master_call_sheet_no_duplicates.csv',
    index=False,
)

---
---

#### Supporters & Women Sheets

In [31]:
# Rows of the master sheet tagged 'Supporter', renumbered from 0
supporters_df = (
    master_df
    .loc[master_df['Group'].eq('Supporter')]
    .reset_index(drop=True)
)

In [32]:
# Rows of the master sheet tagged 'Women', renumbered from 0
women_df = (
    master_df
    .loc[master_df['Group'].eq('Women')]
    .reset_index(drop=True)
)

---
---

In [33]:
# Save one de-duplicated call sheet per group (row index not written)
supporters_df.to_csv('../data/processed_call_sheets/supporters_no_duplicates.csv', index=False)
women_df.to_csv('../data/processed_call_sheets/women_no_duplicates.csv', index=False)

---
---

### Alternative Method

In [34]:
# df = pd.read_csv('../data/raw_call_sheets/women_call_sheet_wduplicates.csv')

In [35]:
# supporter_df = df[df['Group'] == 'Supporter']
# fem_df = df[df['Group'] == 'Women']

In [None]:
# master_df = 