# Capstone Project

## Park Slope Parents Membership
#### Part 2a: Data Cleanup

Step 1. Load libraries

In [162]:
import csv
import numpy as np
import pandas as pd 
from datetime import datetime


Step 2. Import Data

In [163]:
memdf1 = pd.read_csv("../../projects/psp/raw_data/PSP_MembershipData_1.csv")
                     
memdf2 = pd.read_csv("../../projects/psp/raw_data/PSP_MembershipData_2.csv")


In [164]:
memdf1.head()

Unnamed: 0,member_number,first_name,last_name,primary_email,address1,address2,city,state,zip,country,...,chapter,last_renewal_date,secondary_type_name,member_level,gender,allow_club_email,printed_newsletter,mailing_name,use_alt_address,primary_member_number
0,16081,Sonja,Neill-Turner,sonjaneill5926@msn.com,231 15th Street,UNIT 3B,Brooklyn,NY,11215,UNITED STATES,...,ClubExpress - Club and Association Management ...,5/10/16,,Primary,Female,Yes,No,SONJA NEILL-TURNER,No,16081
1,21186,Kim,Mason,Missmerendino@aol.com,491 12th street-apt3L,,Brooklyn,NY,11215,UNITED STATES,...,ClubExpress - Club and Association Management ...,11/10/14,,Primary,Female,Yes,No,Kim Mason,No,21186
2,34999,David,Mcgovern,Mcgoverndb@yahoo.com,595 3rd Street,,Brooklyn,NY,11218,UNITED STATES,...,,,,Primary,Male,No,No,David B Mcgovern & Steven Pollack,No,34999
3,35041,Susan,Kim,sueandsteve618@gmail.com,303 3rd St Apt 2L,,Brooklyn,NY,11215,UNITED STATES,...,,,,Primary,Female,No,No,Susan Kim & Steve Park,No,35041
4,31116,Beryl,McCadden-Crawford,bdgirll87@gmail.com,568 7th Ave,,Brooklyn,NY,11215,UNITED STATES,...,,,,Primary,Female,Yes,No,Beryl D McCadden-Crawford & Robert Crawford,No,31116


In [165]:
memdf1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17682 entries, 0 to 17681
Data columns (total 46 columns):
member_number            17682 non-null object
first_name               17682 non-null object
last_name                17682 non-null object
primary_email            17673 non-null object
address1                 17578 non-null object
address2                 11064 non-null object
city                     17578 non-null object
state                    17569 non-null object
zip                      17578 non-null object
country                  17682 non-null object
phone                    14248 non-null object
cell_phone               6423 non-null object
fax                      534 non-null object
login_name               17682 non-null object
middle_initial           4328 non-null object
nickname                 3700 non-null object
alt_address1             0 non-null float64
alt_address2             0 non-null float64
alt_city                 0 non-null float64
alt_state   

In [166]:
# I don't particularly need their names, emails, street address, apartment number,\
# country, phone numbers...you know what, at this point it'll be easier for me to just make\
# a list of the things that I will need.

mem1_list = ['member_number','address1', 'city', 'state','country', 'zip','date_joined','date_expired',\
             'status','membership_type','primary_member','last_renewal_date', 'member_level','gender'\
             ,'allow_club_email','printed_newsletter','primary_member_number']
mem1_filtered_df = pd.DataFrame(memdf1[mem1_list])


In [167]:
# scrubbing grabbing new information from here

mem2_list = ['member_number', 'visibility','DUPLICATE','Parent Status',\
             'No of kids','Bday of 1st kid','Bday of 2nd kid','Join A PSP Pregnancy  Baby Group',\
             'Reason JoiningSpecial Circumstances','ADVICE GROUP EMAIL','CLASSIFIEDS EMAIL',\
             'CLASSIFIED SPOUSE EMAIL','Time Out New York Kids','PSPKnowledge']
mem2_filtered_df = pd.DataFrame(memdf2[mem2_list])

In [168]:
# merge the dataframes

df = pd.merge(mem1_filtered_df, mem2_filtered_df, how='inner', on='member_number')

In [169]:
df.isnull().sum()

member_number                              0
address1                                  77
city                                      77
state                                     85
country                                    0
zip                                       77
date_joined                               62
date_expired                               0
status                                     0
membership_type                            0
primary_member                             0
last_renewal_date                       7373
member_level                               0
gender                                     0
allow_club_email                           0
printed_newsletter                         0
primary_member_number                      0
visibility                             15134
DUPLICATE                               5335
Parent Status                            504
No of kids                               501
Bday of 1st kid                          454
Bday of 2n

In [170]:
# clean some column names

df.rename(columns ={'member_number':'mem_no', 'address1':'address','date_joined':'joined','date_expired':'exp_date',\
                    'membership_type':'mem_type','primary_member':'is_primary','member_level':'mem_lvl',\
                    'allow_club_email':'club_email','printed_newsletter':'newsletter',\
                    'primary_member_number':'prm_mem_nmbr','visibility':'vis',\
                    'DUPLICATE':'dup','Parent Status':'parent_status','No of kids':'kid_count',\
                    'Bday of 1st kid':'kid1_bday','Bday of 2nd kid':'kid2_bday',\
                    'Join A PSP Pregnancy  Baby Group':'baby_group',\
                    'Reason JoiningSpecial Circumstances':'join_reason','ADVICE GROUP EMAIL':'advice_grp',\
                    'CLASSIFIEDS EMAIL':'classifieds','CLASSIFIED SPOUSE EMAIL':'classifieds_spouse',\
                    'Time Out New York Kids':'tony_kids','PSPKnowledge':'discovered'}, inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15521 entries, 0 to 15520
Data columns (total 30 columns):
mem_no                15521 non-null object
address               15444 non-null object
city                  15444 non-null object
state                 15436 non-null object
country               15521 non-null object
zip                   15444 non-null object
joined                15459 non-null object
exp_date              15521 non-null object
status                15521 non-null object
mem_type              15521 non-null object
is_primary            15521 non-null object
last_renewal_date     8148 non-null object
mem_lvl               15521 non-null object
gender                15521 non-null object
club_email            15521 non-null object
newsletter            15521 non-null object
prm_mem_nmbr          15521 non-null object
vis                   387 non-null float64
dup                   10186 non-null object
parent_status         15017 non-null object
kid_count    

In [171]:
# adding 0s to head of member numbers to make sorting even

df['mem_no'] = df['mem_no'].apply(lambda x: '{0:0>5}'.format(x))

# sorting by member number

df.sort_values(by='mem_no', axis=0, ascending=True, inplace=True)

df.head()

Unnamed: 0,mem_no,address,city,state,country,zip,joined,exp_date,status,mem_type,...,kid_count,kid1_bday,kid2_bday,baby_group,join_reason,advice_grp,classifieds,classifieds_spouse,tony_kids,discovered
138,1,9229 Shore Road,Brooklyn,NY,UNITED STATES,11209.0,,5/21/20,Active,Lifetime Member,...,2,7/13/01,7/28/92,11/19/11,,rachelfran@yahoo.com,rachelfran@yahoo.com,rachel@parkslopeparents.com,Yes,A PSP member who is a friend/neighbor
14,2,438 12th Street,Brooklyn,NY,UNITED STATES,11215.0,2/16/09,2/15/20,Active,Lifetime Member,...,2,4/14/85,,10/12/11,dfs,susan@parkslopeparents.com,susan@parkslopeparents.com,sfox19jim@yahoo.com,Yes,A PSP member who is a friend/neighbor
169,4,580 5th Street,Brooklyn,NY,UNITED STATES,11215.0,4/13/09,4/12/20,Active,Lifetime Member,...,2,12/11/02,,,,nancy.a.mcdermott@gmail.com,nancy.a.mcdermott@gmail.com,,Yes,A PSP member I don't know told me about it
110,6,190 Garfield Place,Brooklyn,NY,UNITED STATES,11215.0,4/16/09,4/15/20,Active,Lifetime Member,...,,3/12/07,3/4/09,,PSP rocks!,eisen.lb@gmail.com,eisen.lb@gmail.com,,Yes,A PSP member who is a friend/neighbor
181,69,,,NY,UNITED STATES,,4/13/09,4/12/19,Active,Lifetime Member,...,Not quite one yet (pregnant or adopting),4/13/09,,,,membership@parkslopeparents.com,membership@parkslopeparents.com,pspmods@yahoo.com,Yes,A PSP member who is a friend/neighbor


Step 3. Clean Up

In [172]:
# Checking address needs for cleanup

tlist = df.address.unique()
print len(tlist)
# print sorted(tlist)


# This looks like it might be especially messy. I'll strip out the whitespaces in front and back and # signs

df['address'] = df['address'].str.strip().str.lower().str.replace(' ', '_').str.replace('#','')

# one nan address--eliminating

df['address'].fillna('NA', inplace=True)
df = pd.DataFrame(df.loc[df['address'] != "NA"])

# my hope is that i will eventually be able to use the addresses to establish a radius from PS that people come
# (using SF's address as the locus). I will return to clean here time permitting, but it seems like it will be a
# very manual process since some folks have included apartments in the address 1 field rather than address 2 and others
# just end the address without using street / ave, etc. It's a mess and I don't know if I can't get my value well
# enough from using zips, so I'll do that unless it looks like that won't work.



12737


In [173]:
# Checking city needs for cleanup



# It looks like there's some spelling / capitalization issues. 

# cleaning out whitespaces leading or trailing, converting internal whitespaces to underscores,\
# and converting all upper case to lower case

df['city'] = df['city'].str.strip().str.lower().str.replace(' ', '_')

# the two zip codes are both in brooklyn and we'll correct all of the misspellings


df['city'].replace('11201', 'brooklyn', inplace=True)
df['city'].replace('11215', 'brooklyn', inplace=True)
df['city'].replace('astotia', 'astoria', inplace=True)

# I never knew there were so many ways to misspell brooklyn. I literally thought I'd forgotten the
# correct spelling at one point.

df['city'].replace('brookyln,', 'brooklyn', inplace=True)
df['city'].replace('brooklyn,', 'brooklyn', inplace=True)
df['city'].replace('brookyn', 'brooklyn', inplace=True)
df['city'].replace('broolkyn', 'brooklyn', inplace=True)
df['city'].replace('broolyn', 'brooklyn', inplace=True)
df['city'].replace('broooklyn', 'brooklyn', inplace=True)
df['city'].replace('brroklyn', 'brooklyn', inplace=True)
df['city'].replace('brooklyln', 'brooklyn', inplace=True)
df['city'].replace('brooklyn', 'brooklyn', inplace=True)
df['city'].replace('brooklyn,_ny', 'brooklyn', inplace=True)
df['city'].replace('brooklyn,ny', 'brooklyn', inplace=True)
df['city'].replace('brooklyn._n.y.', 'brooklyn', inplace=True)
df['city'].replace('brooklyn1', 'brooklyn', inplace=True)
df['city'].replace('brooklyn_-_new_york', 'brooklyn', inplace=True)
df['city'].replace('brooklyn_new_york', 'brooklyn', inplace=True)
df['city'].replace('brooklyn_ny', 'brooklyn', inplace=True)
df['city'].replace('brooklynn', 'brooklyn', inplace=True)
df['city'].replace('beooklyn', 'brooklyn', inplace=True)
df['city'].replace('bk', 'brooklyn', inplace=True)
df['city'].replace('bklyn', 'brooklyn', inplace=True)
df['city'].replace('bkyn', 'brooklyn', inplace=True)
df['city'].replace('booklyn', 'brooklyn', inplace=True)
df['city'].replace('briooklyn', 'brooklyn', inplace=True)
df['city'].replace('brokklyn', 'brooklyn', inplace=True)
df['city'].replace('brook;yn', 'brooklyn', inplace=True)
df['city'].replace('broklyn', 'brooklyn', inplace=True)
df['city'].replace('brooklkyn', 'brooklyn', inplace=True)
df['city'].replace('brookllyn', 'brooklyn', inplace=True)
df['city'].replace('brookln', 'brooklyn', inplace=True)
df['city'].replace('brooklny', 'brooklyn', inplace=True)
df['city'].replace('brookltn', 'brooklyn', inplace=True)
df['city'].replace('brooklun', 'brooklyn', inplace=True)
df['city'].replace('brookly', 'brooklyn', inplace=True)
df['city'].replace('brooklyhn', 'brooklyn', inplace=True)
df['city'].replace('brooklyjn', 'brooklyn', inplace=True)
df['city'].replace('hastings_on_hudson,', 'hastings-on-hudson', inplace=True)
df['city'].replace('new_york_city,', 'new_york', inplace=True)
df['city'].replace('ny,', 'new_york', inplace=True)
df['city'].replace('nyc,', 'new_york', inplace=True)
df['city'].replace('san_fran,', 'san_francisco', inplace=True)
df['city'].replace('sf,', 'san_francisco', inplace=True)
df['city'].replace('la,', 'los_angeles', inplace=True)

# investigated "il" and found that it was a wrong address (and associated with a wrong zip)
df['city'].replace('il', 'brooklyn', inplace=True)

# investigated "test" and found that it was test data
df = pd.DataFrame(df.loc[df['city'] != "test"])

In [174]:
# Checking state needs for cleanup

tlist = df.state.unique()
print len(tlist)
print sorted(tlist)

# looks good


32
[nan, 'AB', 'ACT', 'AL', 'AZ', 'CA', 'CO', 'CT', 'DC', 'FL', 'GA', 'IL', 'LA', 'MA', 'MD', 'MH', 'NC', 'NE', 'NH', 'NJ', 'NSW', 'NY', 'ON', 'OR', 'PA', 'QC', 'RI', 'TX', 'VA', 'VIC', 'WA', 'WI']


In [175]:
# Checking country needs for cleanup

tlist = df.country.unique()
print len(tlist)
print sorted(tlist)

# we're not concerned about interational members so I'm going to cut the rest from the data

df = pd.DataFrame(df.loc[df['country'] == "UNITED STATES"])

df.pop('country');

16
['AUSTRALIA', 'BELGIUM', 'CANADA', 'FRANCE', 'INDIA', 'ISRAEL', 'ITALY', 'JAPAN', 'NETHERLANDS', 'NORWAY', 'SPAIN', 'SWEDEN', 'TAIWAN', 'THAILAND', 'UNITED KINGDOM', 'UNITED STATES']


In [176]:
# Checking zip needs for cleanup

# taking out +4 numbers from zips
df.zip = df.zip.str.split('-').str[0]

# tlist = df.zip.unique()
# print len(tlist)
# print sorted(tlist)


In [177]:
# okay, so how do we address the zips with only 4 numbers and the ones that have letters

# checked the zips with only 4 numbers against the addresses. 
# These aren't typos, the system isn't recognizing leading 0s

df['zip'] = df['zip'].apply(lambda x: '{0:0>5}'.format(x))

mask = (df['zip'].str.len() < 5)
dfz = df.loc[mask]

df['zip'] = df['zip'].apply(lambda x: '{0:0>5}'.format(x))

# one foreign zip had a NY address so fixing that
df['zip'].replace('RG5 4TT', '11215', inplace=True)

# here is the zip that was in chicago but should be in brooklyn
df['zip'].replace('60615', '11215', inplace=True)


# tlist = df.zip.unique()
# print len(tlist)
# print sorted(tlist)

In [178]:
# Checking joined needs for cleanup


# 210 na. problematic since a big part of the project is about when folks joined. could be all sorts of reasons. drop.
df['joined'].fillna('NA', inplace=True)
df = pd.DataFrame(df.loc[df['joined'] != "NA"])


# changing date to datetime

df.joined = pd.to_datetime(df.joined, format='%m/%d/%y')
    
# print (df.joined.dt.date.tolist())
    
# tlist = df.joined.unique()

# print len(tlist)
# print sorted(tlist)


# joindate = pd.Series(df.joined)

# print max(joindate)
# print min(joindate)

# # looks good. 
# df.head()

In [179]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15352 entries, 14 to 109
Data columns (total 29 columns):
mem_no                15352 non-null object
address               15352 non-null object
city                  15352 non-null object
state                 15352 non-null object
zip                   15352 non-null object
joined                15352 non-null datetime64[ns]
exp_date              15352 non-null object
status                15352 non-null object
mem_type              15352 non-null object
is_primary            15352 non-null object
last_renewal_date     8022 non-null object
mem_lvl               15352 non-null object
gender                15352 non-null object
club_email            15352 non-null object
newsletter            15352 non-null object
prm_mem_nmbr          15352 non-null object
vis                   383 non-null float64
dup                   10169 non-null object
parent_status         14914 non-null object
kid_count             14917 non-null object
kid1_b

In [180]:
# Checking exp_date needs for cleanup

# changing date to datetime

df.exp_date = pd.to_datetime(df.exp_date, format='%m/%d/%y');
    
# print (df.exp_date.dt.date.tolist());
    

df['exp_date'].replace('2040-06-15T00:00:00.000000000', '2004-06-15T00:00:00.000000000', inplace=True);
#
# tlist = df.exp_date.unique()
# print len(tlist)
# print sorted(tlist)

fdate = pd.Series(df.exp_date);

# print max(fdate)
# print min(fdate)

# all good



In [181]:
# Checking status needs for cleanup

tlist = df.status.unique()
print len(tlist)
print sorted(tlist)

# all good. will dummy later

5
['Active', 'Expired', 'Frozen', 'Pending', 'Prospective']


In [182]:
# Checking mem_type needs for cleanup

tlist = df.mem_type.unique()
print len(tlist)
print sorted(tlist)

# all good. will dummy later

7
['1 year membership ($40)', '2 Year Membership ($75)', '3 year membership ($110)', '5 year membership ($175)', 'Complimentary', 'Lifetime Member', 'Trial Membership']


In [183]:
# Checking is_primary needs for cleanup

tlist = df.is_primary.unique()
print len(tlist)
print sorted(tlist)

# all secondaries removed, popping

df.pop('is_primary');

2
['no', 'yes']


In [184]:
# Checking last_renewal_date needs for cleanup

# tlist = df.last_renewal_date.unique()
# print len(tlist)
# print sorted(tlist)

# this one is problematic. There are over 14K na, the majority of members. 

# Ah! I'll see if I can fillna with the join date on the theory that renewing ~ newing. 

# alternatively I could dummy this into "has renewed" versus "has not renewed." That's problematic because it 
# reinforces the problem that lifetime members would never renew and long term members may just not have needed to.

# changing date to datetime
pd.to_datetime(df.last_renewal_date)
df.last_renewal_date = pd.to_datetime(df.last_renewal_date, format='%m/%d/%y')


In [185]:
# i think I'm going to have go to with plan a

df.last_renewal_date.fillna(df.last_renewal_date, inplace=True)


tlist = df.last_renewal_date.unique()
print len(tlist)
# print sorted(tlist)

# changing date to datetime

# df.last_renewal_date = pd.to_datetime(df.joined, format='%m/%d/%y')


fdate = pd.Series(df.last_renewal_date)

print max(fdate)
print min(fdate)

# looks good

2046
NaT
NaT


In [186]:
# Checking mem_lvl needs for cleanup

tlist = df.mem_lvl.unique()
print len(tlist)
print sorted(tlist)

# looks good. will dummy later

# secondary is causing some trouble down the line and isn't a target for any of the questions. Might be interesting
# at some point down the line, but right now easier just to cut them.

df = pd.DataFrame(df.loc[df['mem_lvl'] != "Secondary"])

tlist = df.mem_lvl.unique()
print len(tlist)
print sorted(tlist)

# can now pop this column

df.pop('mem_lvl');

2
['Primary', 'Secondary']
1
['Primary']


In [187]:
# Checking gender needs for cleanup

tlist = df.gender.unique()
print len(tlist)
print sorted(tlist)

# looks good. will dummy later


2
['Female', 'Male']


In [188]:
# Checking club_email needs for cleanup

tlist = df.club_email.unique()
print len(tlist)
print sorted(tlist)

# looks good. will dummy later

2
['No', 'Yes']


In [189]:
# Checking newsletter needs for cleanup

tlist = df.newsletter.unique()
print len(tlist)
print sorted(tlist)

# no nulls and the only answer is 'no' so I can safely remove

df.pop('newsletter');

1
['No']


In [190]:
# Checking prm_mem_nmbr needs for cleanup

# tlist = df.prm_mem_nmbr.unique()
# print len(tlist)
# print sorted(tlist)

# looks good. since I dropped secondary members I can drop this column.

df.pop('prm_mem_nmbr');

In [191]:
# Checking vis needs for cleanup

tlist = df.vis.unique()
print len(tlist)
print sorted(tlist)

# all nans, drop

df.pop('vis');

1
[nan]


In [192]:
# Checking dup needs for cleanup

tlist = df.dup.unique()
print len(tlist)
print sorted(tlist)

df['dup'].fillna('NO', inplace=True)

# question is whether this is someone coming back to the program so I'm going to assume nan
# = 'NO'. Will change and dummy later

tlist = df.dup.unique()
print len(tlist)
print sorted(tlist)

3
[nan, 'NO', 'YES']
2
['NO', 'YES']


In [193]:
# Checking parent_status needs for cleanup

tlist = df.parent_status.unique()
print len(tlist)
print sorted(tlist)

6
[nan, '\r\rNo, but we are pregnant/adopting', 'No', 'No\r\r', 'No, but we are pregnant/adopting', 'Yes']


In [194]:
# okay, those are some weird answers and I can't find an \r\r so I'm just going to change those to 'No.' 
# Later on I'll dummy

df['parent_status'].replace('\r\rNo', 'No', inplace=True)
df['parent_status'].replace('No\r\r', 'No', inplace=True)
df['parent_status'].replace('\r\rNo, but we are pregnant/adopting', 'No', inplace=True)

df['parent_status'].fillna('Yes', inplace=True)

tlist = df.parent_status.unique()
print len(tlist)
print sorted(tlist)


# There are 866 nans in here. Too many to ignore. According to SF these are secondary parents so I will change to Yes.
# will dummy later

3
['No', 'No, but we are pregnant/adopting', 'Yes']


In [195]:
# Checking kid_count needs for cleanup

# more of those r\rs. Will "translate" to what they are without the \r\r. 
# Will also condense the "not quite one" options to a "NQY". Will have to change that to numeric for future use (.5?)

# 22 members with over 4 kids. I will hazard a 6 (wtf not?) REVISED: so few in 4 or 5+ that I merged

tlist = df.kid_count.unique()
print len(tlist)
print sorted(tlist)

# drop na (50)

df['kid_count'].fillna('NA', inplace=True)
df = pd.DataFrame(df.loc[df['kid_count'] != "NA"])


df['kid_count'].replace('\r\r1', '1', inplace=True)
df['kid_count'].replace('\r\r2', '2', inplace=True)
df['kid_count'].replace('\r\rNot quite one yet (pregnant or adopting)', 'NQY', inplace=True)
df['kid_count'].replace('\r\rNot quite one yet (pregnant!)', 'NQY', inplace=True)
df['kid_count'].replace('Not quite one yet (pregnant or adopting)', 'NQY', inplace=True)
df['kid_count'].replace('more than 4', '4', inplace=True)
df['kid_count'].replace('NQY', '.5', inplace=True)

df['kid_count'] = df['kid_count'].apply(pd.to_numeric)

tlist = df.kid_count.unique()
print len(tlist)
print sorted(tlist)

df['kid_count'].value_counts()


12
[nan, '\r\r1', '\r\r2', '\r\rNot quite one yet (pregnant or adopting)', '\r\rNot quite one yet (pregnant!)', '0', '1', '2', '3', '4', 'Not quite one yet (pregnant or adopting)', 'more than 4']
6
[0.0, 0.5, 1.0, 2.0, 3.0, 4.0]


1.0    7048
2.0    4917
0.5    1879
3.0     758
0.0     202
4.0     110
Name: kid_count, dtype: int64

In [196]:

df['kid1_bday'].fillna('NA', inplace=True)
df = pd.DataFrame(df.loc[df['kid1_bday'] != "NA"])


df['kid1_bday'].replace('03/28/0004', '3/28/2004', inplace=True)
df['kid1_bday'].replace('04/10/1696', '4/10/1996', inplace=True)
df['kid1_bday'].replace('09/13/0203', '9/13/2003', inplace=True)
df['kid1_bday'].replace('10/18/0211', '10/18/2011', inplace=True)
df['kid1_bday'].replace('06/15/2040', '6/15/2004', inplace=True)

# # # changing date to datetime

for k in df.kid1_bday:
    try:
        datetime.strptime(k, '%m/%d/%y')
    except:
        print k
# try:
# df.kid1_bday = pd.to_datetime(df.kid1_bday, format='%m/%d/%y')

    
# print (df.joined.dt.date.tolist())
    
tlist = df.kid1_bday.unique()

print len(tlist)
# print sorted(tlist)


joindate = pd.Series(df.kid1_bday)

print max(joindate)
print min(joindate)

# # looks good. 
# df.head()

5592
9/9/99
1/1/00


In [197]:
# Checking kid2_bday needs for cleanup

tlist = df.kid2_bday.unique()
print len(tlist)
# print sorted(tlist)

# changing date to datetime
pd.to_datetime(df.last_renewal_date)

df.last_renewal_date = pd.to_datetime(df.joined, format='%m/%d/%y')

# # changing nan here to = first kid's birthday so that we don't have to worry too much about them later
df.kid2_bday.fillna(df.kid1_bday, inplace=True)

fdate = pd.Series(df.kid2_bday)

tlist = df.kid1_bday.unique()
print len(tlist)
# print sorted(tlist)


print max(fdate)
print min(fdate)

fdate.sort_values(inplace=True)
fdate

3579
5592
9/9/99
1/1/01


10535    1/1/01
1887     1/1/01
475      1/1/01
10589    1/1/01
7973     1/1/01
10790    1/1/01
10697    1/1/01
9108     1/1/01
10816    1/1/01
11199    1/1/01
9921     1/1/01
1917     1/1/01
5419     1/1/01
1862     1/1/01
8308     1/1/01
10755    1/1/01
1689     1/1/01
2370     1/1/01
1068     1/1/01
2488     1/1/01
2413     1/1/01
599      1/1/01
2427     1/1/01
1818     1/1/01
7485     1/1/01
1472     1/1/01
7483     1/1/01
2109     1/1/01
115      1/1/01
6390     1/1/01
          ...  
5254     9/9/06
1022     9/9/08
4846     9/9/08
8145     9/9/09
10658    9/9/10
15144    9/9/10
8750     9/9/10
4840     9/9/10
13080    9/9/11
12692    9/9/12
8236     9/9/12
4647     9/9/12
10641    9/9/13
13406    9/9/13
14222    9/9/14
13654    9/9/14
13725    9/9/14
943      9/9/14
13577    9/9/14
12979    9/9/14
5553     9/9/15
3038     9/9/15
5087     9/9/15
6989     9/9/15
15122    9/9/15
5981     9/9/15
6838     9/9/16
12524    9/9/16
14037    9/9/16
13534    9/9/99
Name: kid2_bday, dtype: 

In [198]:
# Checking baby_group needs for cleanup

# tlist = df.baby_group.unique()
# print len(tlist)
# print sorted(tlist)

# changing date to datetime
pd.to_datetime(df.last_renewal_date)

# another field that might be interesting for something, but for now I don't know that it brings clarity to the
# questions at hand

df.pop('baby_group');

In [199]:
# Checking join_reason needs for cleanup

# this is an open response field so I'll keep it in here for now but will make separate doc for numerical analysis. 
# will do nlp with it later in project

# join reason snuck in and made all sorts of suckiness with its rampant line breaks!

df['join_reason'] = df['join_reason'].str.replace('\r', '')

tlist = df.join_reason.unique()
print len(tlist)
print sorted(tlist)

8792
[nan, ' I would like to connect with parent of teenagers, explore job opportunities and summer activities for younger teens.', '!', '#NAME?', "'If there are emails that yell 'DC can answer this!' I'll send those to ya.'", '-', "- first-time parents- sister-in-law recommended it- we have no idea what we're doing- I'm going to be home for a few months and am terrified, etc.", '- joining for baby group for my 8 mth old son, resources if needed at some point for my 4 yo son- would love to sell a few baby items not using any more- really interested in the business networking aspect, about to start my own consulting business - ', '--', '-----', '.', '. My husband and I are Canadians that moved from LA to NYC a few yrs ago.', '...', '05/30/197602/16/1985Birthdate requirement changed', '1) Item listings and 2) advice!', '1) To use other PS small business and community services2) I would like to run a ad for my family photography business3( I would like to hook up with mom baby groups to s

In [200]:
# Checking advice_grp needs for cleanup

# tlist = df.advice_grp.unique()
# print len(tlist)
# print sorted(tlist)

# it looks like this is a field where you enter an email to join the group. I'll change this to a yes/no
# yes = 1, no = 2

def adv(x):
    if type(x) == str:
        return 1
    else:
        return 0
    
df['advice_grp'] = df['advice_grp'].apply(adv);

# dfc.pop('cluster')
tlist = df.advice_grp.unique()
print len(tlist)
print sorted(tlist)

2
[0, 1]


In [201]:
# Checking classifieds needs for cleanup

# tlist = df.classifieds.unique()
# print len(tlist)
# print sorted(tlist)

# same as above
# it looks like this is a field where you enter an email to join the group. I'll change this to a yes/no
# yes = 1, no = 2

def adv(x):
    if type(x) == str:
        return 1
    else:
        return 0
    
df['classifieds'] = df['classifieds'].apply(adv);

# dfc.pop('cluster')
tlist = df.classifieds.unique()
print len(tlist)
print sorted(tlist)

2
[0, 1]


In [202]:
# Checking classifieds_spouse needs for cleanup

# tlist = df.classifieds_spouse.unique()
# print len(tlist)
# print sorted(tlist)

# doubt we'll get much from this, but since it's the same code as the prior two might as well go with it.

# it looks like this is a field where you enter an email to join the group. I'll change this to a yes/no
# yes = 1, no = 2

def adv(x):
    if type(x) == str:
        return 1
    else:
        return 0
    
df['classifieds_spouse'] = df['classifieds_spouse'].apply(adv);

# dfc.pop('cluster')
tlist = df.classifieds_spouse.unique()
print len(tlist)
print sorted(tlist)

2
[0, 1]


In [203]:
# Checking tony_kids needs for cleanup

tlist = df.tony_kids.unique()
print len(tlist)
print sorted(tlist)

# will assume nan = no

df['tony_kids'].fillna('No', inplace=True)

3
[nan, 'No', 'Yes']


In [204]:
# Checking discovered needs for cleanup

# tlist = df.discovered.unique()
# print len(tlist)
# print sorted(tlist)

fdate = pd.Series(df.discovered)

fdate

# will change nan to 'NA', take the \r\rs out. Later should be able to dummy

df['discovered'].fillna('NA', inplace=True)

df['discovered'].replace('\r\rFound it through a Google search', 'Found it through a Google search', inplace=True)
df['discovered'].replace('\r\rHeard about it on another online parenting group (Urban Baby, etc.)',\
                         'Heard about it on another online parenting group (Urban Baby, etc.)', inplace=True)
df['discovered'].replace('\r\rOther', 'Other', inplace=True)

tlist = df.discovered.unique()
print len(tlist)
print sorted(tlist)

9
["A PSP member I don't know told me about it", 'A PSP member who is a friend/neighbor', 'Found it through Yahoo', 'Found it through a Google search', 'Heard about it on another online parenting group (Urban Baby, etc.)', 'Heard about it through a magazine, newspaper, blog', "I don't remember", 'NA', 'Other']


In [205]:
df.head()

Unnamed: 0,mem_no,address,city,state,zip,joined,exp_date,status,mem_type,last_renewal_date,...,parent_status,kid_count,kid1_bday,kid2_bday,join_reason,advice_grp,classifieds,classifieds_spouse,tony_kids,discovered
14,2,438_12th_street,brooklyn,NY,11215,2009-02-16,2020-02-15,Active,Lifetime Member,2009-02-16,...,Yes,2.0,4/14/85,4/14/85,dfs,1,1,1,Yes,A PSP member who is a friend/neighbor
169,4,580_5th_street,brooklyn,NY,11215,2009-04-13,2020-04-12,Active,Lifetime Member,2009-04-13,...,Yes,2.0,12/11/02,12/11/02,,1,1,0,Yes,A PSP member I don't know told me about it
239,101,502_13th_st,brooklyn,NY,11215,2002-07-17,2014-09-15,Expired,1 year membership ($40),2002-07-17,...,Yes,2.0,1/1/01,1/1/01,no,1,1,0,No,Other
831,118,1512_10th_ave,brooklyn,NY,11215,2002-08-13,2017-06-15,Active,1 year membership ($40),2002-08-13,...,Yes,2.0,5/17/02,9/14/04,This is a renewal,1,1,0,No,A PSP member who is a friend/neighbor
350,121,434_13th_st,brooklyn,NY,11215,2002-08-26,2019-06-16,Active,3 year membership ($110),2002-08-26,...,Yes,2.0,10/5/01,7/31/07,"Yes, I'm in love with you, Susan Fox! :-)",1,1,0,Yes,A PSP member who is a friend/neighbor


In [206]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14914 entries, 14 to 109
Data columns (total 23 columns):
mem_no                14914 non-null object
address               14914 non-null object
city                  14914 non-null object
state                 14914 non-null object
zip                   14914 non-null object
joined                14914 non-null datetime64[ns]
exp_date              14914 non-null datetime64[ns]
status                14914 non-null object
mem_type              14914 non-null object
last_renewal_date     14914 non-null datetime64[ns]
gender                14914 non-null object
club_email            14914 non-null object
dup                   14914 non-null object
parent_status         14914 non-null object
kid_count             14914 non-null float64
kid1_bday             14914 non-null object
kid2_bday             14914 non-null object
join_reason           10144 non-null object
advice_grp            14914 non-null int64
classifieds           14914 non-

In [207]:
df.isnull().sum()

mem_no                   0
address                  0
city                     0
state                    0
zip                      0
joined                   0
exp_date                 0
status                   0
mem_type                 0
last_renewal_date        0
gender                   0
club_email               0
dup                      0
parent_status            0
kid_count                0
kid1_bday                0
kid2_bday                0
join_reason           4770
advice_grp               0
classifieds              0
classifieds_spouse       0
tony_kids                0
discovered               0
dtype: int64

In [208]:
df.to_csv("../../projects/psp/raw_data/PSP_data_4capstone.csv", index=False, date_format='%Y-%m-%d')

In [209]:
# df_in = pd.read_csv('../../projects/psp/raw_data/PSP_data_4capstone.csv')

In [210]:
# df_in.head()