# Capstone Project

## Park Slope Parents Membership

For this portion of the project I'm going to build a classification model to determine whether someone is likely to be a long-term or short-term member.

The first step is figuring out where that line is. 

Median membership is 2 years, which is also close to the point at which I saw many members drop off relative to their child's birth (22 months). This makes 2 years seem like the ideal choice for the threshold.

I'll also be doing a cluster analysis, because I love those.

In [11]:
# Load libraries

from datetime import datetime, date, timedelta

import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import HTML
from matplotlib import cm as cm
from mpl_toolkits.mplot3d import Axes3D
from pandas.io import sql
from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sqlalchemy import create_engine
import csv
import numpy as np
import pandas as pd 
import patsy
import plotly.graph_objs as go
import plotly.plotly as py
from plotly.tools import FigureFactory as FF
from plotly.graph_objs import graph_objs
import psycopg2 as psy
import scipy
import seaborn as sns
import sklearn
# Authenticate with Plotly using credentials taken from the environment
# instead of hard-coding them in the notebook. Set PLOTLY_USERNAME and
# PLOTLY_API_KEY before starting the kernel; a missing variable raises
# KeyError here. Any API key that was previously committed in this cell
# should be regenerated.
import os
py.sign_in(os.environ['PLOTLY_USERNAME'], os.environ['PLOTLY_API_KEY'])

In [12]:
# Load the PSP membership CSV into `dfn`, then print its schema
# (dtypes / non-null counts) and preview the first rows.

dfn = pd.read_csv(
    filepath_or_buffer="../../projects/psp/raw_data/PSP_data_4capstone.csv"
)
dfn.info()
dfn.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14914 entries, 0 to 14913
Data columns (total 23 columns):
mem_no                14914 non-null object
address               14914 non-null object
city                  14914 non-null object
state                 14914 non-null object
zip                   14914 non-null int64
joined                14914 non-null object
exp_date              14914 non-null object
status                14914 non-null object
mem_type              14914 non-null object
last_renewal_date     14914 non-null object
gender                14914 non-null object
club_email            14914 non-null object
dup                   14914 non-null object
parent_status         14914 non-null object
kid_count             14914 non-null float64
kid1_bday             14914 non-null object
kid2_bday             14914 non-null object
join_reason           10144 non-null object
advice_grp            14914 non-null int64
classifieds           14914 non-null int64
classifieds_s

Unnamed: 0,mem_no,address,city,state,zip,joined,exp_date,status,mem_type,last_renewal_date,...,parent_status,kid_count,kid1_bday,kid2_bday,join_reason,advice_grp,classifieds,classifieds_spouse,tony_kids,discovered
0,2,438_12th_street,brooklyn,NY,11215,2009-02-16,2020-02-15,Active,Lifetime Member,2009-02-16,...,Yes,2.0,4/14/85,4/14/85,dfs,1,1,1,Yes,A PSP member who is a friend/neighbor
1,4,580_5th_street,brooklyn,NY,11215,2009-04-13,2020-04-12,Active,Lifetime Member,2009-04-13,...,Yes,2.0,12/11/02,12/11/02,,1,1,0,Yes,A PSP member I don't know told me about it
2,101,502_13th_st,brooklyn,NY,11215,2002-07-17,2014-09-15,Expired,1 year membership ($40),2002-07-17,...,Yes,2.0,1/1/01,1/1/01,no,1,1,0,No,Other
3,118,1512_10th_ave,brooklyn,NY,11215,2002-08-13,2017-06-15,Active,1 year membership ($40),2002-08-13,...,Yes,2.0,5/17/02,9/14/04,This is a renewal,1,1,0,No,A PSP member who is a friend/neighbor
4,121,434_13th_st,brooklyn,NY,11215,2002-08-26,2019-06-16,Active,3 year membership ($110),2002-08-26,...,Yes,2.0,10/5/01,7/31/07,"Yes, I'm in love with you, Susan Fox! :-)",1,1,0,Yes,A PSP member who is a friend/neighbor


In [13]:
# Not all types carried over from when the data was saved: the date columns
# come back from the CSV as plain strings (object dtype), so they are
# re-parsed into datetime columns here.
#
# The membership dates are stored with dash separators (e.g. 2009-02-16), so
# the format string has to use '-' as well; '%Y/%m/%d' does not match the data
# and only parses on pandas versions that are lenient about ISO separators.
# The children's birthdays are month/day/two-digit-year (e.g. 4/14/85).

date_formats = {
    'joined': '%Y-%m-%d',
    'exp_date': '%Y-%m-%d',
    'last_renewal_date': '%Y-%m-%d',
    'kid1_bday': '%m/%d/%y',
    'kid2_bday': '%m/%d/%y',
}

for date_col, date_fmt in date_formats.items():
    dfn[date_col] = pd.to_datetime(dfn[date_col], format=date_fmt)

In [14]:
# a few changes off the bat

# NOTE: mem_no is left as a regular column. The previous
# `dfn.set_index('mem_no')` call discarded its return value, so it never
# changed the index. The dummy-variable joins further down align on the
# existing row index, so that behaviour is kept and the no-op call is removed.

# getting rid of the street address and the free-text join reason
# (assigned back instead of inplace=True so the cell reads as a plain transform)
dfn = dfn.drop(['address', 'join_reason'], axis=1)

In [15]:
# One-hot encode the categorical columns and append the indicator columns to dfn.
# patsy would be easier--relearn

# text-valued categoricals
status_dummy = pd.get_dummies(dfn['status'], prefix='status')
memtype_dummy = pd.get_dummies(dfn['mem_type'], prefix='mem_type')
email_dummy = pd.get_dummies(dfn['club_email'], prefix='club_email')
dup_dummy = pd.get_dummies(dfn['dup'], prefix='dup')
parent_dummy = pd.get_dummies(dfn['parent_status'], prefix='parent_status')

# group-membership flags and how the member discovered the organisation
advice_dummy = pd.get_dummies(dfn['advice_grp'], prefix='advice_grp')
classifieds_dummy = pd.get_dummies(dfn['classifieds'], prefix='classifieds')
class_sp_dummy = pd.get_dummies(dfn['classifieds_spouse'], prefix='classifieds_spouse')
# NOTE(review): the 'tony_dids' prefix looks like a typo for 'tony_kids'. It is
# left as-is because it determines the generated column names -- confirm that
# nothing relies on the current names before renaming.
tony_dummy = pd.get_dummies(dfn['tony_kids'], prefix='tony_dids')
disc_dummy = pd.get_dummies(dfn['discovered'], prefix='discovered')

# Join each block of indicator columns onto dfn (aligned on the row index),
# in the same order the dummies were created.
dummy_frames = [
    status_dummy,
    memtype_dummy,
    email_dummy,
    dup_dummy,
    parent_dummy,
    advice_dummy,
    classifieds_dummy,
    class_sp_dummy,
    tony_dummy,
    disc_dummy,
]
for dummy_frame in dummy_frames:
    dfn = dfn.join(dummy_frame)

In [16]:
# Break the datetime columns out into numeric year / month columns.
# Each entry is (new column, source datetime column, .dt attribute to read):
#   - join date        -> year and month
#   - expiration date  -> year only
#   - 1st kid birthday -> year and month
#   - 2nd kid birthday -> year and month

date_parts = [
    ('join_year', 'joined', 'year'),
    ('join_month', 'joined', 'month'),
    ('exp_year', 'exp_date', 'year'),
    ('k1bday_year', 'kid1_bday', 'year'),
    ('k1bday_month', 'kid1_bday', 'month'),
    ('k2bday_year', 'kid2_bday', 'year'),
    ('k2bday_month', 'kid2_bday', 'month'),
]

for new_col, source_col, part in date_parts:
    dfn[new_col] = getattr(dfn[source_col].dt, part)

dfn.head()

Unnamed: 0,mem_no,city,state,zip,joined,exp_date,status,mem_type,last_renewal_date,gender,...,"discovered_Heard about it through a magazine, newspaper, blog",discovered_I don't remember,discovered_Other,join_year,join_month,exp_year,k1bday_year,k1bday_month,k2bday_year,k2bday_month
0,2,brooklyn,NY,11215,2009-02-16,2020-02-15,Active,Lifetime Member,2009-02-16,Female,...,0.0,0.0,0.0,2009,2,2020,1985,4,1985,4
1,4,brooklyn,NY,11215,2009-04-13,2020-04-12,Active,Lifetime Member,2009-04-13,Female,...,0.0,0.0,0.0,2009,4,2020,2002,12,2002,12
2,101,brooklyn,NY,11215,2002-07-17,2014-09-15,Expired,1 year membership ($40),2002-07-17,Female,...,0.0,0.0,1.0,2002,7,2014,2001,1,2001,1
3,118,brooklyn,NY,11215,2002-08-13,2017-06-15,Active,1 year membership ($40),2002-08-13,Female,...,0.0,0.0,0.0,2002,8,2017,2002,5,2004,9
4,121,brooklyn,NY,11215,2002-08-26,2019-06-16,Active,3 year membership ($110),2002-08-26,Female,...,0.0,0.0,0.0,2002,8,2019,2001,10,2007,7


In [17]:
# Keep only rows where both children's birth years fall in the range under
# consideration. This is basically for grandparents who entered their own
# children's birth dates rather than their grandchildren's. The org started in
# 2002, so birth years before 1995 are treated as out of range, as are years
# from 2018 onward.
# (An earlier version of this comment said 1990; the filter itself uses 1995.)

MIN_KID_BIRTH_YEAR = 1995  # inclusive lower bound
MAX_KID_BIRTH_YEAR = 2018  # exclusive upper bound

k1_in_range = (dfn['k1bday_year'] >= MIN_KID_BIRTH_YEAR) & (dfn['k1bday_year'] < MAX_KID_BIRTH_YEAR)
k2_in_range = (dfn['k2bday_year'] >= MIN_KID_BIRTH_YEAR) & (dfn['k2bday_year'] < MAX_KID_BIRTH_YEAR)

# .copy() gives later cells an independent frame to modify, not a view of the
# unfiltered data.
dfn = dfn.loc[k1_in_range & k2_in_range].copy()

In [18]:
# Print row count, column dtypes and non-null counts for the current dfn.
dfn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14591 entries, 1 to 14913
Data columns (total 63 columns):
mem_no                                                                            14591 non-null object
city                                                                              14591 non-null object
state                                                                             14591 non-null object
zip                                                                               14591 non-null int64
joined                                                                            14591 non-null datetime64[ns]
exp_date                                                                          14591 non-null datetime64[ns]
status                                                                            14591 non-null object
mem_type                                                                          14591 non-null object
last_renewal_date                           

In [19]:
# enumerate some columns that had been dummied so that they come up on histograms
# patsy would speed this up
#
# Each text category is mapped to an integer code; values that are not listed
# in a mapping are left unchanged.

mem_type_codes = {
    '1 year membership ($40)': 0,
    '2 Year Membership ($75)': 1,
    '3 year membership ($110)': 2,
    '5 year membership ($175)': 3,
    'Complimentary': 4,
    'Lifetime Member': 5,
    'Trial Membership': 6,
}

parent_status_codes = {
    'No': 0,
    'No, but we are pregnant/adopting': 1,
    'Yes': 2,
}

# 'NA' and 'Other' used to be assigned 5 and 6, the same codes as the
# magazine/newspaper/blog and "I don't remember" answers, which merged
# different categories together. They now get their own codes (7 and 8).
discovered_codes = {
    'A PSP member I don\'t know told me about it': 0,
    'A PSP member who is a friend/neighbor': 1,
    'Found it through Yahoo': 2,
    'Found it through a Google search': 3,
    'Heard about it on another online parenting group (Urban Baby, etc.)': 4,
    'Heard about it through a magazine, newspaper, blog': 5,
    'I don\'t remember': 6,
    'NA': 7,
    'Other': 8,
}

column_codes = [
    ('mem_type', mem_type_codes),
    ('parent_status', parent_status_codes),
    ('discovered', discovered_codes),
]

for col, codes in column_codes:
    # Assign the result back to the frame: calling replace(inplace=True) on
    # dfn[col] operates on a selected Series and is not guaranteed to update
    # dfn itself on newer pandas versions.
    dfn[col] = dfn[col].replace(codes)



In [None]:
# Print row count, column dtypes and non-null counts for the current dfn.
dfn.info()

In [None]:
# Create 0/1 labels by thresholding one field of a row.
# NOTE(review): the defaults ('average_salary' > 95) look like they were carried
# over from a different analysis and do not obviously correspond to a membership
# length -- TODO confirm, and pass `column` / `threshold` explicitly, e.g.
# dfn.apply(bucket, axis=1, column=<length column>, threshold=<cutoff>).
def bucket(x, column='average_salary', threshold=95):
    """Return 1 if ``x[column]`` is strictly greater than ``threshold``, else 0.

    Parameters
    ----------
    x : mapping-like
        Anything that supports ``x[column]``, such as a DataFrame row or a dict.
    column : str, optional
        Key/column to read from ``x``. Defaults to ``'average_salary'`` so that
        existing ``bucket(row)`` calls behave as before.
    threshold : number, optional
        Cutoff value; a value equal to the threshold is labelled 0.
        Defaults to 95.

    Returns
    -------
    int
        1 when the value is above the threshold, otherwise 0.
    """
    return 1 if x[column] > threshold else 0