In [1]:
import pandas as pd
from sqlalchemy import create_engine
import numpy as np

## Import all the data (csv files)

In [2]:
# read in all the csvs
# stopped at wv2 realizing that ww 2-6 don't have caseid_new (need to go back to ETL to fix)

wv1_pp = pd.read_csv("wv1_pp.csv")
# wv1_pp = pd.read_csv("wv1_pp.csv", index_col=0)

# read in wave 1 main question data
wv1_mq = pd.read_csv("wv1_mq.csv")
# wv1_mq = pd.read_csv("wv1_mq.csv", index_col=0)

## Join tables and drop columns

In [3]:
# join wv1 demographic info and main questions info, inner join 
wv1_all_raw = pd.merge(wv1_pp,wv1_mq,on="caseid_new", how="inner")
print(wv1_pp.shape)
print(wv1_mq.shape)
print(wv1_all_raw.shape)

(4002, 51)
(3009, 189)
(3009, 239)


In [4]:
# inspect columns
print('wave 1 all columns: ', wv1_all_raw.columns)
print('\n')


## read in the columns to keep from a txt file (with a list of all the relevant columns)
cls_to_keep = pd.read_csv("wv1_columns_to_keep.txt",header=None,delimiter="\n")
cls_to_keep = cls_to_keep[0].tolist()
print('columns to keep: ',cls_to_keep)
print('\n')

## drop other columns
wv1_all = wv1_all_raw[cls_to_keep]
print(wv1_all.shape) # 3009 rows
print('\n')
print(wv1_all.columns)

wave 1 all columns:  Index(['caseid_new', 'weight1', 'weight2', 'ppage', 'ppagecat', 'ppagect4',
       'ppeduc', 'ppeducat', 'ppethm', 'ppgender',
       ...
       'married', 'parental_approval', 'respondent_yrsed', 'partner_yrsed',
       'home_country_recode', 'US_raised', 'partner_mom_yrsed',
       'respondent_mom_yrsed', 'relationship_quality', 'coresident'],
      dtype='object', length=239)


columns to keep:  ['caseid_new', 'ppagecat', 'ppmarit', 'children_in_hh', 'papevangelical', 'papreligion', 'qflag', 's1', 'married', 's2', 'gender_attraction', 'q5', 'same_sex_couple', 'partner_race', 'respondent_race', 'q7b', 'partner_religion_reclassified', 'age_difference', 'q18a_1', 'q18a_2', 'q18a_3', 'q18a_refused', 'marrynotreally', 'civilnotreally', 'how_long_relationship', 'q24_met_online', 'q24_school', 'q24_college', 'q24_military', 'q24_church', 'q24_vol_org', 'q24_customer', 'q24_bar_restaurant', 'q24_public', 'q24_private_party', 'q24_blind_date', 'q24_vacation', 'q24_single

## Generate clean variables for api - X variables

### qualify flag 

In [5]:
# create an empty list to host all the cleaned columns 
api_cls = ["caseid_new"]

# inspect qualify flag
wv1_all['qflag'].value_counts()  # partnered (3009)

wv1_all = wv1_all.rename(columns={"qflag":"qflag_w1"})

api_cls.append('qflag_w1')
print(api_cls)
print("\n")
print(wv1_all.columns)
print(wv1_all.shape)

['caseid_new', 'qflag_w1']


Index(['caseid_new', 'ppagecat', 'ppmarit', 'children_in_hh', 'papevangelical',
       'papreligion', 'qflag_w1', 's1', 'married', 's2', 'gender_attraction',
       'q5', 'same_sex_couple', 'partner_race', 'respondent_race', 'q7b',
       'partner_religion_reclassified', 'age_difference', 'q18a_1', 'q18a_2',
       'q18a_3', 'q18a_refused', 'marrynotreally', 'civilnotreally',
       'how_long_relationship', 'q24_met_online', 'q24_school', 'q24_college',
       'q24_military', 'q24_church', 'q24_vol_org', 'q24_customer',
       'q24_bar_restaurant', 'q24_public', 'q24_private_party',
       'q24_blind_date', 'q24_vacation', 'q24_singles_service_non_internet',
       'q24_business_trip', 'q24_work_neighbor', 'met_through_friends',
       'met_through_family', 'met_through_as_neighbors',
       'met_through_as_coworkers', 'q31_1', 'q31_2', 'q31_3', 'q31_4', 'q31_5',
       'q31_6', 'q31_7', 'q31_8', 'q31_9', 'q31_other_text_entered', 'q32',
       'q32_interne

### respondent age

In [6]:
# ppagecat
wv1_all['ppagecat'].value_counts()
# 35-44    679
# 45-54    601
# 25-34    560
# 55-64    511
# 65-74    261
# 18-24    228
# 75+      169

35-44    679
45-54    601
25-34    560
55-64    511
65-74    261
18-24    228
75+      169
Name: ppagecat, dtype: int64

### marrital status

In [7]:
# inspect all marrital status columns and pick one to add to api_columns

wv1_all['ppmarit'].value_counts() # married, living with partner, never married, divorced, widowned, seperated
wv1_all['s1'].value_counts() # yes i am married (1928), no i'm not married (1081)
wv1_all['s2'].value_counts() # i have a boyfriend/ girlfriend (975), i have a romantic partner who is not yet a sexual partner (106)
wv1_all['married'].value_counts() # married (1928), not married (1081)

api_cls.append('married')
print(api_cls)

['caseid_new', 'qflag_w1', 'married']


### children; age gap

In [8]:
# inspect children in household and add to api_columns
wv1_all['children_in_hh'].value_counts() # 0 (2195 respondents),1,2,3,4,5,6,7
api_cls.append('children_in_hh')
print(api_cls)
print("\n")

# inspect age gap and add to api_columns
wv1_all['age_difference'].value_counts() # 1 to 70 years, mean = 4.7, 50th percentile = 3
print(wv1_all['age_difference'].describe())
print("\n")

api_cls.append('age_difference')
print(api_cls)
print("\n")

### Bin the age gap 

bins = [-1,3,5,10,20,70]
group_names = ["0 to 3","4 to 5","6 to 10","11 to 20",">=20"]


wv1_all["age_gap_bin"] = pd.cut(wv1_all["age_difference"], bins, labels=group_names)
print("\n")
print(wv1_all["age_gap_bin"].value_counts())

api_cls.append('age_gap_bin')
print(api_cls)
print("\n")

['caseid_new', 'qflag_w1', 'married', 'children_in_hh']


count    2990.000000
mean        4.733445
std         5.432778
min         0.000000
25%         1.000000
50%         3.000000
75%         6.000000
max        70.000000
Name: age_difference, dtype: float64


['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference']




0 to 3      1633
6 to 10      548
4 to 5       469
11 to 20     281
>=20          59
Name: age_gap_bin, dtype: int64
['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin']




### sexuality

In [9]:
# inspect sexuality and pick one

wv1_all['gender_attraction'].value_counts() 
# oppsite gender only (2281), only same gender (358), mostly opposite (161), same gender mostly (111), both gender equally (84)
wv1_all['q5'].value_counts() # yes we are a same-sex couple (464), no we are an opposite-sex couple (214), refused (2)
wv1_all['same_sex_couple'].value_counts() # different sex couple (2535), same-sex couple (474)
api_cls.append('same_sex_couple')
print(api_cls)

['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple']


###  race and religion

In [10]:
# same race or not 
print(wv1_all['partner_race'].value_counts())
print('\n')
print(wv1_all['respondent_race'].value_counts())

# initialization
wv1_all['race_gap'] = 1000

for i in range(len(wv1_all.index)):

    if wv1_all['partner_race'][i] == wv1_all['respondent_race'][i]:
        wv1_all['race_gap'][i] = 0
    else:
        wv1_all['race_gap'][i] = 1

        
print(wv1_all['race_gap'].value_counts())
print('\n')


api_cls.append('race_gap')
print(api_cls)

NH white                  2408
 NH black                  242
Hispanic                   235
 NH Asian Pac Islander      61
 NH Other                   33
 NH Amer Indian             22
Name: partner_race, dtype: int64


NH white                  2337
Hispanic                   315
 NH black                  234
 NH Asian Pac Islander      62
 NH Amer Indian             30
 NH Other                   28
Name: respondent_race, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


0    2486
1     523
Name: race_gap, dtype: int64


['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap']


In [11]:
# same religion or not
# q7b
# papreligion

# print(wv1_all['q7b'].value_counts())
# print('\n')
# print(wv1_all['papreligion'].value_counts())
# print('\n')

# change the "protestant" value to make them consistent (remove the , after "e.g.")

wv1_all['papreligion_2'] = wv1_all['papreligion'].replace("protestant (e.g., methodist, lutheran, presbyterian, episcopal)","protestant (e.g. methodist, lutheran, presbyterian, episcopal)")

print(wv1_all['q7b'].value_counts())
print('\n')
print(wv1_all['papreligion_2'].value_counts())
print('\n')

# initialization
wv1_all['religious_gap'] = 1000

for i in range(len(wv1_all.index)):

    if wv1_all['q7b'][i] == wv1_all['papreligion_2'][i]:
        wv1_all['religious_gap'][i] = 0
    else:
        wv1_all['religious_gap'][i] = 1

        
print(wv1_all['religious_gap'].value_counts())
print('\n')
# 1    1693
# 0    1316

catholic                                                          679
protestant (e.g. methodist, lutheran, presbyterian, episcopal)    663
none                                                              542
baptist - any denomination                                        421
other christian                                                   372
jewish                                                             83
pentecostal                                                        71
mormon                                                             65
other non-christian, please specify                                56
buddhist                                                           18
muslim                                                             13
eastern orthodox                                                   11
hindu                                                               8
refused                                                             7
Name: q7b, dtype: in

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


1    1693
0    1316
Name: religious_gap, dtype: int64




In [12]:
api_cls.append('religious_gap')
print(api_cls)

['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap']


### education gap

In [13]:
# columns
# partner_yrsed
# respondent_yrsed

print(wv1_all['partner_yrsed'].value_counts())
print(wv1_all['partner_yrsed'].describe())
print("\n")

# mean       13.899867
# 50%        13.000000
# max        20.000000


print(wv1_all['respondent_yrsed'].value_counts())
print(wv1_all['respondent_yrsed'].describe())
# mean       13.852110
# 50%        13.000000
# max        20.000000

12.0    814
13.0    701
16.0    609
17.0    324
14.0    267
20.0    114
11.0     51
10.0     49
7.5      27
9.0      25
5.5      13
2.5       5
0.0       2
Name: partner_yrsed, dtype: int64
count    3001.000000
mean       13.899867
std         2.475491
min         0.000000
25%        12.000000
50%        13.000000
75%        16.000000
max        20.000000
Name: partner_yrsed, dtype: float64


12.0    836
16.0    659
13.0    644
17.0    287
14.0    225
20.0    125
11.0     77
10.0     61
9.0      44
7.5      41
5.5       7
2.5       2
0.0       1
Name: respondent_yrsed, dtype: int64
count    3009.000000
mean       13.852110
std         2.520441
min         0.000000
25%        12.000000
50%        13.000000
75%        16.000000
max        20.000000
Name: respondent_yrsed, dtype: float64


In [14]:
wv1_all["edu_gap"] = wv1_all['respondent_yrsed'] - wv1_all['partner_yrsed']
wv1_all["edu_gap"] = wv1_all["edu_gap"].abs()
print(wv1_all["edu_gap"].describe())

# mean        1.705432
# min         0.000000
# 25%         0.000000
# 50%         1.000000
# 75%         3.000000
# max        14.500000

api_cls.append('edu_gap')
print(api_cls)

count    3001.000000
mean        1.705432
std         1.945045
min         0.000000
25%         0.000000
50%         1.000000
75%         3.000000
max        14.500000
Name: edu_gap, dtype: float64
['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap', 'edu_gap']


In [15]:
# Bin the education gap

bins = [-1,2,4,6,15]
group_names = ["0 to 2","3 to 4","5 to 6",">6"]

wv1_all["edu_gap_bin"] = pd.cut(wv1_all["edu_gap"], bins, labels=group_names)
print("\n")
print(wv1_all["edu_gap_bin"].value_counts())

api_cls.append('edu_gap_bin')
print(api_cls)
print("\n")



0 to 2    2116
3 to 4     653
5 to 6     130
>6         102
Name: edu_gap_bin, dtype: int64
['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap', 'edu_gap', 'edu_gap_bin']




### Same political standing or not

In [16]:
## TBD

### Parental Approval

In [17]:
# parental_approval

print(wv1_all['parental_approval'].value_counts())
api_cls.append('parental_approval')
print(api_cls)
# approve                        1643
# don't approve or don't know     460

approve                        1643
don't approve or don't know     460
Name: parental_approval, dtype: int64
['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap', 'edu_gap', 'edu_gap_bin', 'parental_approval']


## Generate How Met variables

### 1) Met online

In [18]:
# inspect met_online columns
# q24_met_online
# q31_4
# q32
# q32_internet
# either_internet
# either_internet_adjusted

wv1_all['q24_met_online'].value_counts() 
# met offline (2664), met online (270)

wv1_all['q31_4'].value_counts() 
# did you met through personal ads dating services on line? no (2843), yes (150)

wv1_all['q32'].value_counts()
# no, we did not meet through the internet                                    2702
# yes, an internet dating or matchmaking site (like eharmony or match.com)      99
# yes, an internet chat room                                                    61
# yes, a different kind of internet service                                     61
# yes, a social networking site (like facebook or myspace)                      53
# yes, an internet classified advertising site (like craigslist)                20
# refused                                                                       13

wv1_all['q32_internet'].value_counts() 
# 0.0    2702
# 1.0     294

wv1_all['either_internet'].value_counts() 
# No     2690
# Yes     311

wv1_all['either_internet_adjusted'].value_counts() 
# not met online                                   2690
# met online                                        286
# probably not met online, q32 and q24 disagree      25

not met online                                   2690
met online                                        286
probably not met online, q32 and q24 disagree      25
Name: either_internet_adjusted, dtype: int64

In [19]:
# Convert "either_internet_adjusted" into a 0/1 binary variable and add to api_columns

wv1_all['met_online'] = 1000 # initialize

for i in range(len(wv1_all.index)):

    if wv1_all['either_internet_adjusted'][i]!='not met online':
        wv1_all['met_online'][i] = 1
    else:
        wv1_all['met_online'][i] = 0


print(wv1_all["either_internet_adjusted"].value_counts())
print(wv1_all["met_online"].value_counts())
print(wv1_all.shape)
        
api_cls.append('met_online')
print(api_cls)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


not met online                                   2690
met online                                        286
probably not met online, q32 and q24 disagree      25
Name: either_internet_adjusted, dtype: int64
0    2690
1     319
Name: met_online, dtype: int64
(3009, 86)
['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap', 'edu_gap', 'edu_gap_bin', 'parental_approval', 'met_online']


### 2) Met at work

In [20]:
# inspect met at work columns
# q24_customer
# q24_work_neighbor
# met_through_as_coworkers
# q31_1

wv1_all['q24_customer'].value_counts() 
# No 2699, Yes 235

wv1_all['q24_work_neighbor'].value_counts() 
# No 2923, Yes 11

wv1_all['met_through_as_coworkers'].value_counts() 
# 0.0 = 2349, 1.0 = 495

wv1_all['q31_1'].value_counts() 
# no 2512, yes 481, refused 16


#### Generate the met_at_work flag if any of the 4 questions have a YES

wv1_all['met_at_work'] = 1000 # initialize

for i in range(len(wv1_all.index)):

    if wv1_all['q24_customer'][i]=='Yes' or wv1_all['q24_work_neighbor'][i]=='Yes' or wv1_all['met_through_as_coworkers'][i]== 1.0 or wv1_all['q31_1'][i] == 'yes':
        wv1_all['met_at_work'][i] = 1
    else:
        wv1_all['met_at_work'][i] = 0

wv1_all['met_at_work'].value_counts() 
# 0    2255
# 1     754


### Check columns and append met_at_work flag to the list
print(wv1_all.shape)
api_cls.append('met_at_work')
print(api_cls)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(3009, 87)
['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap', 'edu_gap', 'edu_gap_bin', 'parental_approval', 'met_online', 'met_at_work']


### 3) met at school

In [21]:
# Relevant Columns
# q24_school
# q24_college
# q31_2

wv1_all['q24_school'].value_counts()
# No     2642
# Yes     292

wv1_all['q24_college'].value_counts()
# No     2676
# Yes     258

wv1_all['q31_2'].value_counts()
# no         2564
# yes         429
# refused      16

no         2564
yes         429
refused      16
Name: q31_2, dtype: int64

In [22]:
#### Generate the met_at_school flag if any of the 3 questions have a YES

wv1_all['met_at_school'] = 1000 # initialize

for i in range(len(wv1_all.index)):

    if wv1_all['q24_school'][i]=='Yes' or wv1_all['q24_college'][i]=='Yes' or wv1_all['q31_2'][i] == 'yes':
        wv1_all['met_at_school'][i] = 1
    else:
        wv1_all['met_at_school'][i] = 0

wv1_all['met_at_school'].value_counts() 
# 0    2423
# 1     586


## Check columns and append met_at_work flag to the list
print(wv1_all.shape)
        
api_cls.append('met_at_school')
print(api_cls)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(3009, 88)
['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap', 'edu_gap', 'edu_gap_bin', 'parental_approval', 'met_online', 'met_at_work', 'met_at_school']


### 4) met at church

In [23]:
# Relevant Columns
# q24_church
# q31_3


wv1_all['q24_church'].value_counts()
# No     2740
# Yes     194

wv1_all['q31_3'].value_counts()
# no         2813
# yes         180
# refused      16

no         2813
yes         180
refused      16
Name: q31_3, dtype: int64

In [24]:
#### Generate the met_at_church flag if any of the questions have a YES

wv1_all['met_at_church'] = 1000 # initialize

for i in range(len(wv1_all.index)):

    if wv1_all['q24_church'][i] == 'Yes' or wv1_all['q31_3'][i] == 'yes':
        wv1_all['met_at_church'][i] = 1
    else:
        wv1_all['met_at_church'][i] = 0

wv1_all['met_at_church'].value_counts() 
# 0    2781
# 1     228


# Check columns and append met_at_church flag to the list
print(wv1_all.shape)
api_cls.append('met_at_church')
print(api_cls)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(3009, 89)
['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap', 'edu_gap', 'edu_gap_bin', 'parental_approval', 'met_online', 'met_at_work', 'met_at_school', 'met_at_church']


### 5) Met while travel

In [25]:
# Columns
# q24_vacation
# q24_business_trip
# q31_5

wv1_all['q24_vacation'].value_counts()
# No     2863
# Yes      71

wv1_all['q24_business_trip'].value_counts()
# No     2905
# Yes      29

wv1_all['q31_5'].value_counts()
# met on vacation/business trip
# no         2952
# yes          41
# refused      16

no         2952
yes          41
refused      16
Name: q31_5, dtype: int64

In [26]:
#### Generate flag

wv1_all['met_travel'] = 1000 # initialize

for i in range(len(wv1_all.index)):

    if wv1_all['q24_vacation'][i] == 'Yes' or wv1_all['q24_business_trip'][i] == 'Yes' or wv1_all['q31_5'][i] == 'yes':
        wv1_all['met_travel'][i] = 1
    else:
        wv1_all['met_travel'][i] = 0

wv1_all['met_travel'].value_counts() 
# 0    2895
# 1     114


# # Check columns and append met_at_church flag to the list
print(wv1_all.shape)
api_cls.append('met_travel')
print(api_cls)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(3009, 90)
['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap', 'edu_gap', 'edu_gap_bin', 'parental_approval', 'met_online', 'met_at_work', 'met_at_school', 'met_at_church', 'met_travel']


### 6) met at social organization/gym/ health club/ volunteer service

In [27]:
# Columns
# q24_vol_org
# q31_7

wv1_all['q24_vol_org'].value_counts()
# No     2699
# Yes     235

wv1_all['q31_7'].value_counts()
# "met at social organization/health club/gym/volunteer-service activity"
# no         2852
# yes         141
# refused      16

no         2852
yes         141
refused      16
Name: q31_7, dtype: int64

In [28]:
#### Generate the met_social flag

wv1_all['met_social'] = 1000 # initialize

for i in range(len(wv1_all.index)):

    if wv1_all['q24_vol_org'][i] == 'Yes' or wv1_all['q31_7'][i] == 'yes':
        wv1_all['met_social'][i] = 1
    else:
        wv1_all['met_social'][i] = 0

print(wv1_all['met_social'].value_counts())
# 0    2708
# 1     301

# Check columns and append met_at_church flag to the list
print(wv1_all.shape)
api_cls.append('met_social')
print(api_cls)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0    2708
1     301
Name: met_social, dtype: int64
(3009, 91)
['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap', 'edu_gap', 'edu_gap_bin', 'parental_approval', 'met_online', 'met_at_work', 'met_at_school', 'met_at_church', 'met_travel', 'met_social']


### 7) Met at private party


In [29]:
## columns
# q24_private_party
# q31_8 met at private party

print(wv1_all['q24_private_party'].value_counts())
print("\n")
# No     2561
# Yes     373

print(wv1_all['q31_8'].value_counts())
print("\n")
# met at a privary party
# no         2658
# yes         335
# refused      16

#### Generate flag

wv1_all['met_party'] = 1000 # initialize

for i in range(len(wv1_all.index)):

    if wv1_all['q24_private_party'][i] == 'Yes' or wv1_all['q31_8'][i] == 'yes':
        wv1_all['met_party'][i] = 1
    else:
        wv1_all['met_party'][i] = 0

print(wv1_all['met_party'].value_counts())
# 0    2503
# 1     506

# Check columns and append met_at_church flag to the list
print(wv1_all.shape)
api_cls.append('met_party')
print(api_cls)

No     2561
Yes     373
Name: q24_private_party, dtype: int64


no         2658
yes         335
refused      16
Name: q31_8, dtype: int64




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0    2503
1     506
Name: met_party, dtype: int64
(3009, 92)
['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap', 'edu_gap', 'edu_gap_bin', 'parental_approval', 'met_online', 'met_at_work', 'met_at_school', 'met_at_church', 'met_travel', 'met_social', 'met_party']


### 8) Met through friends and family

In [30]:
## columns
# q24_blind_date
# met_through_friends
# met_through_family
# q33_1
# q33_2
# q33_3
# q33_4
# q33_5


print(wv1_all['q24_blind_date'].value_counts())
print("\n")
# No     2840
# Yes      94

print(wv1_all['met_through_friends'].value_counts())
print("\n")
# not met through friends    1914
# meet through friends       1020

print(wv1_all['met_through_family'].value_counts())
print("\n")
# not met through family    2490
# met through family         444


print(wv1_all['q33_1'].value_counts())
print("\n")
# introduced by family 
# no         2712
# yes         279
# refused      18

print(wv1_all['q33_2'].value_counts())
print("\n")
# introduced by mutual friends
# no         2025
# yes         966
# refused      18


print(wv1_all['q33_3'].value_counts())
print("\n")
# introduced by co workers
# no         2760
# yes         231
# refused      18


print(wv1_all['q33_4'].value_counts())
print("\n")
# introduced by classmates
# no         2863
# yes         128
# refused      18

print(wv1_all['q33_5'].value_counts())
print("\n")

# introduced by neighbors
# no         2947
# yes          44
# refused      18

No     2840
Yes      94
Name: q24_blind_date, dtype: int64


not met through friends    1914
meet through friends       1020
Name: met_through_friends, dtype: int64


not met through family    2490
met through family         444
Name: met_through_family, dtype: int64


no         2712
yes         279
refused      18
Name: q33_1, dtype: int64


no         2025
yes         966
refused      18
Name: q33_2, dtype: int64


no         2760
yes         231
refused      18
Name: q33_3, dtype: int64


no         2863
yes         128
refused      18
Name: q33_4, dtype: int64


no         2947
yes          44
refused      18
Name: q33_5, dtype: int64




In [31]:
#### Generate flag

wv1_all['met_f_and_f'] = 1000 # initialize

for i in range(len(wv1_all.index)):

    if wv1_all['q24_blind_date'][i] == 'Yes' or wv1_all['met_through_friends'][i] == 'meet through friends' or wv1_all['met_through_family'][i] == 'met through family':
        wv1_all['met_f_and_f'][i] = 1
    else:
        wv1_all['met_f_and_f'][i] = 0

print(wv1_all['met_f_and_f'].value_counts())
print("\n")
# 0    1709
# 1    1300


# # Check columns and append met_at_church flag to the list
print(wv1_all.shape)
api_cls.append('met_f_and_f')
print(api_cls)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0    1709
1    1300
Name: met_f_and_f, dtype: int64


(3009, 93)
['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap', 'edu_gap', 'edu_gap_bin', 'parental_approval', 'met_online', 'met_at_work', 'met_at_school', 'met_at_church', 'met_travel', 'met_social', 'met_party', 'met_f_and_f']


### 9) Met as neighbors

In [32]:
# met_through_as_neighbors

print(wv1_all['met_through_as_neighbors'].value_counts())
print("\n")
# did not meet through or as neighbors    2665
# met through or as neighbors              269

#### Generate flag

wv1_all['met_as_neighbors'] = 1000 # initialize

for i in range(len(wv1_all.index)):

    if wv1_all['met_through_as_neighbors'][i] == 'met through or as neighbors':
        wv1_all['met_as_neighbors'][i] = 1
    else:
        wv1_all['met_as_neighbors'][i] = 0

print(wv1_all['met_as_neighbors'].value_counts())
print("\n")
# 0    2740
# 1     269

# # Check columns and append met_at_church flag to the list
print(wv1_all.shape)
api_cls.append('met_as_neighbors')
print(api_cls)

did not meet through or as neighbors    2665
met through or as neighbors              269
Name: met_through_as_neighbors, dtype: int64




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


0    2740
1     269
Name: met_as_neighbors, dtype: int64


(3009, 94)
['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap', 'edu_gap', 'edu_gap_bin', 'parental_approval', 'met_online', 'met_at_work', 'met_at_school', 'met_at_church', 'met_travel', 'met_social', 'met_party', 'met_f_and_f', 'met_as_neighbors']


### 10) Met at public space (bar, restaurant, dance club etc.)

In [33]:
# columns
# q24_bar_restaurant
# q24_public
# q31_6

print(wv1_all['q24_bar_restaurant'].value_counts())
print("\n")
# No     2368
# Yes     566

print(wv1_all['q24_public'].value_counts())
print("\n")
# met in public space
# No     2717
# Yes     217

print(wv1_all['q31_6'].value_counts())
print("\n")
# met at [bar/nightclub/dance club]
# no         2646
# yes         347
# refused      16

No     2368
Yes     566
Name: q24_bar_restaurant, dtype: int64


No     2717
Yes     217
Name: q24_public, dtype: int64


no         2646
yes         347
refused      16
Name: q31_6, dtype: int64




In [34]:
#### Generate flag

wv1_all['met_public_space'] = 1000 # initialize

for i in range(len(wv1_all.index)):
    if wv1_all['q24_bar_restaurant'][i] == 'Yes' or wv1_all['q24_public'][i] == 'Yes' or wv1_all['q31_6'][i] == 'yes':
        wv1_all['met_public_space'][i] = 1
    else:
        wv1_all['met_public_space'][i] = 0

print(wv1_all['met_public_space'].value_counts())
print("\n")
# 0    2190
# 1     819

# # Check columns and append met_at_church flag to the list
print(wv1_all.shape)
api_cls.append('met_public_space')
print(api_cls)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


0    2190
1     819
Name: met_public_space, dtype: int64


(3009, 95)
['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap', 'edu_gap', 'edu_gap_bin', 'parental_approval', 'met_online', 'met_at_work', 'met_at_school', 'met_at_church', 'met_travel', 'met_social', 'met_party', 'met_f_and_f', 'met_as_neighbors', 'met_public_space']


### 11) offline dating service

In [35]:
# q24_singles_service_non_internet

print(wv1_all['q24_singles_service_non_internet'].value_counts())
print("\n")
# No     2889
# Yes      45

#### Generate flag

wv1_all['met_offline_dating'] = 1000 # initialize

for i in range(len(wv1_all.index)):
    if wv1_all['q24_singles_service_non_internet'][i] == 'Yes':
        wv1_all['met_offline_dating'][i] = 1
    else:
        wv1_all['met_offline_dating'][i] = 0

print(wv1_all['met_offline_dating'].value_counts())
print("\n")
# No     2889
# Yes      45

# # Check columns and append met_at_church flag to the list
print(wv1_all.shape)
api_cls.append('met_offline_dating')
print(api_cls)

No     2889
Yes      45
Name: q24_singles_service_non_internet, dtype: int64




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0    2964
1      45
Name: met_offline_dating, dtype: int64


(3009, 96)
['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap', 'edu_gap', 'edu_gap_bin', 'parental_approval', 'met_online', 'met_at_work', 'met_at_school', 'met_at_church', 'met_travel', 'met_social', 'met_party', 'met_f_and_f', 'met_as_neighbors', 'met_public_space', 'met_offline_dating']


### 12) met other

In [36]:
# q24_military
# q31_9
# q31_other_text_entered
# q33_6
# q33_7
# q33_other_text_entered
# how_met_online


print(wv1_all['q24_military'].value_counts())
print("\n")
# No     2856
# Yes      78

print(wv1_all['q31_9'].value_counts())
print("\n")
# if other where
# no         1930
# yes        1063
# refused      16

print(wv1_all['q31_other_text_entered'].value_counts())
print("\n")
# No     1983
# Yes    1026

print(wv1_all['q33_6'].value_counts())
print("\n")
# introduced self or partner introduced self
# no         1781
# yes        1210
# refused      18

print(wv1_all['q33_7'].value_counts())
print("\n")
# introduced by [other]
# no         2687
# yes         304
# refused      18


print(wv1_all['q33_other_text_entered'].value_counts())
print("\n")
# No     2721
# Yes     288

print(wv1_all['how_met_online'].value_counts())
# Previously Strangers: Before online connection respondent and partner were strangers                                  243
# Probably Did Not meet partner online, despite positive answer to q32 or q24                                            25
# Mediated: Online connection was mediated by friends, family, or others                                                 23
# reconnected: already knew partner but reconnected online                                                               18
# We cannot tell from the existed data whether the respondent and partner knew each other prior to online connection      2

No     2856
Yes      78
Name: q24_military, dtype: int64


no         1930
yes        1063
refused      16
Name: q31_9, dtype: int64


No     1983
Yes    1026
Name: q31_other_text_entered, dtype: int64


no         1781
yes        1210
refused      18
Name: q33_6, dtype: int64


no         2687
yes         304
refused      18
Name: q33_7, dtype: int64


No     2721
Yes     288
Name: q33_other_text_entered, dtype: int64


Previously Strangers: Before online connection respondent and partner were strangers                                  243
Probably Did Not meet partner online, despite positive answer to q32 or q24                                            25
Mediated: Online connection was mediated by friends, family, or others                                                 23
reconnected: already knew partner but reconnected online                                                               18
We cannot tell from the existed data whether the respondent and partner knew each other

In [37]:
#### Generate flag

wv1_all['met_other'] = 1000 # initialize

for i in range(len(wv1_all.index)):
    if wv1_all['q24_military'][i] == 'Yes':
        wv1_all['met_other'][i] = 1
    else:
        wv1_all['met_other'][i] = 0

print(wv1_all['met_other'].value_counts())
print("\n")
# 0    2931
# 1      78

# # Check columns and append met_at_church flag to the list
print(wv1_all.shape)
api_cls.append('met_other')
print(api_cls)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


0    2931
1      78
Name: met_other, dtype: int64


(3009, 97)
['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap', 'edu_gap', 'edu_gap_bin', 'parental_approval', 'met_online', 'met_at_work', 'met_at_school', 'met_at_church', 'met_travel', 'met_social', 'met_party', 'met_f_and_f', 'met_as_neighbors', 'met_public_space', 'met_offline_dating', 'met_other']


## Generate clean variables for api - Y variables

In [38]:
# inspect relationship length and add to api cls list

print(wv1_all['how_long_relationship'].describe())
# count    2982.000000
# mean       17.707897
# std        15.661472
# min         0.000000
# 25%         5.000000
# 50%        13.000000
# 75%        26.000000
# max        76.000000

api_cls.append('how_long_relationship')
print(api_cls)

count    2982.000000
mean       17.707897
std        15.661472
min         0.000000
25%         5.000000
50%        13.000000
75%        26.000000
max        76.000000
Name: how_long_relationship, dtype: float64
['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap', 'edu_gap', 'edu_gap_bin', 'parental_approval', 'met_online', 'met_at_work', 'met_at_school', 'met_at_church', 'met_travel', 'met_social', 'met_party', 'met_f_and_f', 'met_as_neighbors', 'met_public_space', 'met_offline_dating', 'met_other', 'how_long_relationship']


In [39]:
# inspect relationship quality and add to api cls list

wv1_all['q34'].value_counts() # excellent (1771), good (911), fair (252), poor(42), very poor (20), refused (13)
wv1_all['relationship_quality'].value_counts() # excellent (1771), good (911), fair (252), poor(42), very poor (20)

api_cls.append('relationship_quality')
print(api_cls)

['caseid_new', 'qflag_w1', 'married', 'children_in_hh', 'age_difference', 'age_gap_bin', 'same_sex_couple', 'race_gap', 'religious_gap', 'edu_gap', 'edu_gap_bin', 'parental_approval', 'met_online', 'met_at_work', 'met_at_school', 'met_at_church', 'met_travel', 'met_social', 'met_party', 'met_f_and_f', 'met_as_neighbors', 'met_public_space', 'met_offline_dating', 'met_other', 'how_long_relationship', 'relationship_quality']


## Create api_datatable dataframe

In [40]:
api_datatable_df = wv1_all[api_cls]
api_datatable_df

Unnamed: 0,caseid_new,qflag_w1,married,children_in_hh,age_difference,age_gap_bin,same_sex_couple,race_gap,religious_gap,edu_gap,...,met_travel,met_social,met_party,met_f_and_f,met_as_neighbors,met_public_space,met_offline_dating,met_other,how_long_relationship,relationship_quality
0,22526,partnered,not married,0,4.0,4 to 5,same-sex couple,1,1,2.0,...,0,0,0,0,0,1,0,0,7.00,good
1,23286,partnered,married,0,2.0,0 to 3,same-sex couple,0,1,1.0,...,0,0,0,1,1,0,0,0,8.00,good
2,26315,partnered,not married,0,9.0,6 to 10,same-sex couple,0,1,2.0,...,0,0,0,1,0,0,0,0,8.00,good
3,28536,partnered,not married,0,2.0,0 to 3,same-sex couple,0,1,1.0,...,0,0,0,0,0,1,0,0,12.00,good
4,29584,partnered,married,0,7.0,6 to 10,different sex couple,0,0,0.0,...,0,0,0,0,0,0,0,0,30.00,good
5,31456,partnered,not married,0,0.0,0 to 3,same-sex couple,0,1,3.0,...,0,0,0,0,0,0,0,0,4.00,excellent
6,32656,partnered,not married,0,0.0,0 to 3,same-sex couple,0,0,0.0,...,0,0,1,1,0,0,0,0,27.00,excellent
7,33536,partnered,not married,0,9.0,6 to 10,same-sex couple,1,0,3.0,...,0,0,0,0,0,1,0,0,15.00,excellent
8,34341,partnered,married,0,0.0,0 to 3,different sex couple,0,1,0.0,...,0,1,0,1,1,0,0,0,14.00,excellent
9,35653,partnered,not married,0,15.0,11 to 20,same-sex couple,0,1,4.0,...,0,0,0,0,0,0,0,0,14.00,excellent


In [41]:
api_datatable_df.to_csv("api_datatable_wv1.csv")