## Data Collection and Cleaning

In [1]:
#!pip install pyreadstat
import pandas as pd
import numpy as np

### Data Collection

In [2]:
# Source of the survey: https://www.pewresearch.org/internet/?post_type=dataset
# Directory that holds the data files, and the name of the raw SPSS (.sav) file
file_path = './data/'
file_name = 'ATP W56.sav'

# Read the survey into a DataFrame and preview its first three rows
spss_path = file_path + file_name
df = pd.read_spss(spss_path)
df.head(3)

Unnamed: 0,QKEY,INTERVIEW_START_W56,INTERVIEW_END_W56,DEVICE_TYPE_W56,SAMPLE_W56,FORM_W56,MARITAL_W56,MARITAL2_W56,RELATEVER_W56,CASUAL_W56,...,F_PARTYSUM_FINAL,F_INCOME,F_INCOME_RECODE,F_REG,REG_KP,F_IDEO,F_ACSWEB,F_VOLSUM,WEIGHT_W56_ATPONLY,WEIGHT_W56
0,25.0,13790640000.0,13790640000.0,Mobile,KP,Form 2,Married,,,,...,Dem/Lean Dem,"$150,000 or more","$75,000+",,Yes- registered to vote,Liberal,Accesses Internet by paying a cell phone compa...,Yes,,0.038493
1,26.0,13790640000.0,13790640000.0,Mobile,KP,Form 2,Never been married,"No, not in a committed romantic relationship","No, have never been in a committed romantic re...","No, not casually dating anyone",...,Dem/Lean Dem,"Less than $10,000","<$30,000",,Yes- registered to vote,Liberal,Accesses Internet by paying a cell phone compa...,No,,0.265996
2,27.0,13790640000.0,13790640000.0,Mobile,KP,Form 1,Never been married,"Yes, in a committed romantic relationship",,,...,Dem/Lean Dem,"Less than $10,000","<$30,000",,No- not registered to vote,Moderate,Does not access the Internet by paying a cell ...,No,,0.763675


In [3]:
# Check size of data: df.shape is a (number of rows, number of columns) tuple
df.shape

(4860, 183)

### Data Cleaning
- Reviewed all questions in this survey and removed the ones that will not be used in this project.

In [4]:
# List of columns not to use in analysis
unused_columns = [
    'QKEY', 'INTERVIEW_START_W56', 'INTERVIEW_END_W56', 'DEVICE_TYPE_W56', 'SAMPLE_W56',
    'BREAKUP_W56',
    'POTENTIALDATE1.a_W56', 'POTENTIALDATE1.b_W56', 'POTENTIALDATE1.c_W56',
    'POTENTIALDATE1.d_W56', 'POTENTIALDATE1.e_W56',
    'POTENTIALDATE2.a_W56', 'POTENTIALDATE2.b_W56', 'POTENTIALDATE2.c_W56',
    'POTENTIALDATE2.d_W56',
    'POTENTIALDATE3.a_W56', 'POTENTIALDATE3.b_W56', 'POTENTIALDATE3.c_W56',
    'POTENTIALDATE3.d_W56',
    'FORM_W56', 'WEIGHT_W56_ATPONLY', 'WEIGHT_W56',
    'F_REG', 'REG_KP', 'F_IDEO', 'F_PARTY_FINAL', 'F_PARTYLN_FINAL', 'F_PARTYSUM_FINAL',
    'F_CITIZEN', 'F_BORN',
    'FIRSTDATE.a_W56', 'FIRSTDATE.b_W56', 'FIRSTDATE.c_W56',
    'DATEACCEPT.a_W56', 'DATEACCEPT.b_W56', 'DATEACCEPT.c_W56', 'DATEACCEPT.d_W56',
    'DATEACCEPT.e_W56',
    'F_RELIG', 'F_ATTEND', 'F_ACSWEB', 'F_VOLSUM', 'F_INCOME_RECODE',
]

# The drop is all-or-nothing: before this cell runs every listed column is
# expected to be present, and after it has run none of them are. A partial
# match therefore points at a misspelled or unexpected column name, so fail
# loudly rather than silently skipping it.
absent = [name for name in unused_columns if name not in df.columns]
if absent and len(absent) != len(unused_columns):
    raise KeyError('Columns to drop were not found in df: {}'.format(absent))

# Rebind `df` instead of mutating it with inplace=True; errors='ignore' turns a
# re-run of this cell into a no-op instead of a KeyError.
df = df.drop(columns=unused_columns, errors='ignore')

- Divide the questions by category so that plotting by category is easier and quicker. These lists are used with a for loop to quickly draw a first graph of every feature. Once the relevant plots are identified, clean up the code and remove the unnecessary graphs.

In [5]:
# Relationship-status category; MOTIVNODATE has sub-questions a through h
on_relationship_status = [
    'MARITAL_W56',
    'MARITAL2_W56',
    'RELATEVER_W56',
    'CASUAL_W56',
    'SEEKING_W56',
    *['MOTIVNODATE.{}_W56'.format(part) for part in 'abcdefgh'],
    'TOTRELDUR_W56',
    'FAMSURV19RELAT_W56',
    'FAMSURV19DATING_W56',
]

# Internet-use category: stand-alone questions followed by the lettered
# BREAKUPACCEPTF1/F2 (a-e) and PARTNERDEVICE (a-c) sub-questions
on_internet_use = (
    ['SNSUSE_W56', 'SNSPOST_W56', 'ONLINEDATE_W56', 'ONLINEDATE2_W56', 'ONCHECK_W56']
    + ['BREAKUPACCEPTF1.{}_W56'.format(part) for part in 'abcde']
    + ['BREAKUPACCEPTF2.{}_W56'.format(part) for part in 'abcde']
    + ['PARTNERDEVICE.{}_W56'.format(part) for part in 'abc']
)


# Harassment category: two stand-alone questions, then the lettered
# HARASSEXP1/2 (a-c) and ONHARASS (a-d) sub-questions
on_harassment = [
    'DATEHARASSM_W56',
    'DATEHARASSW_W56',
    *['HARASSEXP1.{}_W56'.format(part) for part in 'abc'],
    *['HARASSEXP2.{}_W56'.format(part) for part in 'abc'],
    *['ONHARASS.{}_W56'.format(part) for part in 'abcd'],
]

# Internet-opinion category. Every column appears exactly once: a repeated name
# would be plotted twice by a loop over this list and would produce a duplicated
# column when used as df[on_internet_opinion].
on_internet_opinion = [
    'ONIMPACT_W56',
    'ONIMPACTPOSOE_M1_W56', 'ONIMPACTPOSOE_M2_W56', 'ONIMPACTPOSOE_M3_W56',
    'ONIMPACTNEGOE_M1_W56', 'ONIMPACTNEGOE_M2_W56', 'ONIMPACTNEGOE_M3_W56',
    'ONSUCCESS_W56',
    'ONSAFE_W56',
    'ONPROBLEM.a_W56', 'ONPROBLEM.b_W56', 'ONPROBLEM.c_W56', 'ONPROBLEM.d_W56',
    'ONPROBLEM.e_W56',
    'DATE10YR_W56',
    'WHYDATE10YRHARDOE_M1_W56', 'WHYDATE10YRHARDOE_M2_W56', 'WHYDATE10YRHARDOE_M3_W56',
    'WHYDATE10YRHARD_TECH_W56',
    'WHYDATE10YREASYOE_M1_W56', 'WHYDATE10YREASYOE_M2_W56', 'WHYDATE10YREASYOE_M3_W56',
    'WHYDATE10YREASY_TECH_W56',
    'ONDATA_W56',
]

# Columns grouped under the individual-characteristics category
individual_characs = ['PARTNERSEX_W56', 'ORIENTATIONMOD_W56']

# Privacy category: PARTNERPASS (a-c), PARTNERTRACK, then SNSCHECK (a-b)
on_privacy = (
    ['PARTNERPASS.{}_W56'.format(part) for part in 'abc']
    + ['PARTNERTRACK_W56']
    + ['SNSCHECK.{}_W56'.format(part) for part in 'ab']
)
       
# Dating-preferences category: the ONPROFILES sub-questions a through i
on_dating_preferences = ['ONPROFILES.{}_W56'.format(part) for part in 'abcdefghi']


# Online-experiences category. The element order is kept as-is: DATEVOCAB.a
# sits before the ONMSG/ONALGORITHM questions and DATEVOCAB.b-e after them.
on_online_experiences = (
    ['ONFIND.{}_W56'.format(part) for part in 'abcd']
    + ['DATEVOCAB.a_W56', 'ONMSGF1_W56', 'ONMSGF2_W56', 'ONALGORITHM_W56']
    + ['DATEVOCAB.{}_W56'.format(part) for part in 'bcde']
    + ['DATEGHOST_W56', 'ONEXPGEN_W56']
    + ['ONFEEL.{}_W56'.format(part) for part in 'abc']
    + ['ONEXP.{}_W56'.format(part) for part in 'abc']
)

# How-people-meet category: three stand-alone questions, then WHYDATEDIFF a-f
on_how_meet = [
    'PARTNERMEET_W56',
    'ONMEET_W56',
    'DATEDIFF_W56',
    *['WHYDATEDIFF.{}_W56'.format(part) for part in 'abcdef'],
]

# Feelings category: stand-alone questions, then the lettered PARTNERSCREEN
# (a-c), DATEPRESSURE (a-c) and SNSPARTNER (a-b) sub-questions
on_feelings = (
    ['SNSSEE1_W56', 'SNSSEE2_W56', 'SNSFEEL_W56', 'PARTNERDISTRACT_W56']
    + ['PARTNERSCREEN.{}_W56'.format(part) for part in 'abc']
    + ['DATEPRESSURE.{}_W56'.format(part) for part in 'abc']
    + ['SNSPARTNER.{}_W56'.format(part) for part in 'ab']
)

In [6]:
# Demographics info lives in the columns whose names start with 'F_'
has_demographic_prefix = df.columns.str.startswith('F_')
demographics = df.columns[has_demographic_prefix].tolist()
demographics

['F_METRO',
 'F_CREGION',
 'F_USR_SELFID',
 'F_AGECAT',
 'F_SEX',
 'F_EDUCCAT',
 'F_EDUCCAT2',
 'F_RACETHN',
 'F_NATIVITY',
 'F_MARITAL',
 'F_INCOME']

### Null Values
There are null values in the question columns because not all questions were asked of all groups. For example, people who responded that they were in a committed relationship did not answer questions about dating. For the columns with demographic information, all data is complete. For questions where the answer is "Refused", meaning the participant did not answer the question, those responses will be removed from the data when it is plotted during EDA. 

In [7]:
# Count the missing values in each demographic column
df.loc[:, demographics].isna().sum()

F_METRO         0
F_CREGION       0
F_USR_SELFID    0
F_AGECAT        0
F_SEX           0
F_EDUCCAT       0
F_EDUCCAT2      0
F_RACETHN       0
F_NATIVITY      0
F_MARITAL       0
F_INCOME        0
dtype: int64

Write csv file with cleaned dataset

In [8]:
# Write the cleaned frame into the data directory, leaving out the row index
cleaned_csv_path = file_path + 'cleaned_PEW_W56.csv'
df.to_csv(cleaned_csv_path, index=False)