# Imports

In [28]:
import numpy as np
import pandas as pd
import os
from dateutil.relativedelta import relativedelta
import datetime

In [29]:
# Switch the working directory to the data folder so later cells can use bare file names.
# NOTE(review): the path is relative to the current working directory, so re-running this
# cell without restarting the kernel resolves '../data' from inside the data folder itself.
os.chdir('../data')
# List the directory contents to confirm which CSV files are available.
!ls

ukb43673.csv	     ukb43673_72cols_6mo.csv
ukb43673_72cols.csv  ukb_response_impute_ref_stats_group_ds3.csv


# load data with filtered features

The input CSV was produced by `data_col_filter_whole.ipynb`.

In [30]:
# Read the column-filtered extract; the bare file name relies on the os.chdir call above.
# The file was saved with its index, which comes back as an 'Unnamed: 0' column.
df_og = pd.read_csv('ukb43673_72cols.csv')

In [31]:
# List the column labels of the freshly loaded frame.
print(df_og.columns)

Index(['Unnamed: 0', '53-2.0', '1920-2.0', '1930-2.0', '1940-2.0', '1950-2.0',
       '1960-2.0', '1970-2.0', '1980-2.0', '1990-2.0', '2000-2.0', '2010-2.0',
       '2020-2.0', '2030-2.0', '2050-2.0', '2060-2.0', '2070-2.0', '2080-2.0',
       '20400-0.0', '20505-0.0', '20506-0.0', '20507-0.0', '20508-0.0',
       '20509-0.0', '20510-0.0', '20511-0.0', '20512-0.0', '20513-0.0',
       '20514-0.0', '20515-0.0', '20516-0.0', '20517-0.0', '20518-0.0',
       '20519-0.0', '20520-0.0', '21003-2.0', '21023-0.0', '21024-0.0',
       '21028-0.0', '21029-0.0', '21030-0.0', '21031-0.0', '21032-0.0',
       '21033-0.0', '21034-0.0', '21035-0.0', '21036-0.0', '21037-0.0',
       '21038-0.0', '21039-0.0', '21040-0.0', '21041-0.0', '21042-0.0',
       '21043-0.0', '21044-0.0', '21045-0.0', '21047-0.0', '21048-0.0',
       '21049-0.0', '21051-0.0', '21052-0.0', '21053-0.0', '21054-0.0',
       '21055-0.0', '21056-0.0', '21058-0.0', '21059-0.0', '21060-0.0',
       '21061-0.0', '21064-0.0', '21065-0.0

In [32]:
# The CSV was saved with its index, which read_csv loads back as an 'Unnamed: 0' column.
# errors='ignore' keeps this cell re-runnable: on a second run the column is already
# gone and the drop is a no-op instead of raising KeyError.
df_og = df_og.drop(columns=['Unnamed: 0'], errors='ignore')
print(df_og.shape)

(502493, 72)


In [33]:
# Display the frame; pandas abbreviates the rendering for a frame this large.
df_og

Unnamed: 0,53-2.0,1920-2.0,1930-2.0,1940-2.0,1950-2.0,1960-2.0,1970-2.0,1980-2.0,1990-2.0,2000-2.0,...,21055-0.0,21056-0.0,21058-0.0,21059-0.0,21060-0.0,21061-0.0,21064-0.0,21065-0.0,21068-0.0,eid
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,-600.0,-600.0,-600.0,-600.0,-601.0,-600.0,0.0,0.0,0.0,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,-600.0,-600.0,-600.0,-600.0,-601.0,-601.0,-121.0,-121.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502488,,,,,,,,,,,...,-601.0,-602.0,-601.0,-600.0,-602.0,-601.0,0.0,0.0,0.0,6025017.0
502489,2020-03-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,6025022.0
502490,,,,,,,,,,,...,-601.0,-600.0,-600.0,-600.0,-600.0,-600.0,0.0,0.0,0.0,6025036.0
502491,,,,,,,,,,,...,,,,,,,,,,6025045.0


In [34]:
# Summarise each column's non-null count and dtype.
df_og.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 502493 entries, 0 to 502492
Data columns (total 72 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   53-2.0     49001 non-null   object 
 1   1920-2.0   48642 non-null   float64
 2   1930-2.0   48642 non-null   float64
 3   1940-2.0   48642 non-null   float64
 4   1950-2.0   48642 non-null   float64
 5   1960-2.0   48642 non-null   float64
 6   1970-2.0   48642 non-null   float64
 7   1980-2.0   48642 non-null   float64
 8   1990-2.0   48642 non-null   float64
 9   2000-2.0   48642 non-null   float64
 10  2010-2.0   48642 non-null   float64
 11  2020-2.0   48642 non-null   float64
 12  2030-2.0   48642 non-null   float64
 13  2050-2.0   48642 non-null   float64
 14  2060-2.0   48642 non-null   float64
 15  2070-2.0   48642 non-null   float64
 16  2080-2.0   48642 non-null   float64
 17  20400-0.0  157371 non-null  object 
 18  20505-0.0  157340 non-null  float64
 19  20506-0.0  157340 non-n

In [35]:
# Count the missing values in every column.
df_og.isnull().sum()

53-2.0       453492
1920-2.0     453851
1930-2.0     453851
1940-2.0     453851
1950-2.0     453851
              ...  
21061-0.0    327734
21064-0.0    327734
21065-0.0    327734
21068-0.0    327734
eid            2000
Length: 72, dtype: int64

In [36]:
# Parse the three date columns into datetime64 so they can be subtracted later
# when selecting records that fall within 6 months of each other:
#   53-2.0    : date of attending assessment center
#   20400-0.0 : date of completing mental health questionnaire
#   21023-0.0 : date of completing digestive health questionnaire
for date_col in ('53-2.0', '20400-0.0', '21023-0.0'):
    df_og[date_col] = pd.to_datetime(df_og[date_col])

In [37]:
# Inspect the parsed assessment-center date column.
df_og['53-2.0']

0               NaT
1               NaT
2               NaT
3               NaT
4               NaT
            ...    
502488          NaT
502489   2020-03-10
502490          NaT
502491          NaT
502492          NaT
Name: 53-2.0, Length: 502493, dtype: datetime64[ns]

In [38]:
# Inspect the parsed mental health questionnaire date column.
df_og['20400-0.0']

0               NaT
1               NaT
2        2016-08-26
3               NaT
4               NaT
            ...    
502488   2016-10-10
502489          NaT
502490   2016-08-22
502491          NaT
502492          NaT
Name: 20400-0.0, Length: 502493, dtype: datetime64[ns]

In [39]:
# Inspect the parsed digestive health questionnaire column: unlike the two date
# columns above, these values also carry a time-of-day component.
df_og['21023-0.0']

0                        NaT
1                        NaT
2        2017-05-08 19:11:53
3                        NaT
4        2017-12-05 18:37:16
                 ...        
502488   2017-04-19 08:07:06
502489                   NaT
502490   2017-05-03 08:55:26
502491                   NaT
502492                   NaT
Name: 21023-0.0, Length: 502493, dtype: datetime64[ns]

In [40]:
# Drop the time-of-day component and keep only the calendar day, so this column is
# at the same resolution as the other two date columns.
# .dt.normalize() resets every timestamp to midnight in one vectorised step and leaves
# NaT untouched; it replaces a per-row Python lambda followed by a second parse.
df_og['21023-0.0'] = df_og['21023-0.0'].dt.normalize()

In [41]:
# Confirm the column now shows dates only.
df_og['21023-0.0']

0               NaT
1               NaT
2        2017-05-08
3               NaT
4        2017-12-05
            ...    
502488   2017-04-19
502489          NaT
502490   2017-05-03
502491          NaT
502492          NaT
Name: 21023-0.0, Length: 502493, dtype: datetime64[ns]

In [42]:
# Exploratory check: assessment date minus mental health questionnaire date,
# keeping only the differences below 183 days (6 mos).
# This upper bound alone is not enough: negative differences larger than 6 mos
# in magnitude still pass the filter.
(
    df_og['53-2.0']
    .sub(df_og['20400-0.0'])
    .loc[lambda lapse: lapse < pd.Timedelta(days=183)]
)

26       -838 days
28       -205 days
195      -756 days
310       -85 days
365       155 days
            ...   
502328   -555 days
502338   -597 days
502345   -310 days
502468     20 days
502481   -402 days
Length: 10751, dtype: timedelta64[ns]

In [43]:
# Store the signed gap between the assessment date and each questionnaire date
# (positive when the assessment came after the questionnaire):
#   time_lapse1 -> versus the mental health questionnaire (20400-0.0)
#   time_lapse2 -> versus the digestive health questionnaire (21023-0.0)
df_og['time_lapse1'] = df_og['53-2.0'].sub(df_og['20400-0.0'])
df_og['time_lapse2'] = df_og['53-2.0'].sub(df_og['21023-0.0'])
df_og

Unnamed: 0,53-2.0,1920-2.0,1930-2.0,1940-2.0,1950-2.0,1960-2.0,1970-2.0,1980-2.0,1990-2.0,2000-2.0,...,21058-0.0,21059-0.0,21060-0.0,21061-0.0,21064-0.0,21065-0.0,21068-0.0,eid,time_lapse1,time_lapse2
0,NaT,,,,,,,,,,...,,,,,,,,,NaT,NaT
1,NaT,,,,,,,,,,...,,,,,,,,,NaT,NaT
2,NaT,,,,,,,,,,...,-600.0,-600.0,-601.0,-600.0,0.0,0.0,0.0,,NaT,NaT
3,NaT,,,,,,,,,,...,,,,,,,,,NaT,NaT
4,NaT,,,,,,,,,,...,-600.0,-600.0,-601.0,-601.0,-121.0,-121.0,0.0,,NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502488,NaT,,,,,,,,,,...,-601.0,-600.0,-602.0,-601.0,0.0,0.0,0.0,6025017.0,NaT,NaT
502489,2020-03-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,6025022.0,NaT,NaT
502490,NaT,,,,,,,,,,...,-600.0,-600.0,-600.0,-600.0,0.0,0.0,0.0,6025036.0,NaT,NaT
502491,NaT,,,,,,,,,,...,,,,,,,,6025045.0,NaT,NaT


In [44]:
# Keep only rows where BOTH lapses lie strictly inside (-183, 183) days (6 mos),
# i.e. the assessment date is within 6 months of each questionnaire in either direction.
# Rows with a missing lapse (NaT) compare as False and are therefore excluded.
six_months = pd.Timedelta(days=183)
within_window = (
    (df_og['time_lapse1'].abs() < six_months)
    & (df_og['time_lapse2'].abs() < six_months)
)
df_og_6 = df_og[within_window].copy()

df_og_6

Unnamed: 0,53-2.0,1920-2.0,1930-2.0,1940-2.0,1950-2.0,1960-2.0,1970-2.0,1980-2.0,1990-2.0,2000-2.0,...,21058-0.0,21059-0.0,21060-0.0,21061-0.0,21064-0.0,21065-0.0,21068-0.0,eid,time_lapse1,time_lapse2
365,2017-02-15,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-600.0,-600.0,-600.0,-601.0,0.0,-121.0,0.0,,155 days,-85 days
430,2017-09-15,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,-600.0,-601.0,-602.0,-601.0,0.0,1.0,0.0,,143 days,143 days
443,2016-10-30,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-600.0,-600.0,-601.0,-601.0,0.0,1.0,0.0,,59 days,-178 days
1092,2017-05-20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-601.0,-600.0,-600.0,-601.0,0.0,0.0,0.0,,103 days,5 days
1641,2017-07-30,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,-600.0,-600.0,-602.0,-601.0,0.0,-121.0,0.0,,17 days,79 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501111,2016-11-22,1.0,1.0,0.0,1.0,-1.0,1.0,1.0,0.0,0.0,...,-601.0,-600.0,-601.0,-602.0,1.0,1.0,0.0,6011244.0,95 days,-163 days
501354,2016-12-06,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.0,...,-602.0,-600.0,-601.0,-600.0,0.0,0.0,0.0,6013674.0,106 days,-162 days
502002,2016-12-21,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.0,...,-600.0,-600.0,-600.0,-601.0,0.0,0.0,0.0,6020156.0,121 days,-137 days
502065,2017-01-18,1.0,0.0,-1.0,1.0,1.0,-1.0,1.0,0.0,-1.0,...,-600.0,-600.0,-601.0,-600.0,-121.0,0.0,0.0,6020782.0,68 days,-94 days


In [46]:
# Summarise non-null counts and dtypes of the 6-month subset.
df_og_6.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1493 entries, 365 to 502230
Data columns (total 74 columns):
 #   Column       Non-Null Count  Dtype          
---  ------       --------------  -----          
 0   53-2.0       1493 non-null   datetime64[ns] 
 1   1920-2.0     1485 non-null   float64        
 2   1930-2.0     1485 non-null   float64        
 3   1940-2.0     1485 non-null   float64        
 4   1950-2.0     1485 non-null   float64        
 5   1960-2.0     1485 non-null   float64        
 6   1970-2.0     1485 non-null   float64        
 7   1980-2.0     1485 non-null   float64        
 8   1990-2.0     1485 non-null   float64        
 9   2000-2.0     1485 non-null   float64        
 10  2010-2.0     1485 non-null   float64        
 11  2020-2.0     1485 non-null   float64        
 12  2030-2.0     1485 non-null   float64        
 13  2050-2.0     1485 non-null   float64        
 14  2060-2.0     1485 non-null   float64        
 15  2070-2.0     1485 non-null   float

In [47]:
# Count the missing values that remain in the 6-month subset.
df_og_6.isnull().sum()

53-2.0         0
1920-2.0       8
1930-2.0       8
1940-2.0       8
1950-2.0       8
              ..
21065-0.0      0
21068-0.0      0
eid            5
time_lapse1    0
time_lapse2    0
Length: 74, dtype: int64

In [48]:
# Write the 6-month subset to the current working directory; index=False avoids
# saving the index as an extra column.
# NOTE(review): time_lapse1/time_lapse2 are timedelta columns and will be stored as
# text in the CSV, so they presumably need re-parsing on load — confirm downstream use.
# NOTE(review): the directory listing at the top shows 'ukb43673_72cols_6mo.csv', not
# this 74-column file name — verify which file later notebooks read.
df_og_6.to_csv('ukb43673_74cols_6mo.csv', index = False)