# Curate_V2_WeatherFeatures_data

In [1]:
%matplotlib inline

import datetime as dt
import itertools as it
from IPython.core.interactiveshell import InteractiveShell
import matplotlib.pyplot as plt
import pandas as pd
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns
from tabulate import tabulate

# Echo the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
# Create the Synapse client that every later cell uses as `syn`. No credentials
# are passed to login(), so they have to come from the client's own configuration.
syn = synapseclient.Synapse()
syn.login()

Welcome, Abhishek Pratap!



In [2]:
# Synapse ID of the raw V2 weather file: fetched with syn.get() when the data is
# loaded, and listed as the `used` input when provenance is recorded.
v2sid = 'syn17095978'

In [None]:
# Download the raw V2 weather file from Synapse and load it, parsing the
# `date` column into timestamps.
v2r = pd.read_csv(
    syn.get(v2sid).path,
    parse_dates=['date']
)

# Columns that lead the curated table; every other column follows in sorted order.
leading = ['dt_passive', 'lat', 'lon', 'zipcode']

# Rename into a new frame so the raw frame `v2r` stays untouched.
v2 = v2r.rename(columns={
    'date': 'dt_passive',
    'temp_med': 'temp_median'
})
v2 = v2.reindex(
    columns=leading + sorted(set(v2.columns) - set(leading))
)

# The zipcode column is numeric after loading, so any leading zero is already
# gone (02134 arrives as 2134). Pad back to five digits when converting to text;
# missing values keep the 'none' placeholder.
# NOTE(review): assumes 5-digit US ZIP codes - confirm against the source data.
v2['zipcode'] = v2['zipcode'].map(
    lambda z: 'none' if pd.isnull(z) else str(int(z)).zfill(5)
)

# Reduce timestamps to dates. The vectorised accessor passes missing values
# through, whereas calling .date() on each element raises on NaT.
v2['dt_passive'] = v2['dt_passive'].dt.date

v2.head()

### Upload to Synapse

In [None]:
# Select every row currently in table syn17061284, then delete that row set.
existing_rows = syn.tableQuery('select * from syn17061284')
t = syn.delete(existing_rows)

In [None]:
# Describe the table: its column definitions are derived from the curated frame.
weather_schema = Schema(
    name='Weather Features',
    columns=as_table_columns(v2),
    parent='syn10848316'
)

# Store the schema together with the curated rows.
final = syn.store(Table(weather_schema, v2))

In [3]:
# Reference to this notebook, recorded as the code that was executed.
notebook_ref = {
    'name': 'Curate_V2_WeatherFeatures_data',
    'url': 'https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Curate_V2_WeatherFeatures_data.ipynb'
}

# The activity names the raw input file (`v2sid`) as what was used.
curation_activity = Activity(
    name='Generate Public Weather Features Table Data',
    description='Process the data collected during study ',
    used=[v2sid],
    executed=[notebook_ref]
)

# Attach the provenance record to table syn17061284 (this rebinds `final`).
final = syn.setProvenance('syn17061284', activity=curation_activity)

### Generate data dictionary

In [None]:
# Substring substitutions that turn a pandas dtype name into the label shown in
# the data dictionary. They are applied to each dtype string in this order.
dtype_labels = (
    ('object', 'str'),
    ('float64', 'float'),
    ('int64', 'int'),
    ('datetime64[ns]', 'DateTime'),
)

# Collect one (column name, type label) pair per column of the curated frame.
cheat = []
for c in v2.columns:
    label = str(v2[c].dtype)
    for pandas_name, shown_as in dtype_labels:
        label = label.replace(pandas_name, shown_as)
    cheat.append((c, label))

headers = ['#', 'Variable Name', 'Data Type', 'Description']

# One description per column, matched to the columns purely by position, so the
# order here has to follow the column order of `v2`.
descriptions = [
    'Date of aggregations',
    'Latitude in degree-decimal (DD) format',
    'Longitude in DD format',
    'Closest zipcode to latitude and longitude',
    'Interquartile range of hourly cloud cover recordings',
    'Mean of hourly cloud cover recordings',
    'Median of hourly cloud cover recordings',
    'Standard deviation of hourly cloud cover recordings',
    'Interquartile range of hourly dew point recordings',
    'Mean of hourly dew point recordings',
    'Median of hourly dew point recordings',
    'Standard Deviation of hourly dew point recordings',
    'Interquartile range of humidity recordings',
    'Mean of hourly humidity recordings',
    'Median of hourly humidity recordings',
    'Standard deviation of hourly humidity recordings',
    'Total precipitation recording for the day',
    'Interquartile range of temperature recordings',
    'Mean of temperature recordings',
    'Median of temperature recordings',
    'Standard deviation of temperature recordings'
]

# Turn the pairs into a frame numbered from 1; that index fills the '#' column.
row_numbers = list(range(1, len(cheat) + 1))
cheat = pd.DataFrame(cheat, columns=headers[1:-1], index=row_numbers)
cheat['Description'] = descriptions

# Render as an org-mode table, then swap every '+' for '|'
# (presumably so the header rule reads as a Markdown table - TODO confirm).
table_text = tabulate(cheat, headers=headers, tablefmt='orgtbl')
print(table_text.replace('+', '|'))