# Importing Libraries

In [1]:
# Preprocessing notebook; most recently used for the London data

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from basic_imports import *

from download_data import *
from project_specific.philap_helpers import *
from project_specific.daphne_helpers import *
from project_specific.peeps_helpers import *
from math import radians, cos, sin, asin, sqrt
from pixelgrams import *
import folium
import datetime
from constants import *
from load_files import *
from scipy.stats import *
from scipy import spatial
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm
import numpy as np

In [2]:
def load_mexico_airrespeck_file(filepath, project_name, timestamp_column_name='timestamp'):
    """Read one Airspeck/AirRespeck CSV export into a timestamp-indexed DataFrame.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    project_name : str
        Key into ``project_mapping``; element ``[1]`` of the mapped entry is
        handed to ``tz_convert`` as the target timezone.
    timestamp_column_name : str, default 'timestamp'
        Name of the CSV column that holds the timestamps.

    Returns
    -------
    pandas.DataFrame or None
        The file contents indexed by the converted timestamps (also kept as a
        regular ``'timestamp'`` column), with the literal string ``'None'``
        replaced by NaN. ``None`` is returned, after printing a message, when
        the file is empty or cannot be opened.
    """
    try:
        # Keep the timestamp column as text so pandas does not guess a dtype for it.
        # The converter is keyed on the caller-supplied column name rather than
        # on a hard-coded 'timestamp'.
        data = pd.read_csv(filepath, converters={timestamp_column_name: str})
    except pd.errors.EmptyDataError:
        print("Skipped file because it is empty")
        return None
    except IOError:
        print("Skipped file because it doesn't exist")
        return None

    # `.dt.tz_convert` only accepts timezone-aware values, so the timestamps in
    # the file must carry a UTC offset.
    data['timestamp'] = pd.to_datetime(data[timestamp_column_name]).dt.tz_convert(
        project_mapping[project_name][1])
    data = data.set_index('timestamp')
    # Keep the timestamps available as an ordinary column as well as the index.
    data['timestamp'] = data.index
    return data.replace('None', np.nan)

# File loading Functions

In [3]:
def load_mexico_static_airspeck_file(sid_or_uuid, project_name=None, sensor_label=None, suffix_filename="",
                              upload_type='automatic',
                              subject_visit_number=None, calibrate_pm=False, calibrate_ox=False, calibrate_no2=False,
                              use_all_features_for_pm_calibration=False,
                              use_all_features_for_gas_calibration=True,
                              return_calibration_flag=False, calibration_id=None, filename=None,
                              country_name=None):
    """Load the calibrated CSV of one stationary Airspeck sensor.

    The file name is ``<label>[_<suffix_filename>]_calibrated.csv`` inside the
    directory stored at ``project_mapping[project_name][2]``, unless an explicit
    ``filename`` is given.

    Parameters
    ----------
    sid_or_uuid : str
        Subject id or sensor UUID; used as the file label when `sensor_label` is None.
    project_name : str, optional
        Key into ``project_mapping``. When omitted and `sid_or_uuid` has six
        characters it is looked up with ``get_project_for_subject``.
    sensor_label : str, optional
        Overrides the label part of the file name.
    suffix_filename : str, default ""
        Optional suffix placed between the label and ``_calibrated.csv``.
    upload_type : {'automatic', 'sd_card'}
        Only validated here; it does not influence which file is read.
    subject_visit_number : optional
        Appended to the label as ``"<id>(<visit>)"`` when given.
    calibrate_pm, calibrate_ox, calibrate_no2 : bool
        If any is true the loaded data is passed through ``calibrate_airspeck``.
    use_all_features_for_pm_calibration, use_all_features_for_gas_calibration,
    calibration_id, country_name
        Forwarded unchanged to ``calibrate_airspeck``.
    return_calibration_flag : bool
        When true *and* a calibration was requested, return the tuple
        ``(result_date, was_calibrated_pm, was_calibrated_no2, was_calibrated_ox, data)``.
    filename : str, optional
        Explicit file name; skips the label/suffix construction.

    Returns
    -------
    Whatever ``load_mexico_airrespeck_file`` returned (possibly calibrated), or
    the five-element tuple described above.
    """
    assert upload_type in ['automatic', 'sd_card'], "upload_type has to be either 'automatic' or 'sd_card'"

    if project_name is None and len(sid_or_uuid) == 6:
        project_name = get_project_for_subject(sid_or_uuid)

    if sensor_label is None:
        if subject_visit_number is None:
            sensor_label = "{}".format(sid_or_uuid)
        else:
            sensor_label = "{}({})".format(sid_or_uuid, subject_visit_number)

    if filename is None:
        # Previously the suffix was passed to format() but had no placeholder,
        # so it was silently ignored. An empty suffix keeps the old file name.
        if suffix_filename:
            filename = "{}_{}_calibrated.csv".format(sensor_label, suffix_filename)
        else:
            filename = "{}_calibrated.csv".format(sensor_label)

    filepath = project_mapping[project_name][2] + filename
    print("Loading file: {}".format(filepath))
    data = load_mexico_airrespeck_file(filepath, project_name)

    if calibrate_pm or calibrate_ox or calibrate_no2:
        # NOTE(review): `data` may be None here if the file was missing or empty;
        # whether calibrate_airspeck copes with that is not visible from this notebook.
        result_date, was_calibrated_pm, was_calibrated_no2, was_calibrated_ox, data = calibrate_airspeck(
            sid_or_uuid, data, calibrate_pm=calibrate_pm, calibrate_no2=calibrate_no2,
            calibrate_ox=calibrate_ox, project_name=project_name, calibration_id=calibration_id,
            use_all_features_pm=use_all_features_for_pm_calibration,
            use_all_features_gas=use_all_features_for_gas_calibration, country_name=country_name)

        if return_calibration_flag:
            return result_date, was_calibrated_pm, was_calibrated_no2, was_calibrated_ox, data

    return data

In [4]:
# Directory that is listed below to discover the available *_calibrated.csv files.
# NOTE(review): machine-specific absolute path — the notebook will not run elsewhere
# without editing this line.
mypath = '/Users/azamkhan/speckled/london/static_data'

In [5]:
from os import listdir
from os.path import isfile, join

# Names of the regular files (sub-directories excluded) directly inside `mypath`.
onlyfiles = [entry for entry in listdir(mypath) if isfile(join(mypath, entry))]

In [6]:
onlyfiles

['90E275086B4D99A3_calibrated.csv',
 '.DS_Store',
 'INX021_airspeckP_calibrated.csv',
 'D849BF7848210A4A_calibrated.csv',
 'E7C0CD8112BA98D7_calibrated.csv',
 '905801CA0E1F1D11_calibrated.csv']

In [7]:
files = ["90E275086B4D99A3", "E7C0CD8112BA98D7", "905801CA0E1F1D11", "D849BF7848210A4A"]

# Loading Static Files

In [8]:
CALIBRATE = False

# Load every stationary sensor listed in `files`, tag its rows with the sensor
# UUID and stack the results. The frames are collected in a list and
# concatenated once: DataFrame.append copies the whole frame on every call and
# no longer exists in pandas >= 2.0.
static_frames = []
for uuid in files:
    frame = load_mexico_static_airspeck_file(uuid, project_name="inhale", suffix_filename="",
                              upload_type='automatic',
                              calibrate_pm=CALIBRATE, calibrate_ox=CALIBRATE, calibrate_no2=CALIBRATE,
                              use_all_features_for_pm_calibration=False,
                              use_all_features_for_gas_calibration=True,
                              return_calibration_flag=False, calibration_id=None, country_name="London")

    # The loader returns None for missing or empty files; those are skipped.
    if frame is not None:
        frame['UUID'] = uuid
        static_frames.append(frame)

# pd.concat raises on an empty list, so fall back to an empty frame.
london_static_airspeck = pd.concat(static_frames) if static_frames else pd.DataFrame()

Loading file: /Users/azamkhan/speckled/london/static_data/90E275086B4D99A3_calibrated.csv
Loading file: /Users/azamkhan/speckled/london/static_data/E7C0CD8112BA98D7_calibrated.csv
Loading file: /Users/azamkhan/speckled/london/static_data/905801CA0E1F1D11_calibrated.csv
Loading file: /Users/azamkhan/speckled/london/static_data/D849BF7848210A4A_calibrated.csv


In [9]:
london_static_df = london_static_airspeck.filter(items=['timestamp','pm2_5','temperature', 'humidity','gpsLongitude', 'gpsLatitude','UUID'])

In [10]:
london_static_df

Unnamed: 0_level_0,timestamp,pm2_5,temperature,humidity,gpsLongitude,gpsLatitude,UUID
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-08-10 00:02:06+01:00,2020-08-10 00:02:06+01:00,3.831317,27.2,52.3,-0.178568,51.495163,90E275086B4D99A3
2020-08-10 00:08:09+01:00,2020-08-10 00:08:09+01:00,3.533033,27.3,52.0,0.000000,0.000000,90E275086B4D99A3
2020-08-10 00:12:09+01:00,2020-08-10 00:12:09+01:00,3.986756,26.9,53.1,-0.178963,51.495583,90E275086B4D99A3
2020-08-10 00:18:09+01:00,2020-08-10 00:18:09+01:00,3.960015,27.1,52.7,0.000000,0.000000,90E275086B4D99A3
2020-08-10 00:22:09+01:00,2020-08-10 00:22:09+01:00,3.914266,26.8,53.5,-0.178713,51.495480,90E275086B4D99A3
...,...,...,...,...,...,...,...
2020-08-10 23:43:34+01:00,2020-08-10 23:43:34+01:00,16.720176,26.3,55.5,-0.174716,51.501682,D849BF7848210A4A
2020-08-10 23:47:34+01:00,2020-08-10 23:47:34+01:00,14.649888,26.3,55.2,-0.174716,51.501682,D849BF7848210A4A
2020-08-10 23:51:34+01:00,2020-08-10 23:51:34+01:00,12.113481,26.3,55.0,-0.174716,51.501682,D849BF7848210A4A
2020-08-10 23:55:34+01:00,2020-08-10 23:55:34+01:00,13.445276,26.2,55.1,-0.174716,51.501682,D849BF7848210A4A


# Preprocessing static files

In [49]:
def parse_dow_hod(dataset):
    """Return `dataset` with `hour_of_day` and `day_of_week` derived from its `timestamp` column.

    `dataset['timestamp']` must be datetime-like (the `.dt` accessor is used).
    The result holds, in this order, whichever of the columns timestamp, pm2_5,
    hour_of_day, day_of_week, temperature, humidity, gpsLongitude, gpsLatitude
    and UUID exist; every other column is dropped. The input frame is not modified.
    """
    stamps = dataset['timestamp']
    hour_part = stamps.dt.hour.to_frame('hour_of_day')
    weekday_part = stamps.dt.day_name().to_frame('day_of_week')

    widened = pd.concat([hour_part, weekday_part, dataset], axis=1)

    kept_columns = ['timestamp', 'pm2_5', 'hour_of_day', 'day_of_week',
                    'temperature', 'humidity', 'gpsLongitude', 'gpsLatitude', 'UUID']
    return widened.filter(items=kept_columns)

In [50]:
london_s = parse_dow_hod(london_static_df)

In [57]:
london_s = london_s.drop("timestamp", axis=1)

In [58]:
london_s

Unnamed: 0_level_0,pm2_5,hour_of_day,day_of_week,temperature,humidity,gpsLongitude,gpsLatitude,UUID
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-08-10 00:02:06+01:00,3.831317,0,Monday,27.2,52.3,-0.178568,51.495163,90E275086B4D99A3
2020-08-10 00:08:09+01:00,3.533033,0,Monday,27.3,52.0,0.000000,0.000000,90E275086B4D99A3
2020-08-10 00:12:09+01:00,3.986756,0,Monday,26.9,53.1,-0.178963,51.495583,90E275086B4D99A3
2020-08-10 00:18:09+01:00,3.960015,0,Monday,27.1,52.7,0.000000,0.000000,90E275086B4D99A3
2020-08-10 00:22:09+01:00,3.914266,0,Monday,26.8,53.5,-0.178713,51.495480,90E275086B4D99A3
...,...,...,...,...,...,...,...,...
2020-08-10 23:43:34+01:00,16.720176,23,Monday,26.3,55.5,-0.174716,51.501682,D849BF7848210A4A
2020-08-10 23:47:34+01:00,14.649888,23,Monday,26.3,55.2,-0.174716,51.501682,D849BF7848210A4A
2020-08-10 23:51:34+01:00,12.113481,23,Monday,26.3,55.0,-0.174716,51.501682,D849BF7848210A4A
2020-08-10 23:55:34+01:00,13.445276,23,Monday,26.2,55.1,-0.174716,51.501682,D849BF7848210A4A


In [89]:
london_s.to_csv("london_stationary.csv")

In [51]:
# Reload the stationary-sensor table written above.
# NOTE: read_csv is called without parse_dates/index_col, so 'timestamp' comes
# back as an ordinary column of strings rather than a timezone-aware index.
london_s = pd.read_csv("london_stationary.csv")

## Mapping sensor locations

In [41]:
london_sensors = pd.read_csv("london_sensors.csv")

In [88]:

  
# [latitude, longitude] of every stationary sensor, in table order.
static_points = [
    [lat, lon]
    for lat, lon in zip(london_sensors.gpsLatitude, london_sensors.gpsLongitude)
]

# Centre the map on the first sensor and mark each sensor with a grey "home" pin.
map_ = folium.Map(location=static_points[0])
for point in static_points:
    marker_icon = folium.Icon(color='gray', icon='home', prefix='fa')
    folium.Marker(point, icon=marker_icon).add_to(map_)

map_

# Personal Files

In [11]:
def load_mexico_personal_airspeck_file(subject_id, project_name=None, upload_type='automatic', is_minute_averaged=True,
                                subject_visit_number=None, suffix_filename="",
                                calibrate_pm_and_gas=False, use_all_features_for_pm_calibration=False,
                                use_all_features_for_gas_calibration=False, suppress_output=False,
                                set_below_zero_to=np.nan, return_calibration_flag=False, calibration_id=None,
                                filter_pm=True, country_name=None):
    """Load the calibrated CSV of one personal (mobile) Airspeck recording.

    The file is ``<label>[_<suffix_filename>]_calibrated.csv`` (or
    ``..._calibrated_raw.csv`` when `is_minute_averaged` is false) inside the
    directory stored at ``project_mapping[project_name][2]``.

    Parameters
    ----------
    subject_id : str
        Subject id; also the file label (``"<id>(<visit>)"`` when a visit number is given).
    project_name : str, optional
        Key into ``project_mapping``; looked up with ``get_project_for_subject`` when None.
    upload_type : str
        Accepted for signature compatibility; not used by this function.
    is_minute_averaged : bool
        Selects the ``_calibrated.csv`` (True) or ``_calibrated_raw.csv`` (False) file.
    subject_visit_number : optional
        Converted with ``int()`` and appended to the label in parentheses.
    suffix_filename : str, default ""
        Optional suffix placed between the label and ``_calibrated``.
    calibrate_pm_and_gas : bool
        When true the data is passed through ``calibrate_airspeck`` with PM
        calibration switched on and NO2/OX calibration switched off.
    use_all_features_for_pm_calibration, use_all_features_for_gas_calibration,
    calibration_id, country_name
        Forwarded unchanged to ``calibrate_airspeck``.
    suppress_output : bool
        Silences the message about replaced PM values.
    set_below_zero_to : scalar, default NaN
        Replacement for ``pm2_5`` values that are <= 0 (only when `filter_pm`).
    return_calibration_flag : bool
        When true *and* `calibrate_pm_and_gas` is true, return
        ``(result_date, was_calibrated_pm, data)`` instead of just the data.
    filter_pm : bool
        Enables the replacement of non-positive ``pm2_5`` values.

    Returns
    -------
    The loaded DataFrame (None if the loader skipped the file), or the
    three-element tuple described above.
    """
    if subject_visit_number is None:
        label_files = subject_id
    else:
        label_files = "{}({})".format(subject_id, int(subject_visit_number))

    if project_name is None:
        project_name = get_project_for_subject(subject_id)

    # Only insert the suffix when there is one; an empty suffix used to produce
    # "<label>__calibrated.csv" with a doubled underscore.
    stem = "{}_{}".format(label_files, suffix_filename) if suffix_filename else "{}".format(label_files)
    ending = "_calibrated.csv" if is_minute_averaged else "_calibrated_raw.csv"
    filename = stem + ending

    filepath = project_mapping[project_name][2] + filename
    print("Loading file: {}".format(filepath))
    data = load_mexico_airrespeck_file(filepath, project_name)

    if calibrate_pm_and_gas:
        result_date, was_calibrated_pm, was_calibrated_no2, was_calibrated_ox, data = calibrate_airspeck(
            subject_id, data, project_name=project_name, calibrate_pm=True, calibrate_no2=False,
            calibrate_ox=False, calibration_id=calibration_id,
            use_all_features_pm=use_all_features_for_pm_calibration,
            use_all_features_gas=use_all_features_for_gas_calibration, country_name=country_name)

    if filter_pm and data is not None and len(data) > 0:
        below_zero_mask = data['pm2_5'] <= 0
        n_below_zero = np.count_nonzero(below_zero_mask)

        if n_below_zero:
            if not suppress_output:
                print("Setting {} values equal to or below 0 to {}".format(n_below_zero,
                                                                           set_below_zero_to))
            data.loc[below_zero_mask, 'pm2_5'] = set_below_zero_to

        # NOTE(review): an earlier comment here announced a clean-up of humidity
        # readings above 105, but no such code exists; humidity is returned as loaded.

    if calibrate_pm_and_gas and return_calibration_flag:
        return result_date, was_calibrated_pm, data
    return data

In [13]:
# NOTE(review): `personal_files` is not defined anywhere in the visible notebook.
# The log output below shows INX021_airspeckP_calibrated.csv being loaded, so it
# presumably held the subject id "INX021" — define it explicitly before this cell.
suffix_filename = "airspeckP"

# Collect the per-subject frames and concatenate once: DataFrame.append copies
# the whole frame on every call and no longer exists in pandas >= 2.0.
personal_frames = []
for sid in personal_files:
    frame = load_mexico_personal_airspeck_file(sid, project_name="inhale", suffix_filename=suffix_filename,
                              upload_type='automatic', is_minute_averaged=True)

    # The loader returns None for missing or empty files; those are skipped.
    if frame is not None:
        frame['walk'] = sid
        personal_frames.append(frame)

# pd.concat raises on an empty list, so fall back to an empty frame.
ldn_pers_airspeck = pd.concat(personal_frames) if personal_frames else pd.DataFrame()

Loading file: /Users/azamkhan/speckled/london/static_data/INX021_airspeckP_calibrated.csv


In [15]:
ldn_pers_airspeck['timestamp'] = ldn_pers_airspeck.timestamp.dt.round('S', 'NaT')
ldn_pers_airspeck = ldn_pers_airspeck.set_index('timestamp')

In [18]:
# Average the readings into one-minute bins (minutes without data become NaN rows).
# NOTE(review): 'walk' holds the subject-id strings set in the loading loop and
# cannot be averaged; depending on the pandas version .mean() presumably either
# drops it silently or raises — confirm, because later cells still filter on 'walk'.
ldn_pers_airspeck = ldn_pers_airspeck.resample('min').mean()

In [21]:
ldn_pers_df = ldn_pers_airspeck.filter(items=['timestamp','pm2_5','temperature', 'humidity','gpsLongitude', 'gpsLatitude', 'walk'])

In [25]:
ldn_pers_df = ldn_pers_df.reset_index()

In [32]:
def parse_dow_hod(dataset):
    """Attach `hour_of_day` / `day_of_week` (from `dataset['timestamp']`) and keep the analysis columns.

    `dataset['timestamp']` must be datetime-like. The returned frame contains,
    in this order, whichever of timestamp, pm2_5, hour_of_day, day_of_week,
    temperature, humidity, gpsLongitude, gpsLatitude, UUID and walk are
    present; all other columns are dropped. The input is left untouched.

    NOTE: this redefines the function of the same name declared earlier in the
    notebook; the only difference is that 'walk' is also kept.
    """
    when = dataset['timestamp']
    pieces = [
        when.dt.hour.rename('hour_of_day'),
        when.dt.day_name().rename('day_of_week'),
        dataset,
    ]
    wanted = ['timestamp', 'pm2_5', 'hour_of_day', 'day_of_week', 'temperature',
              'humidity', 'gpsLongitude', 'gpsLatitude', 'UUID', 'walk']
    return pd.concat(pieces, axis=1).filter(items=wanted)

In [33]:
ldn_p = parse_dow_hod(ldn_pers_df)

In [34]:
ldn_p = ldn_p.set_index('timestamp')

In [37]:
from pandarallel import pandarallel

In [38]:
pandarallel.initialize()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [39]:
def get_sensor_uuids(row, k_closest, distorid, sensors=None):
    """Return the id of, or the distance to, the k-th nearest stationary sensor.

    Parameters
    ----------
    row : mapping
        Must provide 'gpsLongitude' and 'gpsLatitude' in decimal degrees.
    k_closest : int
        Zero-based rank: 0 is the nearest sensor, 1 the second nearest, ...
    distorid : {'id', 'dist'}
        'id' returns the sensor's index label, 'dist' the great-circle distance in km.
    sensors : pandas.DataFrame, optional
        Sensor table with 'gpsLongitude' and 'gpsLatitude' columns, indexed by
        sensor id. Defaults to the notebook-level ``london_sensors``.

    Returns
    -------
    The index label of the selected sensor, or the distance in kilometres.
    When the row has no usable position (NaN/None coordinate) the result is
    ``None`` for 'id' and NaN for 'dist', because no ranking is possible.

    Raises
    ------
    ValueError
        If `distorid` is neither 'id' nor 'dist'.
    IndexError
        If `k_closest` is not smaller than the number of sensors.
    """
    if distorid not in ('id', 'dist'):
        raise ValueError("distorid has to be either 'id' or 'dist', got {!r}".format(distorid))

    if sensors is None:
        sensors = london_sensors

    def haversine_km(lon1, lat1, lon2, lat2):
        """Great-circle distance in km between two points given in decimal degrees."""
        lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        earth_radius_km = 6371  # use 3956 for miles
        return 2 * asin(sqrt(a)) * earth_radius_km

    lon1 = row['gpsLongitude']
    lat1 = row['gpsLatitude']

    # NaN distances would make the sort order (and thus the "nearest" sensor) arbitrary.
    if pd.isna(lon1) or pd.isna(lat1):
        return None if distorid == 'id' else float('nan')

    # Coordinates are read by column name, so the result does not depend on the
    # column order of the sensor table.
    ranked = sorted(
        (haversine_km(lon1, lat1, lon2, lat2), sensor_id)
        for sensor_id, lon2, lat2 in zip(sensors.index, sensors['gpsLongitude'], sensors['gpsLatitude'])
    )
    dist, uuid = ranked[k_closest]
    return uuid if distorid == 'id' else dist

In [42]:
# Drop the leftover CSV index column and index the sensor table by UUID.
# NOTE: this cell is not re-runnable — on a second run 'Unnamed: 0' and the
# 'UUID' column no longer exist, so drop()/set_index() raise KeyError.
london_sensors = london_sensors.drop("Unnamed: 0", axis = 1)
london_sensors = london_sensors.set_index("UUID")

In [43]:
inter = ldn_p.copy()

In [44]:
# For each walk sample, record the id and distance of the four nearest sensors.
# Rank 0 (the nearest) uses unprefixed column names; ranks 1-3 get the prefixes
# "2_", "3_" and "4_".
for sensor_rank, column_prefix in enumerate(['', '2_', '3_', '4_']):
    inter[column_prefix + 'closest_UUID'] = inter.parallel_apply(
        lambda row, k=sensor_rank: get_sensor_uuids(row, k, 'id'), axis=1)
    inter[column_prefix + 'dist_closest_s'] = inter.parallel_apply(
        lambda row, k=sensor_rank: get_sensor_uuids(row, k, 'dist'), axis=1)

In [45]:
inter = inter.reset_index()

In [46]:
def pmt(row, half_window_seconds=120, tz='Europe/London'):
    """Return the local clock times `half_window_seconds` before and after `row['timestamp']`.

    Parameters
    ----------
    row : mapping
        Must provide a 'timestamp' entry with a ``.timestamp()`` method
        (e.g. a pandas Timestamp).
    half_window_seconds : int or float, default 120
        Half-width of the window in seconds.
    tz : str, default 'Europe/London'
        Timezone in which the clock times are expressed.

    Returns
    -------
    (str, str)
        Start and end of the window as time-of-day strings such as '11:58:00',
        without date and without UTC offset.
    """
    centre = row['timestamp'].timestamp()
    window_start = pd.Timestamp(centre - half_window_seconds, unit='s', tz=tz)
    window_end = pd.Timestamp(centre + half_window_seconds, unit='s', tz=tz)

    # Take the time of day from the timestamp itself instead of cutting it out of
    # str(timestamp): splitting on '-' only removed negative UTC offsets, so
    # London's '+00:00' / '+01:00' stayed glued to the result.
    return str(window_start.time()), str(window_end.time())

In [63]:
def closest_pm_value(row, dataframe, window_seconds=80):
    """Look up the stationary PM2.5 / humidity reading that matches one walk sample.

    The four candidate sensors stored on `row` are tried from nearest to
    farthest; the first one with a reading within `window_seconds` of
    ``row['timestamp']`` wins.

    Parameters
    ----------
    row : mapping
        Needs 'timestamp' plus the id/distance pairs 'closest_UUID' /
        'dist_closest_s' and their '2_', '3_' and '4_' prefixed variants.
    dataframe : pandas.DataFrame
        Stationary readings indexed by a DatetimeIndex that is comparable with
        ``row['timestamp']``, with 'UUID', 'pm2_5' and 'humidity' columns.
    window_seconds : int or float, default 80
        Half-width of the accepted time window around the sample.

    Returns
    -------
    pandas.Series
        ``[pm2_5, humidity, distance, sensor_id]`` of the first matching
        reading, or ``[0, 0, NaN, 'no sensor']`` when no candidate has one.

    Raises
    ------
    TypeError
        If `dataframe` is not indexed by a DatetimeIndex; a time-window lookup
        is impossible then, and failing loudly beats returning 'no sensor' for
        every row.
    """
    if not isinstance(dataframe.index, pd.DatetimeIndex):
        raise TypeError("closest_pm_value needs `dataframe` to be indexed by a DatetimeIndex")

    ts = row['timestamp']
    half_window = pd.Timedelta(seconds=window_seconds)
    # Compare full timestamps (not date + time of day) so that windows which
    # straddle midnight are handled correctly.
    in_window = (dataframe.index >= ts - half_window) & (dataframe.index <= ts + half_window)

    candidates = [
        (row['closest_UUID'], row['dist_closest_s']),
        (row['2_closest_UUID'], row['2_dist_closest_s']),
        (row['3_closest_UUID'], row['3_dist_closest_s']),
        (row['4_closest_UUID'], row['4_dist_closest_s']),
    ]

    for uuid, dist in candidates:
        from_sensor = (dataframe['UUID'] == uuid).to_numpy()
        readings = dataframe.loc[in_window & from_sensor, ['pm2_5', 'humidity']]
        if len(readings) > 0:
            # Read the values of the first matching row. Unpacking the frame
            # itself would yield its column *names*, not its data.
            first = readings.iloc[0]
            return pd.Series([first['pm2_5'], first['humidity'], dist, uuid])

    # Same length as the success case so a four-column assignment lines up;
    # closest_pm == 0 is the "nothing found" marker.
    return pd.Series([0, 0, np.nan, 'no sensor'])

In [82]:
# london_s was read back from london_stationary.csv, so its 'timestamp' column
# holds plain strings. Parse them into timezone-aware datetimes before indexing;
# the time-window lookups further down need a real DatetimeIndex.
london_s['timestamp'] = pd.to_datetime(london_s['timestamp'], utc=True).dt.tz_convert('Europe/London')
london_s = london_s.set_index("timestamp")

In [83]:
dtp_inter = inter.copy()

In [84]:
dtp_inter = dtp_inter.reset_index()

In [85]:
dtp_inter = dtp_inter.drop("index", axis=1)

In [86]:
dtp_inter['timestamp'] = dtp_inter['timestamp'].apply(lambda x: pd.to_datetime(x).tz_convert('Europe/London'))

In [87]:
# Attach, per walk sample, the values returned by closest_pm_value for the stationary sensors.
# NOTE(review): the target column 'humidity' already exists in the walk data (it is
# kept by parse_dow_hod), so this assignment overwrites the walk's own humidity
# with the value returned by closest_pm_value — confirm this is intended.
dtp_inter[['closest_pm','humidity','dist_to_closest_pm', 'closest_pm_id']] = dtp_inter.apply(lambda row: closest_pm_value(row, london_s), axis=1)

In [89]:
# closest_pm_value returns 0 for closest_pm when none of the candidate sensors
# yields a value; drop those rows. Any row whose closest_pm is exactly 0 for
# another reason is removed as well.
dtp_inter = dtp_inter[~dtp_inter.closest_pm.eq(0)]

In [90]:
def clean_df(df):
    """Return only the final analysis columns of `df`, in a fixed order.

    Columns that are missing from `df` are skipped without error; the input
    frame itself is not modified.
    """
    final_columns = (
        'timestamp', 'walk', 'pm2_5', 'hour_of_day', 'day_of_week',
        'temperature', 'humidity', 'gpsLongitude', 'gpsLatitude',
        'closest_pm', 'dist_to_closest_pm', 'closest_pm_id',
    )
    return df.filter(items=list(final_columns))

In [91]:
ldn = clean_df(dtp_inter)

In [278]:
ldn = ldn.set_index("timestamp")

In [318]:
# Persist the cleaned walk table. It is written straight from `ldn`: `ldn2` is
# only created (as a plain copy of `ldn`) in the following cell, so referencing
# it here raises NameError on a fresh top-to-bottom run.
ldn.to_csv("london_walk.csv")

In [314]:
ldn2 = ldn.copy()

In [116]:
import branca
import branca.colormap as cm

In [120]:
def walk_mapper(df, static, pm_min=0, pm_max=30, radius=35):
    """Draw a walk, coloured by PM2.5, together with the stationary sensors on a folium map.

    Parameters
    ----------
    df : pandas.DataFrame
        Walk samples with 'gpsLatitude', 'gpsLongitude' and 'pm2_5' columns.
        Samples missing any of the three are not drawn.
    static : pandas.DataFrame
        Stationary sensors with 'gpsLatitude' and 'gpsLongitude' columns; the
        map is centred on the first one.
    pm_min, pm_max : number, default 0 and 30
        PM2.5 range covered by the yellow-to-red colour scale.
    radius : number, default 35
        Radius passed to ``folium.Circle`` for every walk sample.

    Returns
    -------
    folium.Map
        Map with one circle per walk sample, the colour legend and a red
        "home" marker per stationary sensor.

    Raises
    ------
    ValueError
        If `static` contains no sensors (there is nothing to centre the map on).
    """
    static_points = [[lat, lon] for lat, lon in zip(static.gpsLatitude, static.gpsLongitude)]
    if not static_points:
        raise ValueError("walk_mapper needs at least one stationary sensor to centre the map")

    map_ = folium.Map(location=static_points[0], zoom_start=10.5)
    colormap = branca.colormap.step.YlOrRd_09.scale(pm_min, pm_max)

    # folium rejects NaN coordinates, and a NaN reading has no meaningful colour.
    drawable = df.dropna(subset=['gpsLatitude', 'gpsLongitude', 'pm2_5'])
    for lat, lon, pm in zip(drawable.gpsLatitude, drawable.gpsLongitude, drawable.pm2_5):
        folium.Circle(
            location=(lat, lon),
            radius=radius,
            fill=True,
            color=colormap(pm),
            fill_opacity=1
        ).add_to(map_)

    map_.add_child(colormap)

    for point in static_points:
        folium.Marker(point, icon=folium.Icon(color='red', icon='home', prefix='fa')).add_to(map_)

    return map_

In [425]:
walk_mapper(ldn, london_sensors)

# Outlier removal

In [None]:
from scipy import stats

def drop_numerical_outliers(df, z_thresh=1.5):
    """Remove, in place, every row that has an outlier in any numeric column.

    A value is an outlier when the absolute value of its z-score (population
    standard deviation, computed per column while ignoring NaN) is greater than
    or equal to `z_thresh`.

    Missing values and constant columns have no defined z-score and are never
    treated as outliers. Previously a single NaN, or one constant column, made
    every z-score of that column NaN and so removed *all* rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place.
    z_thresh : float, default 1.5
        Rejection threshold on the absolute z-score.

    Returns
    -------
    None

    Notes
    -----
    Rows are removed by index label, so with duplicate labels every row that
    shares a label with an outlier row is removed too.
    """
    numeric = df.select_dtypes(include=[np.number])
    spread = numeric.std(ddof=0)
    z_scores = (numeric - numeric.mean()) / spread

    # Comparisons with NaN are False, so undefined z-scores never flag a row.
    has_outlier = (z_scores.abs() >= z_thresh).any(axis=1)

    # Drop (inplace) the rows that were flagged.
    df.drop(df.index[has_outlier], inplace=True)

In [None]:
# Remove outlier rows from `peeps` in place.
# NOTE(review): `peeps` is not defined anywhere in the visible notebook —
# presumably it comes from one of the star imports or an earlier session; confirm.
drop_numerical_outliers(peeps)

In [None]:
drop_numerical_outliers(leon)

In [None]:
drop_numerical_outliers(guadalajara)