## Import Dependencies

In [1]:
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

import sys
sys.path.insert(1, '../src/data')

import matplotlib
import matplotlib.pyplot as plt

from sklearn.metrics import plot_confusion_matrix 
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.model_selection import train_test_split 
from sklearn.inspection import permutation_importance, plot_partial_dependence

from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from imblearn.pipeline import make_pipeline

from xml2dict import *
from dict2tabular import *

In [3]:
def xml2soup(xml_path: str):
    """ Loads xml into BeautifulSoup object.

    Params:
        xml_path (string) - Path to xml to be loaded

    Returns:
        soup (BeautifulSoup object)
    """
    xml_content = requests.get(xml_path)
    soup = BeautifulSoup(xml_content.content, 'lxml')
    return soup

def xml_extract_metadata(xml_soup):
    """ Extracts metadata for identification (station, time and location) from top of xml.

    Params:
        output_dict (dict) - Dictionary to add information to
        xml_soup (BeautifulSoup object) - beautifulSoup object containing the loaded xml information.

    Returns:
        output_dict (dict) - Updated dictionary with added metadata information
    """
    output_dict = dict()
    identification_elements = xml_soup.find("identification-elements")
    id_element_list = ['date_time', 'tc_identifier', 'station_name', 'station_elevation',
                       'latitude', 'longitude', 'version', 'correction', 'source_uri']
    for id_element in id_element_list:
        try:
            output_dict[id_element] = identification_elements.findChild(name='element',
                                                                        attrs={'name': id_element}
                                                                        ).get("value")
        except AttributeError as error:
            print(error, ". Possibly element '%s' is missing." % id_element)

    # Station identifier are missing in some xml files
    try:
        output_dict['station_identifier'] = identification_elements.findChild(name='element',
                                                                              attrs={'name': 'station_identifier'}
                                                                              ).get("value")
    except AttributeError as error:
        print('error sigma')

    observation_elements = xml_soup.find('om:result').find('elements').findAll('element')
    for each_elem in observation_elements:
        
        try:
            if each_elem['element-index']:
                
                dict_key = each_elem['name']+'_'+each_elem['orig-name']
                output_dict[dict_key+'_value'] = each_elem['value']
                #print(each_elem['orig-name'])
                #print(each_elem['value'])
                
                qc_soup = each_elem.find('quality-controlled')
                qc_summary_dict_key = dict_key+'_'+qc_soup.find('element')['name']
                output_dict[qc_summary_dict_key] = qc_soup.find('element')['value']
                
                # qc native tag
                qc_native = qc_soup.find('native').findAll('qualifier')
                for each_native in qc_native:
                    try:
                        output_dict[dict_key+'_'+each_native['name']] = each_native['value']
                    except:
                        print('error beta')
                        continue
                
                #print(output_dict)
                #print(qc_soup.find('real-time'))
                qc_element_list = qc_soup.find('real-time').find('element').findAll('element', recursive=False)
                #print(qc_element_list)
                for each_qc in qc_element_list:
                    try:
                        qc_dict_key = dict_key+'_qa-'+each_qc['name']
                        output_dict[qc_dict_key] = each_qc['value']
                    
                        qc_detail = each_qc.findAll('element')

                        for qc_det_item in qc_detail:
                            try:
                                qc_det_name = dict_key+'_qc-'+qc_det_item['name']+'_'+qc_det_item['value'].split('/')[6]
                                output_dict[qc_det_name] = qc_det_item.find('qualifier', {'name' : 'flag_value'})['value']
                            except:
                                print("error gamma")
                                continue
                    except:
                        print('error gamma')
                        continue
        except:

            continue
       # print(output_dict)

    #finally:
    return output_dict

## Import Live Data

In [4]:
# Run iff you want to look through multiple days on DMS
items = []
for i in range(1, 6):

    content_url = 'http://dms.cmc.ec.gc.ca:8180/notification?path=/msc/observation/atmospheric/surface_weather/ca-1.1-ascii/decoded_qa_enhanced-xml-2.0&time='+str(i)+'d'
    print(content_url)
    content_data = requests.get(content_url)
    html = xml2soup(content_url)
    item = html.findAll('item')
    print(item)
    items.extend(item)

http://dms.cmc.ec.gc.ca:8180/notification?path=/msc/observation/atmospheric/surface_weather/ca-1.1-ascii/decoded_qa_enhanced-xml-2.0&time=1d


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [20]:
#station_id = ['zrp', 'mfj', 'nbi', 'ngh', 'nsg', 'wfz', 'wic', 'wyh', 'xcm', 'zhk', 'zrp', 'eqi']

station_id = ['nco', 'nek', 'zrp', 'mfj', 'zpk', 'nbi', 'eqi', 'ngh', 'nsg', 'wfz', 'wyh', 'wkx', 'gah', 'ads', 'wdw', 'wzv', 'wic', 'ndt', 'wsy', 'won', 'way', 'xcm', 'xux', 'xet', 'zlt', 'wij', 'wjc', 'nzs', 'wnv', 'wst', 'zcr', 'zel', 'xbl', 'mfm', 'acq', 'pjm', 'pif', 'who', 'wpz', 'xgd', 'zhk', 'pqw', 'wdv', 'wfp', 'wnz', 'xdi', 'wvt', 'pyq', 'apr', 'xqh', 'xfb', 'wpk', 'nvq', 'wct', 'wzg', 'xmm', 'zcy', 'web', 'wgd', 'xeg', 'vqz', 'xse', 'vxy', 'pqd', 'zvm', 'pzh', 'mjk', 'xox', 'zsm', 'aqy', 'ncd', 'zdb','xrb', 'xqb', 'zhy', 'abf', 'wgr',  'xnp', 'wdc', 'xwf', 'wsk', 'ahr', 'xar', 'zfs', 'wvc', 'xoa', 'ple', 'zsp', 'wtd', 'wrk', 'wsn',  'zhb', 'xmw', 'mrf', 'xto', 'ppr', 'nbb', 'xat', 'erm', 'xhi', 'zka', 'nco', 'adl', 'zoc', 'wyj', 'asb', 'apb', 'xzv', 'xha', 'xzc', 'wwn', 'wcf', 'xka', 'xrg', 'tze', 'wfz', 'zev', 'pgf', 'xtl', 'ybg', 'yyr', 'yod', 'yoj', 'ygq', 'yzt', 'ygl', 'yth', 'yvp', 'yxy', 'otl', 'wch', 'wxp', 'wzt',         ]

#station_id = ['nco', 'nek']

df = pd.DataFrame()
url_list = list()

for each in items:

    # get url of each observation and ignore supporting xmls (supp_1440)
    if each.find('title').contents[0].split('/')[2] in station_id and 'supp_1440' not in each.find('title').contents[0]:
        url_list.append(each.contents[2])
        #parse url and create a table with each row as observation, and element value as columns
        soupu = xml2soup(each.contents[2])
        extracted = xml_extract_metadata(soupu)
        df = df.append(extracted, ignore_index=True)

In [21]:
# drop if any columns contains null
newDf = df.dropna(how='any', axis=1)
newDf.columns

Index(['date_time', 'tc_identifier', 'station_name', 'station_elevation',
       'latitude', 'longitude', 'version', 'correction', 'source_uri',
       'station_identifier',
       ...
       'cumulative_precipitation_gauge_weight_unfiltered_3020_value',
       'cumulative_precipitation_gauge_weight_unfiltered_3020_overall_qa_summary',
       'cumulative_precipitation_gauge_weight_unfiltered_3020_error',
       'cumulative_precipitation_gauge_weight_unfiltered_3020_suspect',
       'cumulative_precipitation_gauge_weight_unfiltered_3020_suppressed',
       'cumulative_precipitation_gauge_weight_unfiltered_3021_value',
       'cumulative_precipitation_gauge_weight_unfiltered_3021_overall_qa_summary',
       'cumulative_precipitation_gauge_weight_unfiltered_3021_error',
       'cumulative_precipitation_gauge_weight_unfiltered_3021_suspect',
       'cumulative_precipitation_gauge_weight_unfiltered_3021_suppressed'],
      dtype='object', length=298)

## Import Trained Snow Data

In [22]:
X = read_pickle(file='../anomalydetection/data/processed1/df_snow_depth_3022_2019.pickle')
X_2019 = read_pickle(file='../anomalydetection/data/processed1/df_snow_depth_3022_2020.pickle')
X_2020 = read_pickle(file='../anomalydetection/data/processed1/df_snow_depth_3022_2021_jan_jun.pickle')

Xsnow = pd.concat([X, X_2019], ignore_index=True, sort=False)
Xsnow = pd.concat([X, X_2020], ignore_index=True, sort=False)

In [23]:
Xsnow = Xsnow.dropna(how='any', axis=1)

## Import Trained Wind Data

In [49]:
Xwind = read_pickle(r'C:\Users\filipovicha\Documents\AI_Project\moov-ai-automatic-qc\data\processed_before_nf\wind_speed_3005.pickle')

In [50]:
Xwind = Xwind.dropna(how='any', axis=1)

## Import Trained Precip Data

In [34]:
Xprecip = read_pickle(r'C:\Users\filipovicha\Documents\AI_Project\moov-ai-automatic-qc\data\processed_before_nf\precipitation_amount_285.pickle')

In [35]:
Xprecip = Xprecip.dropna(how='any', axis=1)

## Detect Snow Depth Anomalies

In [24]:
common_snow_elems = list(np.intersect1d(Xsnow.columns, newDf.columns))
test_real_time = newDf[common_snow_elems] 
common_snow_elems.append('snow_depth_3022_target')
train_real_time = Xsnow[common_snow_elems]
train_y = train_real_time['snow_depth_3022_target']

In [25]:
# keeping the the date and time for anomaly detection
train_real_time['hour_of_day'] = train_real_time.date_time.dt.hour
train_real_time['yearz'] = train_real_time.date_time.dt.year
train_real_time['monthz'] = train_real_time.date_time.dt.month
train_real_time['dayz'] = train_real_time.date_time.dt.day

test_real_time['hour_of_day'] = train_real_time.date_time.dt.hour
test_real_time['yearz'] = train_real_time.date_time.dt.year
test_real_time['monthz'] = train_real_time.date_time.dt.month
test_real_time['dayz'] = train_real_time.date_time.dt.day

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_real_time['hour_of_day'] = train_real_time.date_time.dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_real_time['yearz'] = train_real_time.date_time.dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_real_time['monthz'] = train_real_time.date_time.dt.month
A value is try

In [26]:
del_cols = ['origin_filename', 'station_time_identifier', 'tc_identifier', 'date_time',
            'station_name', 'version', 'correction', 'source_uri', 'station_identifier', '_merge']


train_real_time.drop(del_cols , axis = 1, inplace=True, errors='ignore') 
test_real_time.drop(del_cols , axis = 1, inplace=True, errors='ignore') 

# removing target variable form training set
train_real_time.drop('snow_depth_3022_target', axis = 1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [27]:
# The Actual real-Time Train and Test
rfc_estimator2 = rfc(n_estimators=1000, 
                    min_samples_leaf=2, 
                    n_jobs=7)
sampler = ADASYN()
clf = make_pipeline(sampler, rfc_estimator2)
    
rezult = clf.fit(train_real_time, train_y).predict(test_real_time)

In [29]:
# merging the prediction/classification to the observations
classification = list(rezult)
test_real_time['classes'] = classification
test_real_time['url'] = url_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_real_time['classes'] = classification
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_real_time['url'] = url_list


In [30]:
overturn = test_real_time.loc[test_real_time['classes'] == 1]
snow_list = overturn['url']

## Detect Wind Speed Anomalies

In [56]:



common_wind_elems = list(np.intersect1d(Xwind.columns, newDf.columns))
test_real_time = newDf[common_wind_elems] 
common_wind_elems.append('wind_speed_3005_target')
train_real_time = Xwind[common_wind_elems]
train_y = train_real_time['wind_speed_3005_target']

In [57]:
test_real_time = test_real_time.drop([col for col in test_real_time.columns if test_real_time[col].eq('MSNG').any()], axis=1)
common_elems = list(np.intersect1d(test_real_time.columns, train_real_time.columns))
common_elems.append('wind_speed_3005_target')
train_real_time = train_real_time[common_elems]

In [58]:
del_cols = ['origin_filename', 'station_time_identifier', 'tc_identifier', 'date_time',
            'station_name', 'version', 'correction', 'source_uri', 'station_identifier', '_merge']


train_real_time.drop(del_cols , axis = 1, inplace=True, errors='ignore') 
test_real_time.drop(del_cols , axis = 1, inplace=True, errors='ignore') 

# removing target variable form training set
train_real_time.drop('wind_speed_3005_target', axis = 1, inplace=True)

In [59]:
# The Actual real-Time Train and Test
rfc_estimator2 = rfc(n_estimators=1000, 
                    min_samples_leaf=2, 
                    n_jobs=7)
sampler = ADASYN()
clf = make_pipeline(sampler, rfc_estimator2)
    
rezult = clf.fit(train_real_time, train_y).predict(test_real_time)

In [60]:
# merging the prediction/classification to the observations
classification = list(rezult)
test_real_time['classes'] = classification
test_real_time['url'] = url_list

In [61]:
overturn = test_real_time.loc[test_real_time['classes'] == 1]
wind_list = overturn['url']

## Detect Precip Anomalies

In [42]:
common_precip_elems = list(np.intersect1d(Xprecip.columns, newDf.columns))
test_real_time = newDf[common_precip_elems] 
common_precip_elems.append('precipitation_amount_285_target')
train_real_time = Xprecip[common_precip_elems]
train_y = train_real_time['precipitation_amount_285_target']

In [43]:
del_cols = ['origin_filename', 'station_time_identifier', 'tc_identifier', 'date_time',
            'station_name', 'version', 'correction', 'source_uri', 'station_identifier', '_merge']


train_real_time.drop(del_cols , axis = 1, inplace=True, errors='ignore') 
test_real_time.drop(del_cols , axis = 1, inplace=True, errors='ignore') 

# removing target variable form training set
train_real_time.drop('precipitation_amount_285_target', axis = 1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [44]:
# The Actual real-Time Train and Test
rfc_estimator2 = rfc(n_estimators=1000, 
                    min_samples_leaf=2, 
                    n_jobs=7)
sampler = ADASYN()
clf = make_pipeline(sampler, rfc_estimator2)
    
rezult = clf.fit(train_real_time, train_y).predict(test_real_time)

In [45]:
# merging the prediction/classification to the observations
classification = list(rezult)
test_real_time['classes'] = classification
test_real_time['url'] = url_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_real_time['classes'] = classification
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_real_time['url'] = url_list


In [46]:
overturn = test_real_time.loc[test_real_time['classes'] == 1]
precip_list = overturn['url']

## Present data from list output

In [31]:
def extract_info(source: str):
    newstring = source.rsplit('/')
    date = newstring[10] 
    stat = newstring[12]
    return [date, stat]
 
def make_dict(sources: list):
    dicto = {}
    for source in sources:
        stat = extract_info(source)[1]
        date = extract_info(source)[0]
        
        #check if stat is in sources:
        if stat not in dicto.keys():
            dicto[stat] = {'count': 1, 'earliest' : date, 'latest' : date}
            
        else:
            if date < dicto[stat]['earliest']:
                dicto[stat]['count'] = dicto[stat]['count'] + 1
                dicto[stat]['earliest'] = date
                
            elif date > dicto[stat]['latest']:
                dicto[stat]['count'] = dicto[stat]['count'] + 1
                dicto[stat]['latest'] = date
                
            else:
                dicto[stat]['count'] = dicto[stat]['count'] + 1
    
    return dicto 

In [62]:
snow_depth_anomalies = make_dict(snow_list)
wind_speed_anomalies = make_dict(wind_list)
precip_anomalies = make_dict(precip_list)

In [63]:
print('Snow Depth Anomalies')
print(snow_depth_anomalies)

print('Wind Speed Anomalies')
print(wind_speed_anomalies)

print('Precip Anomalies')
print(precip_anomalies)

Snow Depth Anomalies
{'zsm': {'count': 16, 'earliest': '202111122100', 'latest': '202111171600'}, 'xmw': {'count': 14, 'earliest': '202111130000', 'latest': '202111171600'}, 'xmm': {'count': 14, 'earliest': '202111130100', 'latest': '202111170600'}, 'wdc': {'count': 15, 'earliest': '202111122300', 'latest': '202111171600'}, 'wvc': {'count': 6, 'earliest': '202111141100', 'latest': '202111170500'}, 'pqw': {'count': 6, 'earliest': '202111122000', 'latest': '202111170700'}, 'zfs': {'count': 13, 'earliest': '202111122000', 'latest': '202111171300'}, 'wct': {'count': 16, 'earliest': '202111130500', 'latest': '202111171300'}, 'nbi': {'count': 2, 'earliest': '202111140900', 'latest': '202111152000'}, 'wjc': {'count': 2, 'earliest': '202111142300', 'latest': '202111160300'}, 'nco': {'count': 1, 'earliest': '202111161800', 'latest': '202111161800'}, 'xeg': {'count': 1, 'earliest': '202111142000', 'latest': '202111142000'}, 'nek': {'count': 2, 'earliest': '202111122300', 'latest': '202111150200'