In [9]:
import pandas as pd
from bs4 import BeautifulSoup
import bs4
from dateutil.parser import parse as parse_date
import numpy as np
from datetime import datetime
import re
import json

In [114]:
# Scratch check: parse the timestamp that follows "on" in an incident title,
# then confirm that replace() yields a copy with the year overridden
# (the bare last expression is what the cell displays).
pdate = parse_date("Incident on 2019-12-19 22:49 UTC".split("on")[1])
pdate.replace(year=2018)

datetime.datetime(2018, 12, 19, 22, 49, tzinfo=tzutc())

In [156]:
# Parser for GitHub status pages up to September 2, 2020
# https://github.blog/2020-09-02-github-availability-report-august-2020/

def get_incident_year(incident_title):
    """Return the year under which an incident is listed on the status page.

    Walks backwards through the siblings that precede the incident title's
    parent and returns the text of the first element found with
    ``data-var="year"``, converted to ``int``.

    Args:
        incident_title: bs4 Tag of the incident title.

    Returns:
        The year as an int.

    Raises:
        ValueError: if no preceding sibling contains a ``data-var="year"``
            element, or if the element's text is not an integer.
    """
    current_node = incident_title.parent.previous_sibling
    while current_node is not None:
        # Siblings also include text nodes (NavigableString); only real tags
        # can be searched for the year marker.
        if isinstance(current_node, bs4.element.Tag):
            year_node = current_node.find(attrs={'data-var': 'year'})
            if year_node is not None:
                return int(year_node.get_text())
        current_node = current_node.previous_sibling

    # Ran out of siblings: fail with a message that names the incident rather
    # than an AttributeError on None.
    raise ValueError('No year marker found before incident: ' + str(incident_title))

def get_incident_type(incident_title):
    """Map the impact CSS class of an incident title to a severity label.

    Args:
        incident_title: element whose ``attrs['class']`` holds its CSS classes.

    Returns:
        One of 'minor', 'major', 'critical' or 'maintenance'.
        'impact-none' is reported as 'minor'.

    Raises:
        Exception: if none of the known impact classes is present.
    """
    css_classes = incident_title.attrs['class']

    # Ordered (css class, label) pairs; the first class found wins.
    impact_labels = (
        ('impact-minor', 'minor'),
        ('impact-none', 'minor'),
        ('impact-major', 'major'),
        ('impact-critical', 'critical'),
        ('impact-maintenance', 'maintenance'),
    )
    for impact_class, label in impact_labels:
        if impact_class in css_classes:
            return label

    raise Exception('Unknown impact class: ' + str(incident_title))
        
def get_maintenance_incident_details(incident_update):
    """Placeholder for parsing maintenance incidents.

    Args:
        incident_update: updates container of the maintenance incident (unused).

    Raises:
        NotImplementedError: always. It is a subclass of Exception, so
            callers that catch Exception keep working.
    """
    raise NotImplementedError('Unimplemented. Implement when we encounter one')
    
def _parse_update_timestamp(update_div, year):
    """Return the POSIX timestamp shown in an update's first <small> element, pinned to `year`."""
    time_text = update_div.select('small')[0].get_text().strip()
    # Seed the parser with the target year so that a date valid only in that
    # year (Feb 29) parses regardless of the year the notebook is run in.
    # The trailing replace() keeps `year` authoritative even when the text
    # carries its own year.
    parsed = parse_date(time_text, default=datetime(year, 1, 1))
    return parsed.replace(year=year).timestamp()


def get_normal_incident_details(incident_updates, year):
    """Extract start and end times of a non-maintenance incident.

    Args:
        incident_updates: bs4 element holding the incident's update entries.
        year: year to apply to the parsed timestamps.

    Returns:
        dict with 'event_start_time' and 'event_end_time' as np.int64 POSIX
        timestamps. When there is not exactly one '.investigating' entry the
        start time is -1 and a 'status' key carries the full text of the
        updates so the caller can inspect it.

    Raises:
        Re-raises any parsing error (for example IndexError when there is no
        '.resolved' entry) after printing the offending element.
    """
    try:
        investigating_divs = incident_updates.select('.investigating')
        has_single_start = len(investigating_divs) == 1

        if has_single_start:
            incident_start_time = _parse_update_timestamp(investigating_divs[0], year)
        else:
            # Ambiguous or missing start: flag it instead of guessing.
            incident_start_time = -1

        incident_end_time = _parse_update_timestamp(
            incident_updates.select('.resolved')[0], year)

        ret_data = {
            'event_start_time': np.int64(incident_start_time),
            'event_end_time': np.int64(incident_end_time)
        }

        if not has_single_start:
            ret_data['status'] = incident_updates.get_text()

        return ret_data

    except Exception:
        # Show the element that failed to parse, then propagate the error.
        print(incident_updates)
        raise

def process_github_page(file_obj, metadata):
    """Parse a saved GitHub status page into a list of incident records.

    Args:
        file_obj: readable file-like object with the page HTML.
        metadata: mapping describing the page; only 'date' is read, and only
            to label the mismatch error. May be empty.

    Returns:
        list of dicts with 'event_start_time', 'event_end_time' and 'status'.
        'status' is the incident type unless the detail parser already set it.

    Raises:
        Exception: if the number of incident titles differs from the number
            of '.updates-container' elements.
    """
    status_data = BeautifulSoup(file_obj.read(), 'lxml')

    incident_types = ['impact-minor', 'impact-none', 'impact-major', 'impact-critical', 'impact-maintenance']

    def is_incident_title_class(css_class):
        # Matches class values that mention 'incident-title' and one impact class.
        return (css_class
                and 'incident-title' in css_class
                and any(incident_type in css_class for incident_type in incident_types))

    incidents_titles = status_data.find_all(attrs={'class': is_incident_title_class})
    incidents_updates = status_data.select('.updates-container')
    if len(incidents_titles) != len(incidents_updates):
        # metadata may lack 'date' (the notebook passes {}); do not let a
        # KeyError hide the real problem.
        page_date = str((metadata or {}).get('date', 'unknown date'))
        raise Exception('Number of titles does not match number of updates containers: ' + page_date)

    extracted_incident_info = []
    for incident_title, incident_updates in zip(incidents_titles, incidents_updates):
        incident_type = get_incident_type(incident_title)
        incident_year = get_incident_year(incident_title)

        if incident_type == 'maintenance':
            incident_details = get_maintenance_incident_details(incident_updates)
        else:
            incident_details = get_normal_incident_details(incident_updates, incident_year)

        # Using incident field for error if required
        if 'status' not in incident_details:
            incident_details['status'] = incident_type

        extracted_incident_info.append(incident_details)

    return extracted_incident_info

# Smoke test: run the parser on a locally saved status page and display the
# extracted incidents (the bare last expression is the cell output).
status_data = None
with open('github-global-status.html') as f:
    # Empty metadata: process_github_page only reads metadata['date'] when it
    # builds its title/updates mismatch error.
    status_data = process_github_page(f, {})
status_data

[{'event_start_time': 1576795740,
  'event_end_time': 1576797120,
  'status': 'minor'}]

In [123]:
# Repeats the parser smoke test on the saved status page and displays the result.
# NOTE(review): this cell has the same statements as the end of the In [156] cell.
status_data = None
with open('github-global-status.html') as f:
    status_data = process_github_page(f, {})
status_data

[{'event_start_time': 1576795740,
  'event_end_time': 1576797120,
  'status': 'minor'}]

In [125]:
# Scratch check: split an archive path on both '.' and '/' to see its
# components (presumably year / month / date stamp / extension - confirm).
import re
re.compile('[./]').split("/github-status/2020/11/20201107.zip")

['', 'github-status', '2020', '11', '20201107', 'zip']

In [55]:
def extract_individual_reports(file_obj):
    """Extract the user reports embedded in a saved outage-report HTML page.

    The page carries its data as JavaScript object literals. This pulls out
    the "outage_subjects" list (reason id -> name) and the "recentReports"
    list, and returns one row per report.

    Args:
        file_obj: readable file-like object (text or binary) with the page HTML.

    Returns:
        pandas.DataFrame with columns 'id', 'country' and 'reason'. It is
        empty when the page has no "recentReports" list. 'reason' is
        'Unknown' when a report has no subject field.

    Raises:
        ValueError: if the page content is None or has no "outage_subjects" list.
        Exception: if a report has neither 'country_code' nor 'countryCode'.
        KeyError: if a report lacks '_id' or references an unknown subject id.
    """
    raw = file_obj.read()
    if raw is None:
        raise ValueError(f'ERROR: HTML IS NONE \n file_obj {file_obj} ')
    if isinstance(raw, (bytes, bytearray)):
        # str(bytes) would give the "b'...'" repr with escaped quotes and
        # non-ASCII bytes, which breaks the JSON parsing below.
        html = bytes(raw).decode('utf-8', errors='replace')
    else:
        html = str(raw)

    # Get list of reasons
    subjects_match = re.search('"outage_subjects":(.*?)],"', html)
    if subjects_match is None:
        raise ValueError('ERROR: "outage_subjects" list not found in page')
    # The closing bracket is consumed by the regex but not captured; add it back.
    list_of_reasons = json.loads(subjects_match.group(1) + ']')
    map_of_reasons = {reason['_id']: reason['name'] for reason in list_of_reasons}

    reports = []
    reports_match = re.search('"recentReports":(.*?)],"_', html)
    if reports_match is not None:
        # The data is in embedded javascript. Remove constructors to make it JSON
        # We used the closing brackets in the regex and don't capture them, adding them back again
        sanitized_match = re.sub(r'new Date\(".+?"\)', '""', reports_match.group(1)) + ']'
        reports = json.loads(sanitized_match)

    report_id = []
    report_country = []
    report_reason = []
    for datapoint in reports:
        try:
            current_id = datapoint['_id']

            if 'country_code' in datapoint:
                country = datapoint['country_code']
            elif 'countryCode' in datapoint:
                country = datapoint['countryCode']
            else:
                raise Exception('Country code not found')

            if 'oSubjectId' in datapoint:
                reason = map_of_reasons[datapoint['oSubjectId']]
            else:
                reason = 'Unknown'
                for name in datapoint:
                    if 'subject' in name or 'Subject' in name:
                        reason = map_of_reasons[datapoint[name]]
                        # Exactly one reason per report: stop at the first
                        # subject-like key so all columns stay the same length.
                        break

            report_id.append(current_id)
            report_country.append(country)
            report_reason.append(reason)
        except Exception:
            # Show the report that failed, then propagate the error.
            print(datapoint)
            raise

    return pd.DataFrame({
        'id': report_id,
        'country': report_country,
        'reason': report_reason
    })

In [56]:
# Run the report extractor on a locally saved outage-report page and display
# the resulting DataFrame.
# NOTE(review): this reuses the name `status_data`, which the GitHub parser
# cells bind to a list of dicts.
status_data = None
with open('outage_report_instagram.html') as f:
    status_data = extract_individual_reports(f)
status_data

Unnamed: 0,id,country,reason
0,5fc3c224dcd8580a7973cd8c,RU,Can't login
1,5fc3c11dfb995a344b9b68ef,TR,Newsfeed
2,5fc3bef2fb995a344b9b68ed,KZ,Mobile app crashes
3,5fc3bb19ae91e3734cd1777b,US,Can't upload pics
4,5fc3b738dcd8580a7973cd80,GR,Website down
5,5fc3b66bd4d3565ae19b14b5,RU,Mobile app not working


In [41]:
# NOTE(review): `reports_obj` is not assigned in any cell shown here, so this
# display depends on earlier kernel state.
reports_obj

[{'_id': '5a8cb07dee5285d43965f834', 'name': 'Website down'},
 {'_id': '5a8cb06fee5285403b65f834', 'name': 'Mobile app not working'},
 {'_id': '5a8cb06fee5285403b65f835', 'name': 'Mobile app crashes'},
 {'_id': '5a8d3803ee52858b2065f837', 'name': 'Message read problems'},
 {'_id': '5a8d3803ee52858b2065f838', 'name': 'Message send problems'},
 {'_id': '5a8f0b78ee5285e04e16fca8', 'name': "Can't login"},
 {'_id': '5a9bdbea1b4314600050c082', 'name': 'Everything is down'},
 {'_id': '5a8f18e4ee5285ad7116fca7', 'name': "Can't upload pics"},
 {'_id': '5a912a4eee5285b07a16fcad', 'name': 'Hashtags not working'},
 {'_id': '5a91ab48ee5285e14216fca8', 'name': 'Newsfeed'}]

In [30]:
# Display the first element of `reports_obj`.
# NOTE(review): `reports_obj` is not assigned in any cell shown here, so this
# depends on earlier kernel state.
reports_obj[0]

{'_id': '5fc3c224dcd8580a7973cd8c',
 'serviceId': '5458ac234488de92aa9cda3a',
 'ip': '178.67.193.121',
 'oSubjectId': '5a8f0b78ee5285e04e16fca8',
 'isAmp': True,
 'uaHash': '38c958b1452dc5eda97de8e89acdfcba',
 'langCode': 'ru',
 'latitude': 68.9792,
 'longitude': 33.0925,
 'cityGeonameId': 524305,
 'cityName': 'Murmansk',
 'divisionCode': '49',
 'divisionGeonameId': 524304,
 'countryCode': 'RU',
 'countryGeonameId': 2017370,
 'continentCode': 'EU',
 'continentGeonameId': 6255148,
 'postalCode': '183006',
 'time': '',
 '__v': 0,
 'countryGeoname': {'_id': '5c883341232cdab94da82765',
  'geoname_id': 2017370,
  'name': 'Russian Federation',
  'latitude': 60,
  'longitude': 100,
  'feature_code': 'PCLI',
  'country_code': 'RU',
  'admin1_code': '00',
  'center': [60, 100],
  'alt_names': {'de': {'_id': '5c848dee3e5a333eadaa4547',
    'alt_geoname_id': 2419059,
    'geoname_id': 2017370,
    'lang_code': 'de',
    'is_preferred_name': True,
    'is_short_name': False,
    'is_colloquial': F