In [None]:
import csv
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta as time_delta
from ufo_downloader import UFOData, LocationFinder
from ufo_parser import DurationParser
from json import JSONDecodeError

In [None]:
# Raw report source built from 'data.json'; its `.data` attribute is iterated when the frame is built.
# NOTE(review): presumably the scraped UFO reports -- confirm against ufo_downloader.UFOData.
dataclass = UFOData('data.json')
# Location lookup helper constructed from a cache file and a US cities CSV.
finder = LocationFinder('location_cache.json', 'uscitiesv1.3.csv')
# NOTE(review): presumably the pause (in seconds) between HTTP lookups -- confirm in LocationFinder.
finder.http_sleep = 1
# NOTE(review): process_raw_data calls DurationParser.parse on the class, so this
# instance is not used by the cells visible here.
durationParser = DurationParser()

In [None]:
def process_raw_data(raw_data, counter):
    """Convert one raw report into a flat record of original and parsed fields.

    Args:
        raw_data: Mapping for a single report. The keys 'City', 'State',
            'Date', 'Duration', 'Shape', 'Summary' and 'link' are read.
        counter: Number of records processed before this one. It is only
            incremented on the local copy, so the caller must pass an
            advancing value for the 10000-record progress message and cache
            save to ever trigger.

    Returns:
        A dict with the original and parsed city, date and duration, the
        latitude/longitude/confidence of the location lookup, and the
        untouched 'Link', 'Shape', 'State' and 'Summary' values.

    Raises:
        Exception: If the location lookup raises JSONDecodeError (reported as
            'Connection may be dead...'), or if anything else unexpected fails.
            The offending record is printed before the exception propagates.
    """
    try:
        # Parse duration: will not throw an exception on bad input, it returns None instead.
        # NOTE(review): `parse` is called on the class, not on the module-level
        # `durationParser` instance -- confirm it is a static/class method.
        duration = DurationParser.parse(raw_data['Duration'])
        if duration is None:
            duration = np.nan

        # The lookup may either raise ValueError or return a location without
        # usable latitude/longitude (a design issue of the finder); both cases
        # end up as NaN coordinates below.
        try:
            city, location = finder.find_ugly(raw_data['City'], raw_data['State'])
        except JSONDecodeError as de:
            # JSONDecodeError is a subclass of ValueError, so it has to be caught first.
            # Chain the original error so the root cause stays visible in the traceback.
            raise Exception('Connection may be dead...') from de
        except ValueError as ve:
            # Lookup failed: keep the original city name and mark the location as unknown.
            city = raw_data['City']
            location = {
                'latitude': np.nan,
                'longitude': np.nan,
                'confidence': -1
            }
            # An empty city is expected to fail, so only report non-empty ones.
            if raw_data['City'] != '':
                print(ve)

        # pd.to_datetime raises ValueError if a string cannot be recognised as a date.
        try:
            date = pd.to_datetime(raw_data['Date'])
            # A date in the future is moved back by 100 years.
            # NOTE(review): presumably compensates for two-digit years resolved
            # to the wrong century -- confirm against the raw data.
            if date > datetime.now():
                date -= time_delta(years=100)
        except ValueError as ve:
            date = np.nan
            print(str(ve) + raw_data['Date'])

        # Only the local copy is incremented; see the docstring for the caller's part.
        counter += 1
        if counter % 10000 == 0:
            print(str(counter) + ' items processed.')
            finder.save_cache()

        return {
            'City_original': raw_data['City'],
            'City_found': city,
            'Date_original': raw_data['Date'],
            # Normalises the NaN placeholder of an unparseable date to NaT.
            'Date_parsed': pd.to_datetime(date),
            'Duration_original': raw_data['Duration'],
            # Original author's note: automatically converted to timedelta64
            # (verify with frame.dtypes).
            'Duration_parsed': duration,
            'Latitude': pd.to_numeric(location.get('latitude', np.nan), errors='coerce'),
            'Longitude': pd.to_numeric(location.get('longitude', np.nan), errors='coerce'),
            'Location_confidence': pd.to_numeric(location['confidence']),
            'Link': raw_data['link'],
            'Shape': raw_data['Shape'],
            'State': raw_data['State'],
            'Summary': raw_data['Summary'],
        }
    except Exception:
        print('Something went wrong while processing the following record:')
        print(raw_data)
        # Re-raise to the caller with the original traceback intact.
        raise

In [None]:
# Process every raw report and collect the cleaned records.
# The running count has to be advanced here: process_raw_data only increments
# its own local copy of `counter`, so passing the same value for every row
# would never reach its 10000-record progress / cache-save checkpoint.
counter = 0
records = []
for row in dataclass.data:
    records.append(process_raw_data(row, counter))
    counter += 1

# Build the frame once from the list of records and persist it as ';'-separated CSV.
frame = pd.DataFrame(records)
frame.to_csv('data_new_20_30.csv', sep=";", encoding="UTF8")
# Final save so lookups made since the last checkpoint are not lost.
finder.save_cache()

In [None]:
# Display the dtype pandas inferred for each column of the processed frame.
frame.dtypes  # check if data types are correct...