In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta as time_delta
from ufo_downloader import UFOData, LocationFinder
from ufo_parser import DurationParser
from json import JSONDecodeError

In [2]:
# File names used by this notebook, named once so they are easy to find and change.
RAW_DATA_FILE = 'data.json'
LOCATION_CACHE_FILE = 'location_cache.json'
US_CITIES_FILE = 'uscitiesv1.3.csv'

# Raw records are read from `dataclass.data`; `finder` resolves city/state pairs.
dataclass = UFOData(RAW_DATA_FILE)
finder = LocationFinder(LOCATION_CACHE_FILE, US_CITIES_FILE)
durationParser = DurationParser()

In [12]:
def process_raw_data(raw_data):
    """Convert one raw sighting record into a flat, cleaned row dictionary.

    Parameters
    ----------
    raw_data : mapping
        One raw record. The keys read here are 'City', 'State', 'Date',
        'Duration', 'link', 'Shape' and 'Summary'.

    Returns
    -------
    dict
        The original city/date/duration values next to their parsed
        counterparts, the looked-up latitude/longitude/confidence, and the
        pass-through fields 'Link', 'Shape', 'State' and 'Summary'.
        Values that cannot be parsed become NaN/NaT; a failed city lookup
        yields NaN coordinates and a confidence of -1.

    Raises
    ------
    Exception
        With the message 'Connection may be dead...' when the location lookup
        raises JSONDecodeError. Any other unexpected error is re-raised after
        the offending record has been printed.

    Side effects
    ------------
    Increments the module-level ``counter`` (it must be defined before the
    first call). On every 1000th processed record a progress line is printed
    and ``finder.save_cache()`` is called.
    """
    global counter
    try:
        # DurationParser.parse does not raise on bad input; it returns None instead.
        duration = DurationParser.parse(raw_data['Duration'])
        if duration is None:
            duration = np.nan

        # The lookup can raise ValueError, and it can also hand back None for
        # latitude/longitude; both cases have to be tolerated here.
        try:
            city, location = finder.find_ugly(raw_data['City'], raw_data['State'])
        except JSONDecodeError as de:
            # JSONDecodeError is a subclass of ValueError, so this clause must
            # come before the ValueError clause below.
            # NOTE(review): presumably caused by a non-JSON response from the
            # lookup service -- confirm against LocationFinder.
            raise Exception('Connection may be dead...') from de
        except ValueError as ve:
            # City could not be resolved: keep the raw name, mark the location unknown.
            city = raw_data['City']
            location = {
                'latitude': np.nan,
                'longitude': np.nan,
                'confidence': -1
            }
            # An empty city name is expected to fail, so it is not worth logging.
            if raw_data['City'] != '':
                print(ve)

        # pd.to_datetime raises ValueError for strings it cannot recognise as a
        # date (this includes out-of-bounds timestamps). It returns None for a
        # None input, which makes the comparison below raise TypeError; both
        # are treated as "no usable date" instead of aborting the whole run.
        try:
            date = pd.to_datetime(raw_data['Date'])
            # A date in the future is moved back by 100 years.
            # NOTE(review): presumably compensates for two-digit years being
            # resolved into the wrong century -- confirm.
            if date > datetime.now():
                date -= time_delta(years=100)
        except (ValueError, TypeError) as err:
            date = np.nan
            # Keep the parser message and the raw value visibly separated.
            print('{} (original value: {!r})'.format(err, raw_data['Date']))

        counter += 1
        if counter % 1000 == 0:
            print(str(counter) + ' items processed.')
            # Periodic checkpoint of the lookup cache.
            finder.save_cache()

        return {
            'City_original': raw_data['City'],
            'City_found': city,
            'Date_original': raw_data['Date'],
            'Date_parsed': pd.to_datetime(date),  # turns the NaN placeholder into NaT
            'Duration_original': raw_data['Duration'],
            'Duration_parsed': duration,  # automatically converted to timedelta64
            # errors='coerce' maps None / non-numeric coordinates to NaN.
            'Latitude': pd.to_numeric(location['latitude'] if 'latitude' in location else np.nan, errors='coerce'),
            'Longitude': pd.to_numeric(location['longitude'] if 'longitude' in location else np.nan, errors='coerce'),
            # Guarded like the coordinates; -1 is the same "unknown" marker
            # used when the lookup fails.
            'Location_confidence': pd.to_numeric(location['confidence'] if 'confidence' in location else -1),
            'Link': raw_data['link'],
            'Shape': raw_data['Shape'],
            'State': raw_data['State'],
            'Summary': raw_data['Summary'],
        }
    except Exception:
        print('Something went wrong while processing the following record:')
        print(raw_data)
        # Re-raise unchanged so the caller sees the original error and traceback.
        raise

In [13]:
# Reset the progress counter that process_raw_data() updates via `global counter`,
# so re-running this cell starts counting from zero again.
counter = 0
try:
    # Build all rows first, then create the DataFrame in one go.
    frame = pd.DataFrame([process_raw_data(row) for row in dataclass.data])
    frame.to_csv('data_new.csv', sep=";", encoding="UTF8")
finally:
    # process_raw_data() only checkpoints the cache every 1000 records; saving
    # here as well keeps the lookups made since the last checkpoint even when
    # the run aborts part-way (e.g. 'Connection may be dead...').
    finder.save_cache()

Can not find city: Paul’s Valley (near)
Unknown string formatmd 9 10:10
Can not find city: Island Læsø (Laesoe)(Denmark)
1000 items processed.
Can not find city: BAYRAMÇ HACIBEKRLER KÖYÜ (Turkey)
Can not find city: ((Unknown))
Can not find city: Suliszów, Wietokrzyskie (Poland)
2000 items processed.
Can not find city: ((Location no revealed by witness))
Out of bounds nanosecond timestamp: 1-01-04 23:22:004/ 23:22
3000 items processed.
Can not find city: Cottage Grove & Madison
4000 items processed.
Can not find city: Lake Umatilla; John Day Dam (~30 mi. E of;  approximate)
Can not find city: Curaçao (Mexico)
5000 items processed.
Can not find city: Québec City (Canada)
6000 items processed.
Can not find city: Málaga (Granada)(Spain)
Can not find city: San Luis Potosí (Mexico)
7000 items processed.
Can not find city: Vancouver & Portland
8000 items processed.
Can not find city: Chajarí Entre Ríos (Argentina)
Can not find city: Jacksonville Flat & Atlanta (between)
Can not find city: São

Can not find city: São Paulo (Brazil)
82000 items processed.
Can not find city: Gateshead,Tyne & Wear (UK/England)
83000 items processed.
Can not find city: São Paulo (Brasil)
84000 items processed.
Can not find city: San Marcos & Escondido
Can not find city: Mayagüez
Can not find city: Hwy. 6 & 68 (Canada)
Can not find city: Ísafjörður (Iceland)
Can not find city: (Unspecified)
85000 items processed.
86000 items processed.
Can not find city: San José (Costa Rica)
Can not find city: Gila Bend- Yuma?
87000 items processed.
Can not find city: Gershøj (Denmark)
Can not find city: Norrköping (Sweden)
Can not find city: South Plainfield?
88000 items processed.
Can not find city: Joanópolis (Brasil)
Can not find city: St-Rémi (Canada)
Can not find city: Ciel de St-Rémi (Canada)
Can not find city: Kansas ??  (In-flight sighting)
Can not find city: Stony Plain & Spruce Grove (Canada)
Can not find city: Pärnu (Estonia)
89000 items processed.
Can not find city: Réunion Island (France)
Can not fi

Can not find city: Mexicali B. C. Mexico; Sierra Cucapah; Cerro prieto., B. C.


In [14]:
# Sanity check: list the dtype pandas inferred for every column of `frame`
# so that the parsed date, duration and coordinate columns can be verified.
frame.dtypes

City_found                      object
City_original                   object
Date_original                   object
Date_parsed             datetime64[ns]
Duration_original               object
Duration_parsed        timedelta64[ns]
Latitude                       float64
Link                            object
Location_confidence              int64
Longitude                      float64
Shape                           object
State                           object
Summary                         object
dtype: object

In [16]:
# Fraction of all rows whose Location_confidence equals 3.
# NOTE(review): 3 is presumably the finder's best-match score -- confirm.
confidence_3_rows = frame[frame.Location_confidence == 3]
len(confidence_3_rows) / len(frame)

0.8061137143815247