In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Move up to the project root so the relative "data/..." paths used below resolve.
# Guarded so that re-running this cell does not climb one directory further each
# time: the chdir is skipped once the cleaned data file is reachable from the cwd.
if not os.path.exists(os.path.join("data", "processed", "ufo_clean.csv")):
    os.chdir("..")
print("Working dir:", os.getcwd())

Working dir: d:\Projects\ufo_sightings_eda


In [3]:
# Load the cleaned sightings table; both timestamp columns are parsed as datetimes.
ufo_df = pd.read_csv(
    filepath_or_buffer="data/processed/ufo_clean.csv",
    parse_dates=["datetime", "date posted"],
)

In [None]:
# Preview the first five rows of the loaded table.
ufo_df.head(n=5)

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,1949-10-10 20:30:00,san marcos,tx,us,cylinder,2700.0,45 minutes,This event took place in early fall around 194...,2004-04-27,29.883056,-97.941111
1,1949-10-10 21:00:00,lackland afb,tx,us,light,7200.0,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,2005-12-16,29.38421,-98.581082
2,1955-10-10 17:00:00,chester (uk/england),,gb,circle,20.0,20 seconds,Green/Orange circular disc over Chester&#44 En...,2008-01-21,53.2,-2.916667
3,1956-10-10 21:00:00,edna,tx,us,circle,20.0,1/2 hour,My older brother and twin sister were leaving ...,2004-01-17,28.978333,-96.645833
4,1960-10-10 20:00:00,kaneohe,hi,us,light,900.0,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,2004-01-22,21.418056,-157.803611


In [5]:
# Column dtypes and non-null counts for the loaded table.
ufo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80022 entries, 0 to 80021
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   datetime              80022 non-null  datetime64[ns]
 1   city                  80022 non-null  object        
 2   state                 74363 non-null  object        
 3   country               79980 non-null  object        
 4   shape                 80022 non-null  object        
 5   duration (seconds)    80022 non-null  float64       
 6   duration (hours/min)  80022 non-null  object        
 7   comments              80022 non-null  object        
 8   date posted           80022 non-null  datetime64[ns]
 9   latitude              80022 non-null  float64       
 10  longitude             80022 non-null  float64       
dtypes: datetime64[ns](2), float64(3), object(6)
memory usage: 6.7+ MB


In [6]:
# Summary statistics of the sighting timestamps (count, mean, min, quartiles, max).
ufo_df["datetime"].describe()

count                            80022
mean     2004-05-22 15:13:26.179800576
min                1906-11-11 00:00:00
25%                2001-08-05 04:45:00
50%                2006-11-24 11:22:30
75%                2011-06-23 23:47:30
max                2014-05-08 18:45:00
Name: datetime, dtype: object

Feature Engineering


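Create features for detailed analysis:
- year
- month (abbreviated name)
- weekday (abbreviated name, stored in the `day` column)
- hour
- reporting lag (days between the sighting and the date it was posted)
- reporting lag category for easy visualization
- duration (minutes)
- duration category
- decade calculation for long-term trends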

In [7]:
# Derive the calendar features and the reporting lag in one `assign` call.
# Every entry is computed from the frame's own "datetime" / "date posted" columns.
ufo_df = ufo_df.assign(
    **{
        "year": lambda frame: frame["datetime"].dt.year,
        # "%b" / "%a" give abbreviated month and weekday names.
        "month": lambda frame: frame["datetime"].dt.strftime("%b"),
        "day": lambda frame: frame["datetime"].dt.strftime("%a"),
        "hour": lambda frame: frame["datetime"].dt.hour,
        # Dividing the timedelta by a one-day Timedelta yields fractional days.
        "reporting lag (days)": lambda frame: (
            (frame["date posted"] - frame["datetime"]) / pd.Timedelta(days=1)
        ).astype("Float64"),
    }
)

In [8]:
def reporting_category(report_days):
    """Bucket a reporting lag, given in days, into a labelled range.

    Parameters
    ----------
    report_days : float or pd.NA
        Lag in days; may be fractional.

    Returns
    -------
    str or pd.NA
        "Same day" for any value below 1 (negative values included), then the
        first label whose inclusive upper bound is not exceeded, and
        ">3 years" beyond 3 * 365 days. Missing input (NaN / pd.NA / None)
        returns pd.NA.
    """
    # Missing lags must stay missing: `if pd.NA < 1` raises TypeError, and a
    # float NaN fails every comparison and would be labelled ">3 years".
    if pd.isna(report_days):
        return pd.NA

    # The first bucket uses a strict bound; all later ones are inclusive.
    if report_days < 1:
        return "Same day"

    inclusive_upper_bounds = (
        (7, "1-7 days"),
        (30, "8-30 days"),
        (90, "31-90 days"),
        (365, "91-365 days"),
        (3 * 365, "1-3 years"),
    )
    for upper_bound, label in inclusive_upper_bounds:
        if report_days <= upper_bound:
            return label
    return ">3 years"
    
# Label every sighting with its reporting-lag bucket (element-wise over the lag column).
ufo_df["reporting lag category"] = ufo_df["reporting lag (days)"].apply(
    func=reporting_category
)

In [9]:
# Convert the duration from seconds to minutes.
ufo_df["duration (minutes)"] = ufo_df["duration (seconds)"].div(60)

In [10]:
# duration category
def duration_bucket(sec):
    """Bucket a sighting duration, given in seconds, into a labelled range.

    Parameters
    ----------
    sec : float or pd.NA
        Duration in seconds.

    Returns
    -------
    str or pd.NA
        The first label whose exclusive upper bound is greater than ``sec``
        (so exactly 10 s is "10–60s", exactly 300 s is "5–10m"), "12–24h" up
        to and including 86400 s, and ">24h" beyond that. Missing input
        (NaN / pd.NA / None) returns pd.NA.
    """
    # NaN fails every comparison, so without this guard a missing duration
    # would fall through to the final branch and be labelled ">24h".
    if pd.isna(sec):
        return pd.NA

    exclusive_upper_bounds = (
        (10, "<10s"),
        (60, "10–60s"),
        (300, "1–5m"),
        (600, "5–10m"),
        (1800, "10–30m"),
        (3600, "30–60m"),
        (10800, "1–3h"),
        (21600, "3–6h"),
        (43200, "6–12h"),
    )
    for upper_bound, label in exclusive_upper_bounds:
        if sec < upper_bound:
            return label

    # The last finite bucket is inclusive: exactly 24 hours is still "12–24h".
    return "12–24h" if sec <= 86400 else ">24h"
    
# Label every sighting with its duration bucket (element-wise over the seconds column).
ufo_df["duration category"] = ufo_df["duration (seconds)"].apply(
    func=duration_bucket
)

In [11]:
# Spot-check the engineered columns on five random rows; the fixed seed keeps
# the displayed rows identical across Restart & Run All.
ufo_df.sample(5, random_state=42)

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude,year,month,day,hour,reporting lag (days),reporting lag category,duration (minutes),duration category
69527,2007-08-26 16:30:00,louisville,ky,us,sphere,120.0,2 minutes,Two metallic shperes hovering in broad dayligh...,2008-03-04,38.254167,-85.759444,2007,Aug,Sun,16,190.3125,91-365 days,2.0,1–5m
37221,2002-04-17 20:50:00,mt. carmel,pa,us,light,300.0,five minutes approx,2 light objects in the night sky not correspon...,2002-04-25,40.796944,-76.412222,2002,Apr,Wed,20,7.131944,8-30 days,5.0,5–10m
28065,1999-02-23 20:00:00,north carolina (southeast part),nc,us,sphere,10.0,5-10 seconds,while driving north on interstate 95&#44 my mo...,1999-04-02,35.759573,-79.0193,1999,Feb,Tue,20,37.166667,31-90 days,0.166667,10–60s
43890,2012-05-23 21:45:00,gardner,il,us,sphere,2.0,2 seconds,Low flying green blinking sphere,2012-05-29,41.185556,-88.309722,2012,May,Wed,21,5.09375,1-7 days,0.033333,<10s
55903,2013-07-13 17:00:00,ellensburg,wa,us,sphere,300.0,5 minutes,3 spheres circling around then moving to the w...,2013-07-14,46.996667,-120.546667,2013,Jul,Sat,17,0.291667,Same day,5.0,5–10m
