In [1]:
# We need to create a SQLite database.
# The source data could come from an API or from CSV files; this notebook reads a CSV.

# Load the data into a pandas DataFrame -> load it into the database

#### Load Dependencies

In [2]:
# data science
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import scipy.stats as stats

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, text, inspect, func

#### Read in Data File

In [3]:
# Location of the raw sightings export, relative to the notebook.
filepath = "./Resources/scrubbed.csv"

# Read the file in a single pass. With the default low_memory=True pandas
# infers dtypes chunk by chunk, which on this file appears to be what triggers
# the mixed-type warning and leaves some columns holding both str and numeric
# objects. low_memory=False makes each column's dtype consistent; the columns
# that contain malformed values still come back as `object` and are converted
# explicitly in the cleaning cells below.
df = pd.read_csv(filepath, low_memory=False)

# Preview the first rows to confirm the load worked.
df.head()

  df = pd.read_csv(filepath)


Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


#### Data Cleaning

In [4]:
# Replace missing values with explicit placeholders, column by column:
# 'unknown' for the categorical fields and 'no comment' for the free text.
df = df.fillna(
    value={
        "country": 'unknown',
        "state": 'unknown',
        "shape": 'unknown',
        "comments": 'no comment',
    }
)

# Summarise the frame to confirm the non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80332 entries, 0 to 80331
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   datetime              80332 non-null  object 
 1   city                  80332 non-null  object 
 2   state                 80332 non-null  object 
 3   country               80332 non-null  object 
 4   shape                 80332 non-null  object 
 5   duration (seconds)    80332 non-null  object 
 6   duration (hours/min)  80332 non-null  object 
 7   comments              80332 non-null  object 
 8   date posted           80332 non-null  object 
 9   latitude              80332 non-null  object 
 10  longitude             80332 non-null  float64
dtypes: float64(1), object(10)
memory usage: 6.7+ MB


In [5]:
# Shape labels to exclude: every sighting tagged with one of these is
# dropped, which pares down the set of shapes left in the data.
item_shape = [
    'hexagon', 'delta', 'round', 'crescent', 'pyramid',
    'flare', 'dome', 'changed', 'cross', 'cone',
]

# Keep only the rows whose shape is NOT in the exclusion list.
df = df.loc[~df["shape"].isin(item_shape)]

In [6]:
# Restrict the data to sightings whose country code is exactly 'us'
# (rows filled with 'unknown' earlier are dropped along with other countries).
df = df.loc[df["country"].eq('us')]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64666 entries, 0 to 80331
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   datetime              64666 non-null  object 
 1   city                  64666 non-null  object 
 2   state                 64666 non-null  object 
 3   country               64666 non-null  object 
 4   shape                 64666 non-null  object 
 5   duration (seconds)    64666 non-null  object 
 6   duration (hours/min)  64666 non-null  object 
 7   comments              64666 non-null  object 
 8   date posted           64666 non-null  object 
 9   latitude              64666 non-null  object 
 10  longitude             64666 non-null  float64
dtypes: float64(1), object(10)
memory usage: 5.9+ MB


In [7]:
# Parse the sighting timestamps. format='mixed' lets pandas infer the layout
# of each value individually, and errors='coerce' turns anything it cannot
# parse into NaT instead of raising; those rows are dropped in the next cell.
df = df.assign(
    datetime=pd.to_datetime(df['datetime'], errors='coerce', format='mixed')
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64666 entries, 0 to 80331
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   datetime              64183 non-null  datetime64[ns]
 1   city                  64666 non-null  object        
 2   state                 64666 non-null  object        
 3   country               64666 non-null  object        
 4   shape                 64666 non-null  object        
 5   duration (seconds)    64666 non-null  object        
 6   duration (hours/min)  64666 non-null  object        
 7   comments              64666 non-null  object        
 8   date posted           64666 non-null  object        
 9   latitude              64666 non-null  object        
 10  longitude             64666 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(9)
memory usage: 5.9+ MB


In [8]:
# Discard the rows whose timestamp is missing (NaT left by the coercing parse).
df = df.loc[df['datetime'].notna()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64183 entries, 0 to 80331
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   datetime              64183 non-null  datetime64[ns]
 1   city                  64183 non-null  object        
 2   state                 64183 non-null  object        
 3   country               64183 non-null  object        
 4   shape                 64183 non-null  object        
 5   duration (seconds)    64183 non-null  object        
 6   duration (hours/min)  64183 non-null  object        
 7   comments              64183 non-null  object        
 8   date posted           64183 non-null  object        
 9   latitude              64183 non-null  object        
 10  longitude             64183 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(9)
memory usage: 5.9+ MB


In [9]:
# Convert latitude from object to a numeric dtype; values that cannot be
# parsed as numbers become NaN rather than raising.
df = df.assign(latitude=pd.to_numeric(df['latitude'], errors='coerce'))
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64183 entries, 0 to 80331
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   datetime              64183 non-null  datetime64[ns]
 1   city                  64183 non-null  object        
 2   state                 64183 non-null  object        
 3   country               64183 non-null  object        
 4   shape                 64183 non-null  object        
 5   duration (seconds)    64183 non-null  object        
 6   duration (hours/min)  64183 non-null  object        
 7   comments              64183 non-null  object        
 8   date posted           64183 non-null  object        
 9   latitude              64183 non-null  float64       
 10  longitude             64183 non-null  float64       
dtypes: datetime64[ns](1), float64(2), object(8)
memory usage: 5.9+ MB


In [10]:
# Convert the duration in seconds to a numeric dtype, coercing unparseable
# entries to NaN. The column name contains spaces and parentheses, so it is
# passed to assign() through a dict instead of as a keyword.
df = df.assign(
    **{'duration (seconds)': pd.to_numeric(df['duration (seconds)'], errors='coerce')}
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64183 entries, 0 to 80331
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   datetime              64183 non-null  datetime64[ns]
 1   city                  64183 non-null  object        
 2   state                 64183 non-null  object        
 3   country               64183 non-null  object        
 4   shape                 64183 non-null  object        
 5   duration (seconds)    64181 non-null  float64       
 6   duration (hours/min)  64183 non-null  object        
 7   comments              64183 non-null  object        
 8   date posted           64183 non-null  object        
 9   latitude              64183 non-null  float64       
 10  longitude             64183 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(7)
memory usage: 5.9+ MB


In [11]:
# Discard the rows whose duration is missing (NaN left by the numeric coercion).
df = df.loc[df['duration (seconds)'].notna()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64181 entries, 0 to 80331
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   datetime              64181 non-null  datetime64[ns]
 1   city                  64181 non-null  object        
 2   state                 64181 non-null  object        
 3   country               64181 non-null  object        
 4   shape                 64181 non-null  object        
 5   duration (seconds)    64181 non-null  float64       
 6   duration (hours/min)  64181 non-null  object        
 7   comments              64181 non-null  object        
 8   date posted           64181 non-null  object        
 9   latitude              64181 non-null  float64       
 10  longitude             64181 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(7)
memory usage: 5.9+ MB


In [12]:
# Keep only sightings whose timestamp falls in 1994 or later.
df = df.loc[df['datetime'].dt.year.ge(1994)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58560 entries, 59 to 80331
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   datetime              58560 non-null  datetime64[ns]
 1   city                  58560 non-null  object        
 2   state                 58560 non-null  object        
 3   country               58560 non-null  object        
 4   shape                 58560 non-null  object        
 5   duration (seconds)    58560 non-null  float64       
 6   duration (hours/min)  58560 non-null  object        
 7   comments              58560 non-null  object        
 8   date posted           58560 non-null  object        
 9   latitude              58560 non-null  float64       
 10  longitude             58560 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(7)
memory usage: 5.4+ MB


#### Below is the remaining starter code

In [13]:
# Build the SQLAlchemy engine for the SQLite file `ufo.sqlite`
# (the three-slash URL form is a path relative to the working directory).
engine = sqlalchemy.create_engine("sqlite:///ufo.sqlite")

In [14]:
# Persist the cleaned frame as table "ufo", replacing any table of that name
# left by an earlier run; the DataFrame index is not written as a column.
# NOTE(review): the schema printout further down shows two spaces between
# "longitude" and its type, which suggests that column name carries a trailing
# space from the CSV header - confirm before querying the table by column name.
df.to_sql(name="ufo", con=engine, if_exists="replace", index=False)

58560

In [15]:
# Confirm the write by reading the schema back out of the database.

# The inspector exposes table and column metadata for the engine's database.
inspector_gadget = inspect(engine)
tables = inspector_gadget.get_table_names()

# One section per table: its name, a divider, then one "name type" line per
# column, followed by a blank line.
for table in tables:
    columns = inspector_gadget.get_columns(table)
    print(table, "-----------", sep="\n")
    for column in columns:
        print(f'{column["name"]} {column["type"]}')
    print()

ufo
-----------
datetime DATETIME
city TEXT
state TEXT
country TEXT
shape TEXT
duration (seconds) FLOAT
duration (hours/min) TEXT
comments TEXT
date posted TEXT
latitude FLOAT
longitude  FLOAT



In [16]:
# Dispose of the engine now that the write and the schema check are done,
# releasing the database connections held by its pool.
engine.dispose()