In [57]:
# Import Dependencies
import pandas as pd

In [58]:
# Create a reference the CSV file desired
csv_path = "Resources/ufoSightings.csv"

# Read the CSV into a Pandas DataFrame
ufo_df = pd.read_csv(csv_path)

# Print the first five rows of data to the screen
ufo_df.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,`20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [64]:
clean_ufo_df.sort_values(['duration (seconds)','state']).head(30)

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
4081,10/23/2008 4:45,remote,wy,,flash,0.001,0.001sec,brilliant strobe light at 4am&#44 moving light...,1/10/2009,-46.163992,169.87505
42378,5/15/1987 23:00,island lake,il,us,light,0.01,milliseconds,4 red laser like lines,1/12/2012,42.2761111,-88.191944
56596,7/15/1974 22:00,bridgeview,il,us,triangle,0.01,milliseconds,Triangular streak SW to NE horizon to horizon ...,1/17/2004,41.75,-87.804167
23754,12/9/1999 18:15,lyle,wa,us,fireball,0.01,millisecond,Incredible...full moon sized bright orange fir...,12/16/1999,45.6961111,-121.285
52996,6/30/2002 3:15,helsinki (finland),,,unknown,0.01,0.01sec,Overpassing UFO,7/1/2002,60.173324,24.941025
13375,11/28/2001 3:41,milwaukie,or,us,fireball,0.02,0.02 sec,High speed moving fireball fallow by a bright ...,12/5/2001,45.4463889,-122.638056
18735,12/17/2011 18:40,springhill,fl,,flash,0.05,.05 second,We saw a flash of light while observing a plan...,1/12/2012,28.483168,-82.536987
21258,12/29/2012 19:30,asheboro,nc,us,light,0.05,.05 seconds,Single bluish white downward arc of fast movin...,2/4/2013,35.7077778,-79.813889
7378,10/9/2012 23:30,woodbourne,ny,us,other,0.05,.05 seconds,A basket -shaped object&#44 intense orange at ...,10/30/2012,41.7597222,-74.594444
42057,5/13/2003 22:00,grove,oh,,unknown,0.05,.05 sec,bright flaring star,5/27/2003,41.158607,-81.22699


In [59]:
# Remove the rows with missing data
clean_ufo_df = ufo_df.dropna(how="all")
clean_ufo_df.count()

datetime                80332
city                    80332
state                   74535
country                 70662
shape                   78400
duration (seconds)      80332
duration (hours/min)    80332
comments                80317
date posted             80332
latitude                80332
longitude               80332
dtype: int64

In [60]:
clean_ufo_df.head()
clean_ufo_df.dtypes

datetime                 object
city                     object
state                    object
country                  object
shape                    object
duration (seconds)       object
duration (hours/min)     object
comments                 object
date posted              object
latitude                 object
longitude               float64
dtype: object

In [61]:
clean_ufo_df['duration (seconds)'] = clean_ufo_df['duration (seconds)'].str.replace('`','')

In [62]:
# Converting the "duration (seconds)" column's values to numeric
converted_ufo_df = clean_ufo_df.copy()
converted_ufo_df["duration (seconds)"] = converted_ufo_df.loc[:,"duration (seconds)"].astype(float)

In [32]:
converted_ufo_df.dtypes

#converted_ufo_df.head()

datetime                 object
city                     object
state                    object
country                  object
shape                    object
duration (seconds)      float64
duration (hours/min)     object
comments                 object
date posted              object
latitude                 object
longitude               float64
dtype: object

In [34]:
# Filter the data so that only those sightings in the US are in a DataFrame
usa_ufo_df = converted_ufo_df.loc[converted_ufo_df["country"] == "us", :]
usa_ufo_df.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700.0,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
3,10/10/1956 21:00,edna,tx,us,circle,20.0,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900.0,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611
5,10/10/1961 19:00,bristol,tn,us,sphere,300.0,5 minutes,My father is now 89 my brother 52 the girl wit...,4/27/2007,36.595,-82.188889
7,10/10/1965 23:45,norwalk,ct,us,disk,1200.0,20 minutes,A bright orange color changing to reddish colo...,10/2/1999,41.1175,-73.408333


In [35]:
# Count how many sightings have occured within each state
 = usa_ufo_df["state_countsstate"].value_counts()
state_counts.head()

ca    8912
wa    3966
fl    3835
tx    3447
ny    2980
Name: state, dtype: int64

In [36]:
# Using GroupBy in order to separate the data into fields according to "state" values
grouped_usa_df = usa_ufo_df.groupby(['state'])

# The object returned is a "GroupBy" object and cannot be viewed normally...
print(grouped_usa_df)

# In order to be visualized, a data function must be used...
grouped_usa_df.count().head(10)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000015A475AB308>


Unnamed: 0_level_0,datetime,city,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ak,319,319,319,311,268,319,319,319,319,319
al,642,642,642,629,564,642,642,642,642,642
ar,588,588,588,578,496,588,588,588,588,588
az,2414,2414,2414,2362,2017,2414,2414,2414,2414,2414
ca,8912,8912,8912,8684,7392,8912,8911,8912,8912,8912
co,1413,1413,1413,1385,1152,1413,1413,1413,1413,1413
ct,892,892,892,866,715,892,891,892,892,892
dc,7,7,7,7,5,7,7,7,7,7
de,166,166,166,165,138,166,166,166,166,166
fl,3835,3835,3835,3754,3350,3835,3835,3835,3835,3835


In [43]:
grouped_usa_df["duration (seconds)"].sum()

state
ak      951000.00
al      840390.00
ar      650848.50
az    15081472.90
ca    27567925.47
co     4122467.10
ct    12464112.30
dc         925.50
de      118810.50
fl     2929453.70
ga     9336854.10
hi     6692643.00
ia      513936.50
id      400609.30
il     1827708.57
in     4044247.70
ks      781978.50
ky     3234916.00
la     6731708.00
ma     1303291.00
md      602119.80
me     1678491.90
mi     6784468.60
mn      722355.33
mo     1225695.50
ms     3345134.00
mt      988173.00
nc     1750460.35
nd       92009.00
ne      274427.00
nh      991717.00
nj     7637736.00
nm     3070128.59
nv     2256357.00
ny     8360372.25
oh     2546152.60
ok      730854.30
or     1545188.77
pa     9450553.50
pr       26247.00
ri      372417.50
sc      961113.30
sd      443549.50
tn     1709676.30
tx     3290004.50
ut     3343923.50
va     2979355.00
vt      238139.50
wa    55918853.14
wi     2101089.00
wv     2908397.00
wy      195894.50
Name: duration (seconds), dtype: float64

In [41]:
# Since "duration (seconds)" was converted to a numeric time, it can now be summed up per state
state_duration = grouped_usa_df["duration (seconds)"].sum()
state_duration.head()

state
ak      951000.00
al      840390.00
ar      650848.50
az    15081472.90
ca    27567925.47
Name: duration (seconds), dtype: float64

In [42]:
# Creating a new DataFrame using both duration and count
state_summary_df = pd.DataFrame({"Number of Sightings": state_counts,
                                    "Total Visit Time": state_duration})
state_summary_df.head()

Unnamed: 0,Number of Sightings,Total Visit Time
ak,319,951000.0
al,642,840390.0
ar,588,650848.5
az,2414,15081472.9
ca,8912,27567925.47


In [45]:
# It is also possible to group a DataFrame by multiple columns
# This returns an object with multiple indexes, however, which can be harder to deal with
grouped_international_data = converted_ufo_df.groupby(['country', 'state'])

grouped_international_data.count().head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,datetime,city,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
country,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
au,al,1,1,1,1,1,1,1,1,1
au,dc,1,1,1,1,1,1,1,1,1
au,nt,2,2,2,2,2,2,2,2,2
au,oh,1,1,1,0,1,1,1,1,1
au,sa,2,2,2,1,2,2,2,2,2
au,wa,2,2,2,2,2,2,2,2,2
au,yt,1,1,1,1,1,1,1,1,1
ca,ab,288,288,284,221,288,288,288,288,288
ca,bc,691,691,677,545,691,691,691,691,691
ca,mb,127,127,124,98,127,127,127,127,127


In [47]:
# Converting a GroupBy object into a DataFrame
international_duration_df = pd.DataFrame(
    grouped_international_data["duration (seconds)"].sum())
international_duration_df.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,duration (seconds)
country,state,Unnamed: 2_level_1
au,al,900.0
au,dc,300.0
au,nt,360.0
au,oh,0.0
au,sa,300.0
au,wa,450.0
au,yt,30.0
ca,ab,212131.0
ca,bc,540608.8
ca,mb,554161.0
