In [15]:
import pandas as pd
ufo = pd.read_csv('data/ufo_sightings_large.csv')
ufo.head()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
0,11/3/2011 19:21,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111
1,10/3/2004 19:05,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556
2,9/25/2009 21:00,coon rapids,mn,us,cigar,0.0,,Green&#44 red&#44 and blue pulses of light tha...,12/12/2009,45.12,-93.2875
3,11/21/2002 05:45,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.0213889,-80.382222
4,8/19/2010 12:55,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,-114.083333


In [16]:
# Check the column types
print(ufo.dtypes)

# Change the type of seconds to float
ufo["seconds"] = ufo["seconds"].astype(float)

# Change the date column to type datetime
ufo["date"] = pd.to_datetime(ufo['date'])

# Check the column types
print(ufo[['seconds','date']].dtypes)

date               object
city               object
state              object
country            object
type               object
seconds           float64
length_of_time     object
desc               object
recorded           object
lat                object
long              float64
dtype: object
seconds           float64
date       datetime64[ns]
dtype: object


In [17]:
# Check how many values are missing in the length_of_time, state, and type columns
print(ufo[["length_of_time", "state", "type"]].isnull().sum())

# Keep only rows where length_of_time, state, and type are not null
ufo_no_missing = ufo[ufo["length_of_time"].notnull() & 
          ufo["state"].notnull() & 
          ufo["type"].notnull()]

# Print out the shape of the new dataset
print(ufo_no_missing.shape)

length_of_time    143
state             419
type              159
dtype: int64
(4283, 11)


In [18]:
ufo["date"] = pd.to_datetime(ufo["date"])

ufo["hours"] = ufo["date"].apply(lambda row: row.hour)
ufo["minutes"] = ufo["date"].apply(lambda row: row.hour)

import re

def return_minutes(time_string):

    # Use \d+ to grab digits
    pattern = re.compile(r"\d+")
    
    # Use match on the pattern and column
    num = re.match(pattern, time_string)
    if num is not None:
        return int(num.group(0))
        
# Apply the extraction to the length_of_time column
ufo["minutes"] = ufo["length_of_time"].apply(lambda row: return_minutes(row))

# Take a look at the head of both of the columns
print(ufo[['length_of_time','minutes']].head())

TypeError: expected string or bytes-like object

In [19]:
import numpy as np

# Check the variance of the seconds and minutes columns
print(ufo[["seconds", "minutes"]].var())

# Log normalize the seconds column
ufo["seconds_log"] = np.log(ufo[['seconds']])

# Print out the variance of just the seconds_log column
print(ufo["seconds_log"].var())

seconds    3.156735e+10
minutes    5.788400e+01
dtype: float64
nan


In [20]:
# Use Pandas to encode us values as 1 and others as 0
ufo["country_enc"] = ufo["country"].apply(lambda val: 1 if val == 'us' else 0)

# Print the number of unique type values
print(len(ufo["type"].unique()))

# Create a one-hot encoded set of the type values
type_set = pd.get_dummies(ufo["type"])

# Concatenate this set back to the ufo DataFrame
ufo = pd.concat([ufo, type_set], axis=1)

22


In [21]:
# Look at the first 5 rows of the date column
print(ufo['date'].head())

# Extract the month from the date column
ufo["month"] = ufo["date"].apply(
    lambda row: row.month
)

# Extract the year from the date column
ufo["year"] = ufo["date"].apply(
    lambda row: row.year
)

# Take a look at the head of all three columns
print(ufo[['date','month','year']].head())

0   2011-11-03 19:21:00
1   2004-10-03 19:05:00
2   2009-09-25 21:00:00
3   2002-11-21 05:45:00
4   2010-08-19 12:55:00
Name: date, dtype: datetime64[ns]
                 date  month  year
0 2011-11-03 19:21:00     11  2011
1 2004-10-03 19:05:00     10  2004
2 2009-09-25 21:00:00      9  2009
3 2002-11-21 05:45:00     11  2002
4 2010-08-19 12:55:00      8  2010


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Take a look at the head of the desc field
print(ufo["desc"].head())

# Create the tfidf vectorizer object
vec = TfidfVectorizer()

# Use vec's fit_transform method on the desc field
desc_tfidf = vec.fit_transform(ufo['desc'])

# Look at the number of columns this creates
print(desc_tfidf.shape)

0    Red blinking objects similar to airplanes or s...
1                 Many fighter jets flying towards UFO
2    Green&#44 red&#44 and blue pulses of light tha...
3    It was a large&#44 triangular shaped flying ob...
4       A white spinning disc in the shape of an oval.
Name: desc, dtype: object


ValueError: np.nan is an invalid document, expected byte or unicode string.

In [25]:
print(ufo[['seconds','seconds_log','minutes']].corr())

# Make a list of features to drop
to_drop = ['city', 'country', 'lat', 'long', 'state', 'date','recorded','desc','seconds','minutes','length_of_time']

# Drop those features
ufo_dropped = ufo.drop(to_drop, axis=1)

ufo_dropped.head()

              seconds  seconds_log   minutes
seconds      1.000000     0.164613 -0.022851
seconds_log  0.164613     1.000000 -0.016276
minutes     -0.022851    -0.016276  1.000000


Unnamed: 0,type,hours,seconds_log,country_enc,changing,chevron,cigar,circle,cone,cross,...,light,other,oval,rectangle,sphere,teardrop,triangle,unknown,month,year
0,unknown,19,14.0058,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,11,2011
1,circle,19,3.401197,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,10,2004
2,cigar,21,-inf,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,9,2009
3,triangle,5,5.703782,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,11,2002
4,oval,12,-inf,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,8,2010
