# UFO Research Outcome predictor

We need to build a model (XGBoost or LinearLearner) that predicts the `researchOutcome` column with at least 90% accuracy.

In [68]:
!pip install s3fs boto3



In [253]:
import io
import random
import urllib.request
from datetime import datetime

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sagemaker
import sagemaker.amazon.common as smac
import seaborn as sns
from sagemaker.amazon.amazon_estimator import get_image_uri
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [208]:
# Load the full UFO sightings dataset directly from S3 and preview the first rows.
RAW_DATA_URI = 's3://ufo-sighting-bucket/ufo_fullset.csv'
df = pd.read_csv(RAW_DATA_URI, low_memory=False)
df.head()

Unnamed: 0,reportedTimestamp,eventDate,eventTime,shape,duration,witnesses,weather,firstName,lastName,latitude,longitude,sighting,physicalEvidence,contact,researchOutcome
0,1977-04-04T04:02:23.340Z,1977-03-31,23:46,circle,4,1,rain,Ila,Bashirian,47.329444,-122.578889,Y,N,N,explained
1,1982-11-22T02:06:32.019Z,1982-11-15,22:04,disk,4,1,partly cloudy,Eriberto,Runolfsson,52.664913,-1.034894,Y,Y,N,explained
2,1992-12-07T19:06:52.482Z,1992-12-07,19:01,circle,49,1,clear,Miller,Watsica,38.951667,-92.333889,Y,N,N,explained
3,2011-02-24T21:06:34.898Z,2011-02-21,20:56,disk,13,1,partly cloudy,Clifton,Bechtelar,41.496944,-71.367778,Y,N,N,explained
4,1991-03-09T16:18:45.501Z,1991-03-09,11:42,circle,17,1,mostly cloudy,Jayda,Ebert,47.606389,-122.330833,Y,N,N,explained


In [209]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 15 columns):
reportedTimestamp    18000 non-null object
eventDate            18000 non-null object
eventTime            18000 non-null object
shape                17998 non-null object
duration             18000 non-null int64
witnesses            18000 non-null int64
weather              18000 non-null object
firstName            18000 non-null object
lastName             18000 non-null object
latitude             18000 non-null float64
longitude            18000 non-null float64
sighting             18000 non-null object
physicalEvidence     18000 non-null object
contact              18000 non-null object
researchOutcome      18000 non-null object
dtypes: float64(2), int64(2), object(11)
memory usage: 2.1+ MB


## Missing Values

Replace the missing `shape` values with the most frequent (mode) shape.

In [210]:
# Show every row that has at least one missing value.
rows_with_gaps = df.isna().any(axis='columns')
df.loc[rows_with_gaps]

Unnamed: 0,reportedTimestamp,eventDate,eventTime,shape,duration,witnesses,weather,firstName,lastName,latitude,longitude,sighting,physicalEvidence,contact,researchOutcome
1024,2011-03-23T18:32:20.473Z,2011-03-22,21:12,,3,1,rain,Deon,Feil,37.681944,-121.766944,Y,N,N,explained
2048,1998-04-23T18:47:16.029Z,1998-04-23,10:07,,40,2,partly cloudy,Vincenzo,Rohan,38.254167,-85.759444,Y,Y,N,explained


In [211]:
# Most frequent shape: value_counts() sorts by descending count, so index[0] is the mode.
df['shape'].value_counts().index[0]

'circle'

In [212]:
# Impute the missing shapes with the most frequent shape.
most_common_shape = df['shape'].value_counts().idxmax()
df['shape'] = df['shape'].fillna(most_common_shape)

## Column Cleaning

In [213]:
# Combine the separate date and time strings into a single "<date> <time>" column,
# then discard the two source columns.
date_part = df['eventDate'].astype(str)
time_part = df['eventTime'].astype(str)
df['eventDateTime'] = date_part + ' ' + time_part

drop_cols = ['eventDate', 'eventTime']
df.drop(columns=drop_cols, inplace=True)
df.head()

Unnamed: 0,reportedTimestamp,shape,duration,witnesses,weather,firstName,lastName,latitude,longitude,sighting,physicalEvidence,contact,researchOutcome,eventDateTime
0,1977-04-04T04:02:23.340Z,circle,4,1,rain,Ila,Bashirian,47.329444,-122.578889,Y,N,N,explained,1977-03-31 23:46
1,1982-11-22T02:06:32.019Z,disk,4,1,partly cloudy,Eriberto,Runolfsson,52.664913,-1.034894,Y,Y,N,explained,1982-11-15 22:04
2,1992-12-07T19:06:52.482Z,circle,49,1,clear,Miller,Watsica,38.951667,-92.333889,Y,N,N,explained,1992-12-07 19:01
3,2011-02-24T21:06:34.898Z,disk,13,1,partly cloudy,Clifton,Bechtelar,41.496944,-71.367778,Y,N,N,explained,2011-02-21 20:56
4,1991-03-09T16:18:45.501Z,circle,17,1,mostly cloudy,Jayda,Ebert,47.606389,-122.330833,Y,N,N,explained,1991-03-09 11:42


In [214]:
# Parse both timestamp columns from strings into pandas datetimes.
dt_cols = ['reportedTimestamp', 'eventDateTime']
df = df.assign(**{name: pd.to_datetime(df[name]) for name in dt_cols})
df.head()

Unnamed: 0,reportedTimestamp,shape,duration,witnesses,weather,firstName,lastName,latitude,longitude,sighting,physicalEvidence,contact,researchOutcome,eventDateTime
0,1977-04-04 04:02:23.340000+00:00,circle,4,1,rain,Ila,Bashirian,47.329444,-122.578889,Y,N,N,explained,1977-03-31 23:46:00
1,1982-11-22 02:06:32.019000+00:00,disk,4,1,partly cloudy,Eriberto,Runolfsson,52.664913,-1.034894,Y,Y,N,explained,1982-11-15 22:04:00
2,1992-12-07 19:06:52.482000+00:00,circle,49,1,clear,Miller,Watsica,38.951667,-92.333889,Y,N,N,explained,1992-12-07 19:01:00
3,2011-02-24 21:06:34.898000+00:00,disk,13,1,partly cloudy,Clifton,Bechtelar,41.496944,-71.367778,Y,N,N,explained,2011-02-21 20:56:00
4,1991-03-09 16:18:45.501000+00:00,circle,17,1,mostly cloudy,Jayda,Ebert,47.606389,-122.330833,Y,N,N,explained,1991-03-09 11:42:00


In [215]:
# Check how many distinct values `sighting` takes (a constant column carries no signal).
df['sighting'].value_counts()

Y    18000
Name: sighting, dtype: int64

In [216]:
# Remove columns that cannot help the model:
# - `sighting` is always 'Y', so it is constant
# - first/last names are irrelevant to the research outcome
drop_cols = ['sighting', 'firstName', 'lastName']
df.drop(columns=drop_cols, inplace=True)

df.head()

Unnamed: 0,reportedTimestamp,shape,duration,witnesses,weather,latitude,longitude,physicalEvidence,contact,researchOutcome,eventDateTime
0,1977-04-04 04:02:23.340000+00:00,circle,4,1,rain,47.329444,-122.578889,N,N,explained,1977-03-31 23:46:00
1,1982-11-22 02:06:32.019000+00:00,disk,4,1,partly cloudy,52.664913,-1.034894,Y,N,explained,1982-11-15 22:04:00
2,1992-12-07 19:06:52.482000+00:00,circle,49,1,clear,38.951667,-92.333889,N,N,explained,1992-12-07 19:01:00
3,2011-02-24 21:06:34.898000+00:00,disk,13,1,partly cloudy,41.496944,-71.367778,N,N,explained,2011-02-21 20:56:00
4,1991-03-09 16:18:45.501000+00:00,circle,17,1,mostly cloudy,47.606389,-122.330833,N,N,explained,1991-03-09 11:42:00


## LabelEncoder and One-Hot Encoding

In [217]:
# Open question: `weather` could arguably be treated as ordinal instead of nominal; it is one-hot encoded below.

In [218]:
# Encode the Y/N flag columns as integers: 'Y' -> 1, anything else -> 0.
y_n_cols = ['physicalEvidence', 'contact']
for flag_col in y_n_cols:
    df[flag_col] = df[flag_col].eq('Y').astype('int64')
df.head()

Unnamed: 0,reportedTimestamp,shape,duration,witnesses,weather,latitude,longitude,physicalEvidence,contact,researchOutcome,eventDateTime
0,1977-04-04 04:02:23.340000+00:00,circle,4,1,rain,47.329444,-122.578889,0,0,explained,1977-03-31 23:46:00
1,1982-11-22 02:06:32.019000+00:00,disk,4,1,partly cloudy,52.664913,-1.034894,1,0,explained,1982-11-15 22:04:00
2,1992-12-07 19:06:52.482000+00:00,circle,49,1,clear,38.951667,-92.333889,0,0,explained,1992-12-07 19:01:00
3,2011-02-24 21:06:34.898000+00:00,disk,13,1,partly cloudy,41.496944,-71.367778,0,0,explained,2011-02-21 20:56:00
4,1991-03-09 16:18:45.501000+00:00,circle,17,1,mostly cloudy,47.606389,-122.330833,0,0,explained,1991-03-09 11:42:00


In [219]:
# Inspect how many sightings fall into each outcome class (the prediction target).
df['researchOutcome'].value_counts()

explained      12822
unexplained     3308
probable        1870
Name: researchOutcome, dtype: int64

In [220]:
# Integer class ids for the three research outcomes.
target_mapper = dict(explained=0, unexplained=1, probable=2)

# Index the dict directly so an unexpected label still raises a KeyError.
df['researchOutcome'] = [target_mapper[outcome] for outcome in df['researchOutcome']]

In [221]:
# One-hot encode the two categorical features and swap the dummy columns in
# for the raw text columns (aligned on the row index).
categorical_cols = ['shape', 'weather']
one_hot = pd.get_dummies(df[categorical_cols])
df = df.drop(columns=categorical_cols).join(one_hot)
df.head()

Unnamed: 0,reportedTimestamp,duration,witnesses,latitude,longitude,physicalEvidence,contact,researchOutcome,eventDateTime,shape_box,...,shape_sphere,shape_square,shape_triangle,weather_clear,weather_fog,weather_mostly cloudy,weather_partly cloudy,weather_rain,weather_snow,weather_stormy
0,1977-04-04 04:02:23.340000+00:00,4,1,47.329444,-122.578889,0,0,0,1977-03-31 23:46:00,0,...,0,0,0,0,0,0,0,1,0,0
1,1982-11-22 02:06:32.019000+00:00,4,1,52.664913,-1.034894,1,0,0,1982-11-15 22:04:00,0,...,0,0,0,0,0,0,1,0,0,0
2,1992-12-07 19:06:52.482000+00:00,49,1,38.951667,-92.333889,0,0,0,1992-12-07 19:01:00,0,...,0,0,0,1,0,0,0,0,0,0
3,2011-02-24 21:06:34.898000+00:00,13,1,41.496944,-71.367778,0,0,0,2011-02-21 20:56:00,0,...,0,0,0,0,0,0,1,0,0,0
4,1991-03-09 16:18:45.501000+00:00,17,1,47.606389,-122.330833,0,0,0,1991-03-09 11:42:00,0,...,0,0,0,0,0,1,0,0,0,0


In [222]:
# The two datetime columns are not used as model features, so remove them.
df = df.drop(columns=['reportedTimestamp', 'eventDateTime'])
df.head()

Unnamed: 0,duration,witnesses,latitude,longitude,physicalEvidence,contact,researchOutcome,shape_box,shape_circle,shape_disk,...,shape_sphere,shape_square,shape_triangle,weather_clear,weather_fog,weather_mostly cloudy,weather_partly cloudy,weather_rain,weather_snow,weather_stormy
0,4,1,47.329444,-122.578889,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,4,1,52.664913,-1.034894,1,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,49,1,38.951667,-92.333889,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
3,13,1,41.496944,-71.367778,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,17,1,47.606389,-122.330833,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [223]:
# Confirm that every remaining column is numeric and has no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 23 columns):
duration                 18000 non-null int64
witnesses                18000 non-null int64
latitude                 18000 non-null float64
longitude                18000 non-null float64
physicalEvidence         18000 non-null int64
contact                  18000 non-null int64
researchOutcome          18000 non-null int64
shape_box                18000 non-null uint8
shape_circle             18000 non-null uint8
shape_disk               18000 non-null uint8
shape_light              18000 non-null uint8
shape_oval               18000 non-null uint8
shape_pyramid            18000 non-null uint8
shape_sphere             18000 non-null uint8
shape_square             18000 non-null uint8
shape_triangle           18000 non-null uint8
weather_clear            18000 non-null uint8
weather_fog              18000 non-null uint8
weather_mostly cloudy    18000 non-null uint8
weather_partl

## Data Splitting

In [224]:
# SageMaker / S3 settings shared by the training jobs below.
bucket = 'ufo-sighting-bucket'
prefix = 'model_output'
region = boto3.Session().region_name
role = sagemaker.get_execution_role()

bucket_path = 's3://{}/{}'.format(bucket, prefix)
print('model output will be stored here: {}'.format(bucket_path))

model output will be stored here: s3://ufo-sighting-bucket/model_output


In [225]:
# List the current column order before moving the label to the front.
df.columns

Index(['duration', 'witnesses', 'latitude', 'longitude', 'physicalEvidence',
       'contact', 'researchOutcome', 'shape_box', 'shape_circle', 'shape_disk',
       'shape_light', 'shape_oval', 'shape_pyramid', 'shape_sphere',
       'shape_square', 'shape_triangle', 'weather_clear', 'weather_fog',
       'weather_mostly cloudy', 'weather_partly cloudy', 'weather_rain',
       'weather_snow', 'weather_stormy'],
      dtype='object')

In [229]:
# Move the label to the first column and keep the features in their existing order;
# the XGBoost training log reports the CSV being read with label_column=0.
label_col = 'researchOutcome'
feature_cols = [name for name in df.columns if name != label_col]
df = df[[label_col] + feature_cols]

In [230]:
# Shuffle once, then carve out disjoint 80/10/10 train/test/validation splits.
SPLIT_SEED = 42  # fixed seed so the shuffle and the splits are reproducible on re-run

df = df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

train_samples = int(len(df) * 0.8)
test_samples = int(len(df) * 0.1)
val_samples = len(df) - train_samples - test_samples
print(f'''
total_samples: {len(df)}
train_samples: {train_samples}
test_samples: {test_samples}
val_samples: {val_samples}
''')

# Remove already-assigned rows by index label. Selecting with a boolean DataFrame
# such as `df[~df.isin(train)]` would only blank the matching cells to NaN and keep
# the rows, so a later .sample() could still draw (now all-NaN) training rows.
train = df.sample(n=train_samples, random_state=SPLIT_SEED)
holdout = df.drop(index=train.index)
test = holdout.sample(n=test_samples, random_state=SPLIT_SEED)
val = holdout.drop(index=test.index)

# Each split has the planned number of rows.
assert len(train) == train_samples
assert len(test) == test_samples
assert len(val) == val_samples

# The splits are pairwise disjoint and together cover every row exactly once.
assert train.index.intersection(test.index).empty
assert train.index.intersection(val.index).empty
assert test.index.intersection(val.index).empty
assert len(train) + len(test) + len(val) == len(df)

# No split contains blanked-out (NaN) values.
assert not train.isna().any().any()
assert not test.isna().any().any()
assert not val.isna().any().any()


total_samples: 18000
train_samples: 14400
test_samples: 1800
val_samples: 1800



In [231]:
# Preview the training split (the rendered output is truncated for large frames).
train

Unnamed: 0,researchOutcome,duration,witnesses,latitude,longitude,physicalEvidence,contact,shape_box,shape_circle,shape_disk,...,shape_sphere,shape_square,shape_triangle,weather_clear,weather_fog,weather_mostly cloudy,weather_partly cloudy,weather_rain,weather_snow,weather_stormy
8435,0,37,2,34.147778,-118.143611,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4383,0,9,1,44.636667,-123.104722,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
11519,1,73,8,32.783333,-96.800000,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
9343,0,66,10,39.776389,-74.862778,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
6231,1,4,4,40.028652,-76.517657,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15004,1,19,4,33.808056,-84.170278,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
345,1,52,8,53.970571,-111.689885,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7259,0,72,3,41.701389,-71.155556,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3269,0,79,2,42.069444,-72.615278,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0


In [239]:
# Write each split to a local CSV with no header row and no index column.
for split_name, split_frame in (('train', train), ('test', test), ('val', val)):
    split_frame.to_csv(f'./{split_name}.csv', index=False, header=False)

In [240]:
# Low-level S3 client, reused below for every upload to the bucket.
s3_client = boto3.client('s3')

In [241]:
# Upload the three local CSV splits under the bucket's data/ prefix.
for csv_name in ('train.csv', 'test.csv', 'val.csv'):
    s3_client.upload_file(f'./{csv_name}', bucket, f'data/{csv_name}')

## XGBoosting

In [242]:
# Resolve the built-in XGBoost training image for the notebook's region.
# `get_image_uri` comes from the import cell at the top of the notebook, so the
# LinearLearner section (which also calls it) no longer depends on this cell.
# NOTE(review): the SDK message printed under this cell suggests
# get_image_uri(region, 'xgboost', '0.90-1') — consider pinning the image version.
container = get_image_uri(region, 'xgboost')

# CSV input channels pointing at the splits uploaded to S3 above.
s3_input_train = sagemaker.s3_input(s3_data=f's3://{bucket}/data/train.csv', content_type='csv')
s3_input_test = sagemaker.s3_input(s3_data=f's3://{bucket}/data/test.csv', content_type='csv')
s3_input_val = sagemaker.s3_input(s3_data=f's3://{bucket}/data/val.csv', content_type='csv')

	get_image_uri(region, 'xgboost', '0.90-1').


In [243]:
# Timestamp the job name so each training run gets a distinct name.
launch_stamp = datetime.now().strftime('%Y%m%d%H%M%S')
job_name = f'ufo-xgboost-job-{launch_stamp}'
print('Job Name: {}'.format(job_name))

s3_output = 's3://{}/model_output'.format(bucket)

Job Name: ufo-xgboost-job-20200327184446


In [244]:
# Configure a single-instance training job that runs the XGBoost container.
sess = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=s3_output,
    sagemaker_session=sess,
)

# Three-class softmax objective, boosted for 50 rounds.
xgb_hyperparameters = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'num_round': 50,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

# Train on the training split and evaluate on the validation split.
data_channels = dict(train=s3_input_train, validation=s3_input_val)

xgb.fit(data_channels, job_name=job_name)

2020-03-27 18:44:48 Starting - Starting the training job...
2020-03-27 18:44:49 Starting - Launching requested ML instances.........
2020-03-27 18:46:19 Starting - Preparing the instances for training...
2020-03-27 18:47:12 Downloading - Downloading input data...
2020-03-27 18:47:49 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2020-03-27:18:47:50:INFO] Running standalone xgboost training.[0m
[34m[2020-03-27:18:47:50:INFO] File size need to be processed in the node: 1.03mb. Available memory size in the node: 8522.8mb[0m
[34m[2020-03-27:18:47:50:INFO] Determined delimiter of CSV input is ','[0m
[34m[18:47:50] S3DistributionType set as FullyReplicated[0m
[34m[18:47:50] 14400x22 matrix with 316800 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-03-27:18:47:50:INFO] Determined delimiter of CSV input is ','[0m
[34m[18:47:50] S3DistributionType set as FullyReplicated[0m
[34m[18

## LinearLearner

In [250]:
# Separate the label from the features, then hold out 20% of the rows.
y = df['researchOutcome']
X = df.drop(columns=['researchOutcome'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [251]:
# Split the 20% hold-out in half: the first half stays as the test set and the
# second half becomes the validation set.
test_samples = len(X_test)
halfway_point = test_samples // 2

# Slice features and labels by position so they stay aligned and the halves are
# disjoint. Masks built with isin() are avoided on purpose:
# - `X_test[~X_test.isin(X_val)]` keeps the validation rows and only sets them to NaN
# - `y_test[~y_test.isin(y_val)]` compares label *values*, so it removes every test
#   label whose class also appears in the validation labels
X_test, X_val = X_test.iloc[:halfway_point], X_test.iloc[halfway_point:]
y_test, y_val = y_test.iloc[:halfway_point], y_test.iloc[halfway_point:]

assert len(X_test) == len(y_test) == halfway_point
assert len(X_val) == len(y_val) == test_samples - halfway_point
assert X_test.index.equals(y_test.index) and X_val.index.equals(y_val.index)

In [252]:
# Convert every feature and label split to a float32 NumPy array.
X_train, X_test, X_val = (
    part.values.astype('float32') for part in (X_train, X_test, X_val)
)
y_train, y_test, y_val = (
    part.values.astype('float32') for part in (y_train, y_test, y_val)
)

In [254]:
def _upload_as_recordio(feature_array, label_array, object_key):
    """Serialise features/labels as a dense-tensor record stream and upload it to S3.

    The arrays are written to an in-memory buffer with
    `smac.write_numpy_to_dense_tensor`, the buffer is rewound, and its contents are
    uploaded to `bucket` under `object_key`. Returns the s3:// URI of the object.
    """
    buffer = io.BytesIO()
    smac.write_numpy_to_dense_tensor(buffer, feature_array, label_array)
    buffer.seek(0)  # rewind so the upload starts at the first byte
    s3_client.upload_fileobj(buffer, bucket, object_key)
    return f's3://{bucket}/{object_key}'


train_file = 'ufo_sighting_train_recordIO_protobuf.data'
val_file = 'ufo_sighting_val_recordIO_protobuf.data'

train_recordIO_protobuf_location = _upload_as_recordio(X_train, y_train, f'data/{train_file}')
val_recordIO_protobuf_location = _upload_as_recordio(X_val, y_val, f'data/{val_file}')

In [255]:
# Point `container` at the built-in Linear Learner image for this region; this
# replaces the XGBoost image held in the same variable.
# NOTE(review): the third argument '1' is presumably the image version tag — confirm.
container = get_image_uri(boto3.Session().region_name, 'linear-learner', '1')

In [260]:
# Timestamp the job name so each training run gets a distinct name.
launch_stamp = datetime.now().strftime('%Y%m%d%H%M%S')
job_name = f'ufo-linearlearner-job-{launch_stamp}'
print('Job Name: {}'.format(job_name))

s3_output = 's3://{}/linear_model_output'.format(bucket)

Job Name: ufo-linearlearner-job-20200327190345


In [261]:
# Number of input features: every column of `df` except the label.
features = df.shape[1] - 1

sess = sagemaker.Session()

# Single-instance training job for the Linear Learner container, using Pipe input mode.
linear = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.c4.xlarge',
    output_path=s3_output,
    sagemaker_session=sess,
    input_mode='Pipe',
)

linear_hyperparameters = {
    'feature_dim': features,
    'predictor_type': 'multiclass_classifier',
    'num_classes': 3,
}
linear.set_hyperparameters(**linear_hyperparameters)

# Train on the uploaded training records and evaluate on the validation records.
data_channels = dict(
    train=train_recordIO_protobuf_location,
    validation=val_recordIO_protobuf_location,
)
linear.fit(data_channels, job_name=job_name)

2020-03-27 19:03:45 Starting - Starting the training job...
2020-03-27 19:03:46 Starting - Launching requested ML instances......
2020-03-27 19:05:13 Starting - Preparing the instances for training.........
2020-03-27 19:06:40 Downloading - Downloading input data
2020-03-27 19:06:40 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[03/27/2020 19:06:56 INFO 139900381083456] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_step': u'auto', u'init_me