# Create the processed dataset

**_Objective:_** Create the dataset to be used to build the Tableau Public dashboard.

Initially I was going to build out a more indepth dashbboard, but I decided to keep it simple so I am pretty much going to drop everything not needed for a map and a few other plots. 

## Required libraries

In [1]:
from datetime import datetime
import os
import pandas as pd

## Bring the data in

In [2]:
os.chdir("../data/interim/")

In [3]:
dat = pd.read_csv("interim_with_county_info_ufo_20230425_0727.csv")

In [4]:
# Remove observations not having a county recorded
dat = dat[dat["county"].notna()]

In [5]:
dat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99669 entries, 0 to 103097
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   state                         99669 non-null  object 
 1   date_time                     99669 non-null  object 
 2   shape                         99669 non-null  object 
 3   text                          99669 non-null  object 
 4   city_latitude                 99669 non-null  float64
 5   city_longitude                99669 non-null  float64
 6   date                          99669 non-null  object 
 7   day_of_week                   99669 non-null  object 
 8   weekend_flag                  99669 non-null  object 
 9   zodiac_sign_of_sighting_date  99669 non-null  object 
 10  state.1                       99669 non-null  object 
 11  county                        99669 non-null  object 
dtypes: float64(2), object(10)
memory usage: 9.9+ MB


## Drop fields no longer needed

In [6]:
dat = dat.drop(["date_time",
                "shape",
                "text",
                "city_latitude",
                "city_longitude",
                "day_of_week",
                "weekend_flag", 
                "state"], axis=1)

In [7]:
dat = dat.applymap(lambda s: s.title() if type(s) == str else s)

In [8]:
dat.rename(columns={"state.1": "state"})

Unnamed: 0,date,zodiac_sign_of_sighting_date,state,county
0,2019-06-23,Cancer,Washington,Snohomish
1,2019-06-23,Cancer,Connecticut,New Haven
2,2019-06-20,Gemini,Virginia,Albemarle
3,2019-06-21,Cancer,Michigan,Wayne
4,2019-07-06,Cancer,California,Riverside
...,...,...,...,...
103093,2021-12-15,Sagittarius,New York,Onondaga
103094,2021-12-19,Sagittarius,Connecticut,New Haven
103095,2022-03-09,Pisces,Missouri,St Louis City
103096,1995-07-21,Cancer,Washington,Spokane


## Write the data frame to file

In [9]:
os.chdir("../processed/")

In [10]:
file_name_root = "processed_ufo_data_"
date_time_str = datetime.today().strftime("%Y-%m-%d %H:%M")
date_str = date_time_str.split()[0].replace("-", "") + "_"
time_str = date_time_str.split()[1].replace(":", "")
file_ext = '.csv'
del date_time_str
file_name = file_name_root + date_str + time_str + file_ext
del file_name_root
del date_str
del time_str
del file_ext

In [11]:
print(file_name)

processed_ufo_data_20230429_2012.csv


In [12]:
dat.to_csv(file_name, sep=",", index=False)
del file_name

In [13]:
dat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99669 entries, 0 to 103097
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   date                          99669 non-null  object
 1   zodiac_sign_of_sighting_date  99669 non-null  object
 2   state.1                       99669 non-null  object
 3   county                        99669 non-null  object
dtypes: object(4)
memory usage: 3.8+ MB
