## Import modules

In [None]:
import pandas as pd

## Read in and Inspect Data

In [None]:
# Load the full-moon calendar CSV and preview its first five rows.
moon_df = pd.read_csv(filepath_or_buffer='full-moon-calendar-1900-2050/full_moon.csv')
moon_df.head(5)

In [None]:
# Load the first births file (CDC/NCHS, 1994-2003 per its filename) and preview it.
births1_df = pd.read_csv(
    filepath_or_buffer='fivethirtyeight-births-dataset/US_births_1994-2003_CDC_NCHS.csv'
)
births1_df.head(5)

In [None]:
# Load the second births file (SSA, 2000-2014 per its filename) and preview it.
births2_df = pd.read_csv(
    filepath_or_buffer='fivethirtyeight-births-dataset/US_births_2000-2014_SSA.csv'
)
births2_df.head(5)

## Combining birth datasets

In [None]:
# Stack the two births frames row-wise into a single dataframe.
# ignore_index=True discards the per-file indexes and assigns a fresh 0..n-1 index.
births_df = pd.concat(
    objs=[births1_df, births2_df],
    axis=0,
    ignore_index=True,
)

In [None]:
# (rows, columns) of the combined frame, before the overlapping dates are de-duplicated.
births_df.shape

In [None]:
# The two sources overlap for 2000-2003, so the same calendar date can appear twice.
# Keep one row per (year, month, date_of_month); keep='last' retains the row that
# appears later in the combined frame, i.e. the one from the second file concatenated.
date_key = ['year', 'month', 'date_of_month']
births_df = births_df.drop_duplicates(subset=date_key, keep='last')

In [None]:
# (rows, columns) after de-duplication; compare with the earlier shape to see how many
# overlapping rows were removed.
births_df.shape

## Preparing full moon dataset to merge with births

- The full moon dataset stores each date as a single string; split it into separate day, month, and year columns so it can be merged with the births data more easily.

In [35]:
# Inspect the exact column labels (including any stray whitespace read from the CSV header).
moon_df.columns

Index(['Day', ' Date', ' Time'], dtype='object')

In [38]:
# The CSV header has padding around some names (' Date', ' Time'), so the labels
# carry a leading space. Strip whitespace from every column label instead of
# hard-coding the two known offenders: this yields the same 'Day', 'Date', 'Time'
# labels, and keeps working if the padding differs or more columns are affected.
# Reassigning rather than using inplace=True keeps the cell safe to re-run.
# NOTE(review): the cell values may carry the same leading space as the headers - verify.
moon_df = moon_df.rename(columns=str.strip)

In [39]:
# Pull the 'Date' column out of moon_df as a Series (values look like '15 January 1900').
moon_date = moon_df['Date']
moon_date.head(5)

0      15 January 1900
1     14 February 1900
2        16 March 1900
3        15 April 1900
4          14 May 1900
Name: Date, dtype: object

In [None]:
# Break each date string apart on whitespace; expand=True returns a dataframe with one
# column per token (day, month name, year for values such as '15 January 1900').
# NOTE(review): the resulting parts are strings - confirm the dtypes match the births
# columns before merging on them.
moon_date_df = moon_date.str.split(pat=None, expand=True)
moon_date_df.head(5)

In [40]:
# Give the positional columns produced by the split the same names the births data
# uses for its date parts.
split_column_names = {
    0: 'date_of_month',
    1: 'month',
    2: 'year',
}
moon_date_df = moon_date_df.rename(columns=split_column_names)

In [41]:
# Attach the split date parts to the original moon data. Both frames share the same
# row index, so the rows are aligned by index rather than by a key column.
dates_moon_df = pd.merge(
    moon_df,
    moon_date_df,
    how='inner',
    left_index=True,
    right_index=True,
)
dates_moon_df.head(5)

Unnamed: 0,Day,Date,Time,date_of_month,month,year
0,Monday,15 January 1900,08:07:30 pm,15,January,1900
1,Wednesday,14 February 1900,02:50:12 pm,14,February,1900
2,Friday,16 March 1900,09:11:48 am,16,March,1900
3,Sunday,15 April 1900,02:02:06 am,15,April,1900
4,Monday,14 May 1900,04:36:36 pm,14,May,1900


In [42]:
# Drop the original combined 'Date' column now that it has been split into
# date_of_month / month / year.
# Reassigning (instead of inplace=True) together with errors='ignore' makes this cell
# safe to re-run: a second execution is a no-op rather than a KeyError for the
# already-removed column.
dates_moon_df = dates_moon_df.drop(columns=['Date'], errors='ignore')
dates_moon_df.head()

Unnamed: 0,Day,Time,date_of_month,month,year
0,Monday,08:07:30 pm,15,January,1900
1,Wednesday,02:50:12 pm,14,February,1900
2,Friday,09:11:48 am,16,March,1900
3,Sunday,02:02:06 am,15,April,1900
4,Monday,04:36:36 pm,14,May,1900


## Preparing birth data for merge
 
- For later use in Tableau, which recognizes date data, the births `month` column should hold the name of the month rather than its number.