## Import modules

In [None]:
import pandas as pd

## Read in and Inspect Data

In [None]:
# Load the full-moon calendar CSV into a dataframe and preview its first rows.
moon_df = pd.read_csv(filepath_or_buffer='full-moon-calendar-1900-2050/full_moon.csv')
moon_df.head(n=5)

In [None]:
# Load the 1994-2003 CDC/NCHS births CSV and preview its first rows.
births1_df = pd.read_csv(
    filepath_or_buffer='fivethirtyeight-births-dataset/US_births_1994-2003_CDC_NCHS.csv'
)
births1_df.head(n=5)

In [None]:
# Load the 2000-2014 SSA births CSV and preview its first rows.
births2_df = pd.read_csv(
    filepath_or_buffer='fivethirtyeight-births-dataset/US_births_2000-2014_SSA.csv'
)
births2_df.head(n=5)

## Clean Data
- Check each dataframe for null values


In [None]:
# Show every row of births1_df holding at least one missing value (an empty result means no nulls).
births1_df.loc[births1_df.isnull().any(axis='columns')]

In [None]:
# Show every row of births2_df holding at least one missing value (an empty result means no nulls).
births2_df.loc[births2_df.isnull().any(axis='columns')]

In [None]:
# Show every row of moon_df holding at least one missing value (an empty result means no nulls).
moon_df.loc[moon_df.isnull().any(axis='columns')]

- Check for / eliminate extra whitespace

In [None]:
# Confirm data in births datasets are type int, meaning no whitespace should be present
# Displays the dtype of each births1_df column so that assumption can be checked by eye.

births1_df.dtypes

In [None]:
# Display the dtype of each births2_df column to check whether they are all integer
# (integer columns would need no whitespace stripping).
births2_df.dtypes

In [None]:
# The moon dataframe has strings, so str.strip can be applied.
# str.strip returns a new Series and does not modify moon_df, so the result
# must be assigned back for the whitespace to actually be removed.

moon_df['Day'] = moon_df['Day'].str.strip()
moon_df['Day']

In [None]:
# For date and time columns, we only want to remove any leading or trailing whitespace
# (internal spaces separate the day, month and year, so they must stay).
# str.lstrip returns a new Series; assign it back so the cleanup persists in moon_df.

moon_df['Date'] = moon_df['Date'].str.lstrip()
moon_df['Date']

In [None]:
# Remove trailing whitespace from the Date strings.
# str.rstrip returns a new Series; assign it back so the cleanup persists in moon_df.
moon_df['Date'] = moon_df['Date'].str.rstrip()
moon_df['Date']

In [None]:
# Remove leading whitespace from the Time strings.
# str.lstrip returns a new Series; assign it back so the cleanup persists in moon_df.
moon_df['Time'] = moon_df['Time'].str.lstrip()
moon_df['Time']

In [None]:
# Remove trailing whitespace from the Time strings.
# str.rstrip returns a new Series; assign it back so the cleanup persists in moon_df.
moon_df['Time'] = moon_df['Time'].str.rstrip()
moon_df['Time']

## Combining birth datasets

In [42]:
# Stack the two births tables row-wise into a single dataframe;
# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
births_df = pd.concat(objs=[births1_df, births2_df], axis=0, ignore_index=True)

In [43]:
# (rows, columns) of the concatenated births dataframe
births_df.shape

(9131, 5)

In [44]:
# The two sources overlap for 2000-2003, so the same calendar date appears twice.
# De-duplicate on (year, month, date_of_month); keep='last' retains the later row,
# i.e. the one that came from births2_df, which was concatenated second.

births_df = births_df.drop_duplicates(
    subset=['year', 'month', 'date_of_month'],
    keep='last',
)

In [45]:
# Verify new number of rows
# (the row count should be lower than before the duplicate dates were dropped)

births_df.shape

(7670, 5)

## Preparing full moon dataset to merge with births

- The full moon dataset needs separate `date_of_month`, `month`, and `year` columns so that it can be merged with the births data more easily.

In [46]:
# List the column labels currently present in moon_df
moon_df.columns

Index(['Day', 'Date', 'Time'], dtype='object')

In [47]:
# Pull the 'Date' column out of moon_df as a standalone Series and preview it

moon_date = moon_df['Date']
moon_date.head(n=5)

0      15 January 1900
1     14 February 1900
2        16 March 1900
3        15 April 1900
4          14 May 1900
Name: Date, dtype: object

In [48]:
# Break each date string apart on whitespace; expand=True spreads the pieces
# (day, month name, year) across separate columns labelled 0, 1, 2.
# NOTE(review): the pieces remain strings - confirm they are cast to int before
# being matched against the births columns.

moon_date_df = moon_date.str.split(pat=None, expand=True)
moon_date_df.head(n=5)

Unnamed: 0,0,1,2
0,15,January,1900
1,14,February,1900
2,16,March,1900
3,15,April,1900
4,14,May,1900


In [49]:
# Give the positional columns 0, 1, 2 the same names the births data uses

moon_date_df = moon_date_df.rename(
    columns=dict(enumerate(['date_of_month', 'month', 'year']))
)

In [50]:
# Attach the split date parts to the original moon rows by aligning on the row index

dates_moon_df = pd.merge(moon_df, moon_date_df, left_index=True, right_index=True)
dates_moon_df.head(n=5)

Unnamed: 0,Day,Date,Time,date_of_month,month,year
0,Monday,15 January 1900,08:07:30 pm,15,January,1900
1,Wednesday,14 February 1900,02:50:12 pm,14,February,1900
2,Friday,16 March 1900,09:11:48 am,16,March,1900
3,Sunday,15 April 1900,02:02:06 am,15,April,1900
4,Monday,14 May 1900,04:36:36 pm,14,May,1900


## Preparing birth data for merge
 
- For later use in Tableau, which recognizes date fields, the births data should have a month column containing the month name rather than the month number.