## Import modules

In [None]:
import pandas as pd
from datetime import datetime

## Read in and Inspect Data

In [None]:
# Load the three raw CSV files: the full-moon calendar and the two US births tables
moon_df, births1_df, births2_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'full-moon-calendar-1900-2050/full_moon.csv',
        'fivethirtyeight-births-dataset/US_births_1994-2003_CDC_NCHS.csv',
        'fivethirtyeight-births-dataset/US_births_2000-2014_SSA.csv',
    )
)

# Preview the first rows of each frame under a labelled heading
preview_parts = [
    'moon_df:\n', moon_df.head(),
    '\n\nbirths1_df:\n', births1_df.head(),
    '\n\nbirths2_df:\n', births2_df.head(),
]
print(*preview_parts)

## Clean Data

### Check dataframes for null values


In [None]:
def _null_rows(frame):
    """Return only the rows of `frame` that contain at least one missing value."""
    has_null = frame.isna().any(axis=1)
    return frame[has_null]


# An empty frame under a heading means that dataset has no missing values
print(
    'births1_df nulls:\n', _null_rows(births1_df),
    '\n\nbirths2_df nulls:\n', _null_rows(births2_df),
    '\n\nmoon_df nulls:\n', _null_rows(moon_df)
)

- No null values found

### Check data types and eliminate any extra whitespace

In [None]:
# Show the column dtypes of each frame under a labelled heading

dtype_report = [
    'births1_df dtypes:\n', births1_df.dtypes,
    '\n\nbirths2_df dtypes:\n', births2_df.dtypes,
    '\n\nmoon_df dtypes:\n', moon_df.dtypes,
]
print(*dtype_report)

In [None]:
# The column names look like they carry some extra whitespace - list them for each frame to check

column_report = [
    'births1_df:\n', births1_df.columns,
    '\n\nbirths2_df:\n', births2_df.columns,
    '\n\nmoon_df:\n', moon_df.columns,
]
print(*column_report)

In [None]:
# The moon_df column names carry leading whitespace - remove it from every name, then show the result

cleaned_names = [name.lstrip() for name in moon_df.columns]
moon_df.columns = cleaned_names

moon_df.columns

In [None]:
# Remove any leading and trailing whitespace from the Day column in moon_df.
# Series.str.strip() returns a new Series rather than modifying moon_df, so the result
# must be assigned back; otherwise the column is left unchanged. Day is used as a merge
# key further down, where stray whitespace would stop rows from matching.

moon_df['Day'] = moon_df['Day'].str.strip()

# Display the cleaned column to verify
moon_df['Day']

- Everything looks as expected

### Correct dates to YYYY-MM-DD format in all 3 datasets

In [None]:
# moon_df first: parse the Date column into datetimes, then print the dtypes and a preview
# to confirm the conversion worked

parsed_moon_dates = pd.to_datetime(moon_df['Date'])
moon_df['Date'] = parsed_moon_dates

print(moon_df.dtypes, moon_df.head(), sep='\n\n')

In [None]:
# Now the births frames: join year / month / date_of_month into a 'Y-M-D' string per row
# and parse it into a datetime 'Date' column on each frame

for births_frame in (births1_df, births2_df):
    date_text = (
        births_frame['year'].astype(str)
        + '-' + births_frame['month'].astype(str)
        + '-' + births_frame['date_of_month'].astype(str)
    )
    births_frame['Date'] = pd.to_datetime(date_text)

# Check the first frame: a few rows plus the dtypes
print(births1_df.head(3), births1_df.dtypes, sep='\n\n')

In [None]:
# Same check for the second births frame: first three rows, then the column dtypes
births2_preview = births2_df.head(3)
print(births2_preview, births2_df.dtypes, sep='\n\n')

### Combine birth datasets

In [42]:
# Stack the two births frames (births1_df rows first, then births2_df rows) into a single
# dataframe with a fresh 0..n-1 index

frames_to_stack = [births1_df, births2_df]
births_df = pd.concat(frames_to_stack, ignore_index=True)
births_df

Unnamed: 0,year,month,date_of_month,day_of_week,births,Date
0,1994,1,1,6,8096,1994-01-01
1,1994,1,2,7,7772,1994-01-02
2,1994,1,3,1,10142,1994-01-03
3,1994,1,4,2,11248,1994-01-04
4,1994,1,5,3,11053,1994-01-05
...,...,...,...,...,...,...
9126,2014,12,27,6,8656,2014-12-27
9127,2014,12,28,7,7724,2014-12-28
9128,2014,12,29,1,12811,2014-12-29
9129,2014,12,30,2,13634,2014-12-30


- Drop duplicate dates since the datasets overlap from 2000-2003

In [43]:
# Record the (rows, columns) count before de-duplicating, as a baseline to compare against afterwards

births_df.shape

(9131, 6)

In [44]:
# Keep one row per Date. keep='last' retains the later row of each duplicate pair, and
# births2_df was concatenated after births1_df, so the births2_df row wins on overlapping dates.

births_df = births_df.drop_duplicates(subset=['Date'], keep='last')

# Verify that rows have been dropped
births_df.shape

(7670, 6)

### Replace day of week numbers with names in birth dataset

- We may want to look at births on different days of the week as part of our analysis

In [45]:
# The kaggle documentation numbers the weekdays from 1 (Monday) through 7 (Sunday),
# so enumerate the names in that order starting at 1

day_names_dict = dict(enumerate(
    [
        'Monday',
        'Tuesday',
        'Wednesday',
        'Thursday',
        'Friday',
        'Saturday',
        'Sunday',
    ],
    start=1,
))

In [54]:
# Pull the numeric day_of_week column out of births_df as its own Series

day_numbers = births_df.loc[:, 'day_of_week']

In [55]:
# Translate each weekday number into its name by looking it up in day_names_dict via Series.map()

day_names = day_numbers.map(day_names_dict)

In [56]:
# Attach the weekday names as a 'Day' column - the same column name moon_df uses

births_df = births_df.assign(Day=day_names)
births_df

Unnamed: 0,year,month,date_of_month,day_of_week,births,Date,Day
0,1994,1,1,6,8096,1994-01-01,Saturday
1,1994,1,2,7,7772,1994-01-02,Sunday
2,1994,1,3,1,10142,1994-01-03,Monday
3,1994,1,4,2,11248,1994-01-04,Tuesday
4,1994,1,5,3,11053,1994-01-05,Wednesday
...,...,...,...,...,...,...,...
9126,2014,12,27,6,8656,2014-12-27,Saturday
9127,2014,12,28,7,7724,2014-12-28,Sunday
9128,2014,12,29,1,12811,2014-12-29,Monday
9129,2014,12,30,2,13634,2014-12-30,Tuesday


## Create final dataset
### Merge birth and moon datasets

In [52]:
# Left-join moon_df onto births_df, matching on both Date and Day: every births_df row is
# kept, and the moon_df columns are filled only where a moon_df row matches

merge_keys = ['Date', 'Day']
babymoons_df = pd.merge(left=births_df, right=moon_df, on=merge_keys, how='left')
babymoons_df

Unnamed: 0,year,month,date_of_month,day_of_week,births,Date,Day,Time
0,1994,1,1,6,8096,1994-01-01,Saturday,
1,1994,1,2,7,7772,1994-01-02,Sunday,
2,1994,1,3,1,10142,1994-01-03,Monday,
3,1994,1,4,2,11248,1994-01-04,Tuesday,
4,1994,1,5,3,11053,1994-01-05,Wednesday,
...,...,...,...,...,...,...,...,...
7665,2014,12,27,6,8656,2014-12-27,Saturday,
7666,2014,12,28,7,7724,2014-12-28,Sunday,
7667,2014,12,29,1,12811,2014-12-29,Monday,
7668,2014,12,30,2,13634,2014-12-30,Tuesday,


In [53]:
# Check that the merged frame has the same number of rows as births_df,
# i.e. the left merge neither dropped nor duplicated any births rows

babymoons_df.shape

(7670, 8)

### Drop unnecessary columns

In [57]:
# Drop the date-part columns now that Date and Day carry the same information.
# DataFrame.drop returns a new frame, so the result must be assigned back; otherwise
# the columns remain in babymoons_df.
# errors='ignore' keeps this cell safe to re-run once the columns are already gone.
babymoons_df = babymoons_df.drop(
    columns=['year', 'month', 'date_of_month', 'day_of_week'],
    errors='ignore',
)
babymoons_df

Unnamed: 0,births,Date,Day,Time
0,8096,1994-01-01,Saturday,
1,7772,1994-01-02,Sunday,
2,10142,1994-01-03,Monday,
3,11248,1994-01-04,Tuesday,
4,11053,1994-01-05,Wednesday,
...,...,...,...,...
7665,8656,2014-12-27,Saturday,
7666,7724,2014-12-28,Sunday,
7667,12811,2014-12-29,Monday,
7668,13634,2014-12-30,Tuesday,


### Indicate where full moons occur

- The Time column has been retained from moon_df, so any entry there that is not NaN represents a full moon