## Import modules

In [1]:
import pandas as pd

## Read in and Inspect Data

In [2]:
# Load the full moon calendar from its CSV file and preview the first rows.
MOON_CSV_PATH = 'full-moon-calendar-1900-2050/full_moon.csv'
moon_df = pd.read_csv(MOON_CSV_PATH)
moon_df.head()

Unnamed: 0,Day,Date,Time
0,Monday,15 January 1900,08:07:30 pm
1,Wednesday,14 February 1900,02:50:12 pm
2,Friday,16 March 1900,09:11:48 am
3,Sunday,15 April 1900,02:02:06 am
4,Monday,14 May 1900,04:36:36 pm


In [3]:
# Load the first births dataset (file name covers 1994-2003) and preview it.
BIRTHS_CDC_CSV_PATH = 'fivethirtyeight-births-dataset/US_births_1994-2003_CDC_NCHS.csv'
births1_df = pd.read_csv(BIRTHS_CDC_CSV_PATH)
births1_df.head()

Unnamed: 0,year,month,date_of_month,day_of_week,births
0,1994,1,1,6,8096
1,1994,1,2,7,7772
2,1994,1,3,1,10142
3,1994,1,4,2,11248
4,1994,1,5,3,11053


In [4]:
# Load the second births dataset (file name covers 2000-2014) and preview it.
BIRTHS_SSA_CSV_PATH = 'fivethirtyeight-births-dataset/US_births_2000-2014_SSA.csv'
births2_df = pd.read_csv(BIRTHS_SSA_CSV_PATH)
births2_df.head()

Unnamed: 0,year,month,date_of_month,day_of_week,births
0,2000,1,1,6,9083
1,2000,1,2,7,8006
2,2000,1,3,1,11363
3,2000,1,4,2,13032
4,2000,1,5,3,12558


## Clean Data
- Check each dataframe for null values


In [5]:
# Show only the rows holding at least one missing value (an empty result means no nulls).
births1_df.loc[births1_df.isna().any(axis='columns')]

Unnamed: 0,year,month,date_of_month,day_of_week,births


In [6]:
# Show only the rows holding at least one missing value (an empty result means no nulls).
births2_df.loc[births2_df.isna().any(axis='columns')]

Unnamed: 0,year,month,date_of_month,day_of_week,births


In [7]:
# Show only the rows holding at least one missing value (an empty result means no nulls).
moon_df.loc[moon_df.isna().any(axis='columns')]

Unnamed: 0,Day,Date,Time


- No null values found
- Check for / eliminate extra whitespace

In [8]:
# Confirm every column in the births datasets has an integer dtype;
# integer columns cannot hold stray whitespace, so no stripping is needed there.

births1_df.dtypes

year             int64
month            int64
date_of_month    int64
day_of_week      int64
births           int64
dtype: object

In [9]:
# Same dtype check for the second births dataset.
births2_df.dtypes

year             int64
month            int64
date_of_month    int64
day_of_week      int64
births           int64
dtype: object

In [10]:
# The moon dataframe has strings, so str.strip can be applied.
# str.strip returns a new Series and leaves the column untouched, so the result
# is assigned back to the dataframe to make the clean-up persist.

moon_df['Day'] = moon_df['Day'].str.strip()
moon_df['Day']

0          Monday
1       Wednesday
2          Friday
3          Sunday
4          Monday
          ...    
1863     Thursday
1864       Friday
1865       Sunday
1866       Monday
1867    Wednesday
Name: Day, Length: 1868, dtype: object

In [11]:
# Trying to strip the Date column raised a KeyError for "Date", so inspect the column names

moon_df.columns

Index(['Day', ' Date', ' Time'], dtype='object')

In [12]:
# The Date and Time headers start with a space. Strip the leading whitespace
# from every column label so the columns can be addressed by their plain names.

moon_df = moon_df.rename(columns=str.lstrip)
moon_df.columns

Index(['Day', 'Date', 'Time'], dtype='object')

In [13]:
# For the Date and Time columns we only want to remove leading or trailing whitespace.
# str.strip returns a new Series and leaves the column untouched, so the result
# is assigned back to the dataframe to make the clean-up persist.

moon_df['Date'] = moon_df['Date'].str.strip()
moon_df['Date']

0         15 January 1900
1        14 February 1900
2           16 March 1900
3           15 April 1900
4             14 May 1900
              ...        
1863     1 September 2050
1864    30 September 2050
1865      30 October 2050
1866     28 November 2050
1867     28 December 2050
Name: Date, Length: 1868, dtype: object

In [14]:
# Remove leading/trailing whitespace from Time and assign the result back;
# str.strip on its own only returns a stripped copy.
moon_df['Time'] = moon_df['Time'].str.strip()
moon_df['Time']

0       08:07:30 pm
1       02:50:12 pm
2       09:11:48 am
3       02:02:06 am
4       04:36:36 pm
           ...     
1863    10:30:54 am
1864    06:31:48 pm
1865    04:16:00 am
1866    04:09:48 pm
1867    06:15:36 am
Name: Time, Length: 1868, dtype: object

## Combining birth datasets

In [15]:
# Stack the two births dataframes (births1_df first, then births2_df) into one
# dataframe with a fresh 0..n-1 index
birth_frames = [births1_df, births2_df]
births_df = pd.concat(birth_frames, ignore_index=True)

- Need to drop duplicate dates since the datasets overlap from 2000-2003

In [16]:
# Check the starting number of rows (before duplicate dates are dropped)

births_df.shape

(9131, 5)

In [17]:
# Drop rows that match on all three date components.
# keep='last' retains the later occurrence of each date; because births2_df was
# concatenated after births1_df, overlapping dates keep the births2_df row.
# The index is not reset, so surviving rows keep their original labels.

births_df.drop_duplicates(subset=['year', 'month', 'date_of_month'], keep='last', inplace=True)

In [18]:
# Verify the new number of rows after dropping duplicate dates

births_df.shape

(7670, 5)

In [19]:
# Preview the first rows of the combined, de-duplicated births data
births_df.head()

Unnamed: 0,year,month,date_of_month,day_of_week,births
0,1994,1,1,6,8096
1,1994,1,2,7,7772
2,1994,1,3,1,10142
3,1994,1,4,2,11248
4,1994,1,5,3,11053


## Preparing birth data for merge with moon data
 
- For future use in Tableau, which recognizes date data, there should be a month column with the name of the month instead of a number.
- This will also make the month names the same as in moon_df

In [20]:
# Build a lookup from month number (1-12) to the English month name

month_name_dict = dict(enumerate(
    [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    ],
    start=1,
))

In [21]:
# Pull out the month-number column as a Series so Series.map() can be applied to it

month_numbers = births_df.loc[:, 'month']

In [22]:
# Map each month number to its month name using the lookup dictionary

month_names = month_numbers.map(month_name_dict)

In [23]:
# Add the month-name series to births_df as a new 'month_name' column

births_df['month_name'] = month_names

In [24]:
# Inspect results: births_df should now include the month_name column

births_df

Unnamed: 0,year,month,date_of_month,day_of_week,births,month_name
0,1994,1,1,6,8096,January
1,1994,1,2,7,7772,January
2,1994,1,3,1,10142,January
3,1994,1,4,2,11248,January
4,1994,1,5,3,11053,January
...,...,...,...,...,...,...
9126,2014,12,27,6,8656,December
9127,2014,12,28,7,7724,December
9128,2014,12,29,1,12811,December
9129,2014,12,30,2,13634,December


- Do the same thing with day_of_week column to have day names as well as numbers, for possible births by day of week analysis in Tableau

In [25]:
# According to kaggle documentation, 1 is Monday and 7 is Sunday,
# so number the weekday names starting from 1 in Monday-first order

day_names_dict = dict(enumerate(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    start=1,
))

In [27]:
# Pull out the day_of_week column of births_df as a Series and display it

day_numbers = births_df.loc[:, 'day_of_week']
day_numbers

0       6
1       7
2       1
3       2
4       3
       ..
9126    6
9127    7
9128    1
9129    2
9130    3
Name: day_of_week, Length: 7670, dtype: int64

In [28]:
# Create the day-names series by mapping each day number through day_names_dict

day_names = day_numbers.map(day_names_dict)

In [29]:
# Add the day-names series to births_df under the column name 'Day',
# the same column name moon_df uses for its weekday names

births_df['Day'] = day_names
births_df

Unnamed: 0,year,month,date_of_month,day_of_week,births,month_name,Day
0,1994,1,1,6,8096,January,Saturday
1,1994,1,2,7,7772,January,Sunday
2,1994,1,3,1,10142,January,Monday
3,1994,1,4,2,11248,January,Tuesday
4,1994,1,5,3,11053,January,Wednesday
...,...,...,...,...,...,...,...
9126,2014,12,27,6,8656,December,Saturday
9127,2014,12,28,7,7724,December,Sunday
9128,2014,12,29,1,12811,December,Monday
9129,2014,12,30,2,13634,December,Tuesday


## Preparing full moon dataset to merge with births

- Full moon dataset should have separate columns for month, day, and year, for easier merging with births

In [None]:
# Re-check the moon_df column names before splitting the date
moon_df.columns

In [None]:
# Select the Date column of moon_df as a Series and preview it

moon_date = moon_df.loc[:, 'Date']
moon_date.head()

In [None]:
# Split each date string on whitespace into separate columns
# (expected pieces, in order: day of month, month name, year).
# NOTE(review): the split values are strings, whereas births_df stores year and
# date_of_month as int64 - confirm the dtypes are aligned before merging on them.

moon_date_df = moon_date.str.split(expand=True)
moon_date_df.head()

In [None]:
# Give the positional split columns (0, 1, 2) descriptive names

date_part_names = {0: 'date_of_month', 1: 'month', 2: 'year'}
moon_date_df = moon_date_df.rename(columns=date_part_names)

In [None]:
# Attach the date-part columns to moon_df, aligning rows on the index.
# Any date-part columns left over from a previous run of this cell are dropped
# first, so re-running the cell does not add '_x' / '_y' suffixed duplicates.

moon_df = (
    moon_df
    .drop(columns=moon_date_df.columns, errors='ignore')
    .merge(moon_date_df, left_index=True, right_index=True)
)
moon_df.head()