In [1]:
import pandas as pd
import datetime as datetime

# Import Weather Data
Two of the prospective features of the algorithm will be temperature and rainfall data. This information was downloaded from the National Centers for Environmental Information and is merged here with the cleaned data file from step 1.

In [2]:
# Load the weather observations from the local CSV export.
weather = pd.read_csv(filepath_or_buffer='weather_data.csv')

# Preview the first five rows to confirm the columns loaded as expected.
weather.head(n=5)

Unnamed: 0,Date,TAVG (Degrees Fahrenheit),TMAX (Degrees Fahrenheit),TMIN (Degrees Fahrenheit),PRCP (Inches),SNOW (Inches),SNWD (Inches)
0,1936-01-01,,37.0,17.0,0.0,0.0,3.0
1,1936-01-02,,41.0,25.0,0.01,0.0,1.0
2,1936-01-03,,37.0,34.0,0.0,0.0,0.0
3,1936-01-04,,39.0,28.0,0.08,0.0,0.0
4,1936-01-05,,28.0,18.0,0.0,0.0,0.0


First, let's drop the columns we don't need for our purposes.

In [3]:
# Columns that are not used as features: average and minimum temperature,
# and snow depth. Maximum temperature, precipitation and snowfall are kept.
cols_to_drop = [
    'TAVG (Degrees Fahrenheit)',
    'TMIN (Degrees Fahrenheit)',
    'SNWD (Inches)',
]

# drop() returns a new frame; rebind `weather` to the trimmed result.
weather = weather.drop(labels=cols_to_drop, axis='columns')

weather.head(n=5)

Unnamed: 0,Date,TMAX (Degrees Fahrenheit),PRCP (Inches),SNOW (Inches)
0,1936-01-01,37.0,0.0,0.0
1,1936-01-02,41.0,0.01,0.0
2,1936-01-03,37.0,0.0,0.0
3,1936-01-04,39.0,0.08,0.0
4,1936-01-05,28.0,0.0,0.0


In [4]:
# Check the dtype of each remaining column, in particular whether Date was
# parsed as a datetime or left as plain strings (reported as `object`).
weather.dtypes

Date                          object
TMAX (Degrees Fahrenheit)    float64
PRCP (Inches)                float64
SNOW (Inches)                float64
dtype: object

The Date column needs to be parsed as a datetime and then reformatted to the mm/dd/yy format used in the previous section.

In [5]:
# Parse the YYYY-MM-DD strings as datetimes, then render them back to text as
# MM/DD/YY (slash-separated, two-digit year).
# NOTE: the result is a string column again, not a datetime column, and %y
# keeps only two year digits, so dates exactly a century apart would produce
# the same string.
weather['Date'] = (
    pd.to_datetime(weather['Date'], format='%Y-%m-%d')
    .dt.strftime('%m/%d/%y')
)

Let's rename those column names to be a little more display friendly.

In [6]:
# Replace the unit-annotated source headers with short, underscore-separated names.
weather = weather.rename(
    columns={
        'TMAX (Degrees Fahrenheit)': 'Max_Temp_Degrees',
        'PRCP (Inches)': 'Rain_Inches',
        'SNOW (Inches)': 'Snow_Inches',
    }
)
weather.head(n=5)

Unnamed: 0,Date,Max_Temp_Degrees,Rain_Inches,Snow_Inches
0,01/01/36,37.0,0.0,0.0
1,01/02/36,41.0,0.01,0.0
2,01/03/36,37.0,0.0,0.0
3,01/04/36,39.0,0.08,0.0
4,01/05/36,28.0,0.0,0.0


# Import Production Data and Merge Files

In [7]:
# Load the cleaned production data exported by the previous notebook.
production = pd.read_csv('cleaned_data')

# NOTE(review): 'Unnamed: 0' is most likely the DataFrame index that the
# previous notebook's to_csv call wrote out (to_csv includes the index unless
# index=False is passed), read back here as an unnamed first column.
# Confirm against that notebook.
production = production.drop(columns=['Unnamed: 0'])
production.head()

Unnamed: 0,Batch,Date,Weight,NV,Visc-Ford,BPA_level,Amine
0,24020141,03/08/24,8.46,20.97,16.2,36.49,0.0089
1,24020162,03/07/24,8.43,20.91,15.5,36.49,0.0089
2,24020164,03/07/24,8.46,20.91,15.7,36.49,0.0089
3,24020143,03/06/24,8.44,20.81,17.0,36.49,0.0089
4,24020140,03/06/24,8.44,20.67,15.9,36.49,0.0089


The production dataframe has some dates with multiple rows (several batches on the same day). So, in the merge, we will list it first and use how='left' so that every production row is kept. First, let's remind ourselves of the number of rows in production.

In [8]:
# Show the row count, per-column non-null counts and dtypes of the production
# data, as a baseline to compare against the merged frame.
production.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2274 entries, 0 to 2273
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Batch      2274 non-null   int64  
 1   Date       2274 non-null   object 
 2   Weight     2274 non-null   float64
 3   NV         2274 non-null   float64
 4   Visc-Ford  2274 non-null   float64
 5   BPA_level  2274 non-null   float64
 6   Amine      2274 non-null   float64
dtypes: float64(5), int64(1), object(1)
memory usage: 124.5+ KB


In [9]:
# Left-join weather onto production by the Date string. Every production row
# is kept; production rows sharing a date receive the same weather values, and
# dates with no weather record get NaN in the weather columns.
merged_data = pd.merge(production, weather, on='Date', how='left')

# A left join never removes left-hand rows, so the row count can only change
# if a Date string occurs more than once in `weather`, which would silently
# duplicate batches. Fail loudly in that case.
assert len(merged_data) == len(production), (
    f"merge changed the row count: {len(production)} -> {len(merged_data)}; "
    "check `weather` for duplicate Date values"
)
merged_data

Unnamed: 0,Batch,Date,Weight,NV,Visc-Ford,BPA_level,Amine,Max_Temp_Degrees,Rain_Inches,Snow_Inches
0,24020141,03/08/24,8.46,20.97,16.2,36.490,0.00890,56.0,0.18,
1,24020162,03/07/24,8.43,20.91,15.5,36.490,0.00890,43.0,0.12,
2,24020164,03/07/24,8.46,20.91,15.7,36.490,0.00890,43.0,0.12,
3,24020143,03/06/24,8.44,20.81,17.0,36.490,0.00890,41.0,0.01,
4,24020140,03/06/24,8.44,20.67,15.9,36.490,0.00890,41.0,0.01,
...,...,...,...,...,...,...,...,...,...,...
2269,6060084,02/23/68,8.43,21.45,15.6,35.875,0.00875,29.0,0.00,0.0
2270,6050047,12/12/39,8.45,21.56,19.0,35.875,0.00875,52.0,0.04,0.0
2271,6030008,02/08/87,8.46,21.15,17.2,35.875,0.00875,44.0,0.00,0.0
2272,6030007,11/30/84,8.45,21.30,17.2,35.875,0.00875,48.0,0.00,0.0


# Deal with weather NaN values

In [10]:
# Count missing values per column. After the left join, NaNs in the weather
# columns come from dates with no matching weather row or from values that
# were already missing in the weather data.
merged_data.isna().sum()

Batch                  0
Date                   0
Weight                 0
NV                     0
Visc-Ford              0
BPA_level              0
Amine                  0
Max_Temp_Degrees       9
Rain_Inches           54
Snow_Inches         2231
dtype: int64

That's a lot of NaN values for snow, so we'll drop that column. Rain and max temperature have few enough NaNs that we can simply drop the rows where either value is missing.

In [11]:
#Drop Snow_Inches (the column is mostly NaN after the merge)
merged_data = merged_data.drop(columns=['Snow_Inches'])

#Drop rows where Max_Temp_Degrees or Rain_Inches is NaN
# Limiting dropna to these two columns makes the intent explicit and prevents
# rows being discarded because of a NaN in some unrelated column.
merged_data = merged_data.dropna(subset=['Max_Temp_Degrees', 'Rain_Inches'])
merged_data

Unnamed: 0,Batch,Date,Weight,NV,Visc-Ford,BPA_level,Amine,Max_Temp_Degrees,Rain_Inches
0,24020141,03/08/24,8.46,20.97,16.2,36.490,0.00890,56.0,0.18
1,24020162,03/07/24,8.43,20.91,15.5,36.490,0.00890,43.0,0.12
2,24020164,03/07/24,8.46,20.91,15.7,36.490,0.00890,43.0,0.12
3,24020143,03/06/24,8.44,20.81,17.0,36.490,0.00890,41.0,0.01
4,24020140,03/06/24,8.44,20.67,15.9,36.490,0.00890,41.0,0.01
...,...,...,...,...,...,...,...,...,...
2268,6070122,03/02/97,8.42,20.90,15.2,35.875,0.00875,65.0,0.40
2269,6060084,02/23/68,8.43,21.45,15.6,35.875,0.00875,29.0,0.00
2270,6050047,12/12/39,8.45,21.56,19.0,35.875,0.00875,52.0,0.04
2271,6030008,02/08/87,8.46,21.15,17.2,35.875,0.00875,44.0,0.00


Our merged dataframe is complete. All of the NaNs have been managed. The dates are in the proper format. We are ready to move on to running descriptive statistics and producing visualizations in the next notebook.

In [12]:
# Write the merged frame to disk for the next notebook.
# NOTE(review): to_csv writes the index by default, so this file will contain
# an extra unnamed first column when read back with read_csv (and the index
# has gaps after the dropna above). Consider index=False once it is confirmed
# that no downstream notebook relies on that column.
merged_data.to_csv('merged_dataset')