Capstone Snow Depth Project: Pre-processing and training data

In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime


Preprocessing and Training Data Development
    Overview
The goal of this notebook is to prepare a clean dataset for model fitting by:
- Creating dummy/indicator features for categorical variables.
- Standardizing the magnitude of numeric features.
- Splitting the dataset into training and testing sets.
- Saving the preprocessed dataset for further use.


In [8]:
#Data
os.chdir('C:/Users/aamal/Desktop/Springboard/Springboard_DataScience/Capstone-Snow-Prediction/Data')
file_path = 'C:/Users/aamal/Desktop/Springboard/Springboard_DataScience/Capstone-Snow-Prediction/Data/snow_data_cleaned.csv'
snow_data = pd.read_csv(file_path)

In [10]:

snow_data.head()

Unnamed: 0,date,station_name,elevation,latitude,longitude,snowdepth,precip_accumulation,precip_increment,airtemp_avg,airtemp_max,...,7d_airtemp_max,7d_precip_max,7d_snowdepth_max,7d_soiltemp_max,30d_airtemp_max,30d_precip_max,30d_snowdepth_max,30d_soiltemp_max,month,year
0,2015-01-01,Brighton,8790,40.59936,-111.58167,34.0,9.2,0.0,17.8,28.2,...,34.2,9.3,34.0,34.0,42.3,11.2,46.0,34.0,1,2015
1,2015-01-02,Brighton,8790,40.59936,-111.58167,33.0,9.2,0.0,19.6,31.8,...,34.2,9.3,34.0,34.0,42.3,11.2,46.0,34.0,1,2015
2,2015-01-03,Brighton,8790,40.59936,-111.58167,32.0,9.2,0.0,19.4,25.5,...,34.2,9.3,34.0,34.0,42.3,11.2,46.0,34.0,1,2015
3,2015-01-04,Brighton,8790,40.59936,-111.58167,31.0,9.2,0.1,26.1,34.9,...,34.2,9.3,34.0,34.0,42.3,11.2,46.0,34.0,1,2015
4,2015-01-05,Brighton,8790,40.59936,-111.58167,31.0,9.3,0.0,34.3,39.0,...,34.2,9.3,34.0,34.0,42.3,11.2,46.0,34.0,1,2015


In [22]:
winter_months = [11, 12, 1, 2, 3, 4, 5]
winter_data = snow_data[snow_data['month'].isin(winter_months)]

In [24]:
#create dummy indicator features for categorical variables 

# Create dummy variables
snow_dummies = pd.get_dummies(winter_data)
print(snow_dummies)


       elevation  latitude  longitude  snowdepth  precip_accumulation  \
0           8790  40.59936 -111.58167       34.0                  9.2   
1           8790  40.59936 -111.58167       33.0                  9.2   
2           8790  40.59936 -111.58167       32.0                  9.2   
3           8790  40.59936 -111.58167       31.0                  9.2   
4           8790  40.59936 -111.58167       31.0                  9.3   
...          ...       ...        ...        ...                  ...   
12424       8490  41.37428 -111.76673       34.0                  8.2   
12425       8490  41.37428 -111.76673       40.0                  9.4   
12426       8490  41.37428 -111.76673       39.0                  9.8   
12427       8490  41.37428 -111.76673       38.0                  9.8   
12428       8490  41.37428 -111.76673       37.0                  9.8   

       precip_increment  airtemp_avg  airtemp_max  airtemp_min  airtemp_obs  \
0                   0.0         17.8        

In [26]:
#Standardize the magnitude of numeric features using scaler
scaler = StandardScaler()

snow_numeric = winter_data[['elevation', 'latitude', 'longitude', 'snowdepth', 'precip_accumulation', 'precip_increment', 'airtemp_avg', 'airtemp_max', 'month', 'year']]
scaled_snow = scaler.fit_transform(snow_numeric)
scaled_snow = pd.DataFrame(scaled_snow, columns=snow_numeric.columns)
print(scaled_snow.head())

   elevation  latitude  longitude  snowdepth  precip_accumulation  \
0   0.626793 -0.365707    0.83746  -0.020057            -0.718354   
1   0.626793 -0.365707    0.83746  -0.057865            -0.718354   
2   0.626793 -0.365707    0.83746  -0.095673            -0.718354   
3   0.626793 -0.365707    0.83746  -0.133481            -0.718354   
4   0.626793 -0.365707    0.83746  -0.133481            -0.709497   

   precip_increment  airtemp_avg  airtemp_max     month      year  
0         -0.515991    -1.171698    -0.982759 -1.116039 -1.659943  
1         -0.515991    -1.007385    -0.680811 -1.116039 -1.659943  
2         -0.515991    -1.025642    -1.209220 -1.116039 -1.659943  
3         -0.141150    -0.414031    -0.420800 -1.116039 -1.659943  
4         -0.515991     0.334509    -0.076915 -1.116039 -1.659943  


In [34]:
print(winter_data.isnull().sum())

date                 0
station_name         0
elevation            0
latitude             0
longitude            0
                    ..
30d_precip_max       0
30d_snowdepth_max    0
30d_soiltemp_max     0
month                0
year                 0
Length: 77, dtype: int64


In [36]:
#Split into testing and training datasets using dummy variables
X = snow_dummies  # Features
y = snow_numeric['snowdepth']  # Target variable (example)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [38]:
X_train.shape, X_test.shape

((5706, 2204), (1427, 2204))

In [40]:
y_train.shape, y_test.shape

((5706,), (1427,))

In [42]:
#Overview questions:
#Does my dataset have any categorical data? Yes: station_names
#Do my features have data values that range from 0-100 or 0-1 or both and more? snow ranges from 0 - 155 ; most other features range from 0-100 ; no binary variables unless provided dummy values

print(winter_data.dtypes)

date                  object
station_name          object
elevation              int64
latitude             float64
longitude            float64
                      ...   
30d_precip_max       float64
30d_snowdepth_max    float64
30d_soiltemp_max     float64
month                  int64
year                   int64
Length: 77, dtype: object


In [44]:
train_mean = y_train.mean()
train_mean

34.745706274097444

In [46]:
#Test the mean indicator 

dumb_reg = DummyRegressor(strategy='mean')
dumb_reg.fit(X_train, y_train)
dumb_reg.constant_

array([[34.74570627]])

In [50]:

datapath = 'C:/Users/aamal/Desktop/Springboard/Springboard_DataScience/Capstone-Snow-Prediction/Data'
filename = 'snow_preprocessed_data.csv'  # Define the filename

filepath = os.path.join(datapath, filename)

scaled_snow.to_csv(filepath, index=False)
print(f"Data saved successfully to '{filepath}'")


Data saved successfully to 'C:/Users/aamal/Desktop/Springboard/Springboard_DataScience/Capstone-Snow-Prediction/Data\snow_preprocessed_data.csv'
