In [2]:
import boto3
import pandas as pd
import pyspark
from sagemaker import get_execution_role
import seaborn as sns
import numpy as np
import calendar
import matplotlib
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import norm, skew
import statsmodels.api as sm
blue = sns.color_palette('Blues')[-2]
color = sns.color_palette() 
sns.set_style('darkgrid') 
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points
import nltk
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
import statsmodels.api as sm
import statsmodels.formula.api as smf
import datetime


In [3]:
weather_data = 's3://taysolsdev/datasets/GolfDataforecast/weatherdata.csv'
weather_data = pd.read_csv(weather_data, encoding = "ISO-8859-1")

In [4]:
weather_data = weather_data.rename(str.lower, axis='columns')

In [5]:
weather_data = weather_data.rename(index=str, columns={"high ": "high"})
weather_data = weather_data.rename(index=str, columns={"daylight hours": "daylight_hours"})
weather_data.columns

Index(['region', 'month', 'high', 'low', 'rainfall', 'snowfall',
       'daylight_hours'],
      dtype='object')

In [6]:
# no missing value
weather_data.isnull().sum()

region            0
month             0
high              0
low               0
rainfall          0
snowfall          0
daylight_hours    0
dtype: int64

In [7]:
weather_data['month'] = weather_data['month'].str.replace('M10', '10')
weather_data['month'] = weather_data['month'].str.replace('M11', '11')
weather_data['month'] = weather_data['month'].str.replace('M12', '12')
weather_data['month'] = weather_data['month'].str.replace('M1', '01')
weather_data['month'] = weather_data['month'].str.replace('M2', '02')
weather_data['month'] = weather_data['month'].str.replace('M3', '03')
weather_data['month'] = weather_data['month'].str.replace('M4', '04')
weather_data['month'] = weather_data['month'].str.replace('M5', '05')
weather_data['month'] = weather_data['month'].str.replace('M6', '06')
weather_data['month'] = weather_data['month'].str.replace('M7', '07')
weather_data['month'] = weather_data['month'].str.replace('M8', '08')
weather_data['month'] = weather_data['month'].str.replace('M9', '09')

In [8]:
weather_data["month"][:len(weather_data)] =  pd.to_datetime(weather_data.month[:len(weather_data)], 
                                                          yearfirst = True, format='%Y%m')

weather_data = weather_data.sort_values(by="month")
# set month to index
weather_data.index = pd.DatetimeIndex(weather_data.month)
del weather_data["month"]

In [9]:
weather_data['region'].unique()

array(['Midwest', 'Illinois', 'Canada', 'Northeast', 'South Carolina',
       'Oregon', 'Connecticut', 'Other Southeast', 'New Mexico',
       'Georgia', 'Australia', 'Arizona', 'Other Northeast', 'New York',
       'Other Midwest', 'Mexico', 'Ohio', 'South Korea',
       'Other Southwest', 'Japan', 'California', 'Texas', 'New Jersey',
       'Southeast', 'Florida', 'Michigan', 'Nevada', 'Southwest'],
      dtype=object)

### check stationary or not


In [10]:
from statsmodels.tsa.stattools import adfuller

products = weather_data.region.unique()
products_stationary = {} # collect stationary data: key is the name or product, value is p_value
products_nonstationary = {} # collect non-stationary data 


"""
Run ADF test on each product and check test.
"""
for feature in weather_data.columns[1:]:
    for prod in products:
        prod_df = weather_data[weather_data.region == prod]
        p_value = adfuller(prod_df[feature])[1]
        if p_value <0.01:  # we do at the 1% comcident level.
            products_stationary[prod] = p_value
        else:
            products_nonstationary[prod] = p_value 
    
    print("{} regions have non-stationary data for {}".format(len(products_nonstationary), feature))
    
    

15 regions have non-stationary data for high
17 regions have non-stationary data for low
22 regions have non-stationary data for rainfall
26 regions have non-stationary data for snowfall
27 regions have non-stationary data for daylight_hours


### New dataset 

In [11]:
origin_high = weather_data[["high",'region']]
origin_high['month'] = origin_high.index
origin_high=origin_high[['month',"high",'region']]
origin_high.to_csv('nonstationary_high.csv', header=False, index=False)

In [12]:
origin_low = weather_data[["low",'region']]
origin_low['month'] = origin_low.index
origin_low=origin_low[['month',"low",'region']]
origin_low.to_csv('nonstationary_low.csv', header=False, index=False)


In [13]:

origin_rainfall = weather_data[["rainfall",'region']]
origin_rainfall['month'] = origin_rainfall.index
origin_rainfall=origin_rainfall[['month',"rainfall",'region']]
origin_rainfall.to_csv('nonstationary_rainfall.csv', header=False, index=False)


In [14]:

origin_snowfall = weather_data[["snowfall",'region']]
origin_snowfall['month'] = origin_snowfall.index
origin_snowfall=origin_snowfall[['month',"snowfall",'region']]
origin_snowfall.to_csv('nonstationary_snowfall.csv', header=False, index=False)


In [15]:

origin_daylight_hours = weather_data[["daylight_hours",'region']]
origin_daylight_hours['month'] = origin_daylight_hours.index
origin_daylight_hours=origin_daylight_hours[['month',"daylight_hours",'region']]
origin_daylight_hours.to_csv('nonstationary_daylight_hours.csv', header=False, index=False)


In [17]:
origin_all = weather_data[["high","low","rainfall","snowfall","daylight_hours",'region']]
origin_all['month'] = origin_high.index
origin_all=origin_all[['month',"high","low","rainfall","snowfall","daylight_hours",'region']]
origin_all.to_csv('nonstationary_all.csv', header=False, index=False)