# Bonus: Temperature Analysis I

In [99]:
import pandas as pd
from datetime import datetime as dt

In [100]:
# "tobs" is "temperature observations"
# Read the raw station measurements into a DataFrame and preview the first rows
csv_path = '../Resources/hawaii_measurements.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [101]:
# Parse the string 'date' column into datetime64 values, then confirm the
# resulting column types
df = df.assign(date=pd.to_datetime(df['date']))
df.dtypes

station            object
date       datetime64[ns]
prcp              float64
tobs                int64
dtype: object

In [102]:
# Promote 'date' from a regular column to the row index (returns a new frame;
# df itself is left untouched)
df2 = df.set_index(keys='date')
df2.head()

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.0,63
2010-01-03,USC00519397,0.0,74
2010-01-04,USC00519397,0.0,76
2010-01-06,USC00519397,,73


In [103]:
# The 'date' column no longer exists as a column (it became the index above),
# so there is nothing left to drop there. Remove 'prcp' instead, since
# precipitation is not used in this analysis.
data = df2.drop(columns=['prcp'])
data.head()

Unnamed: 0_level_0,station,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,USC00519397,65
2010-01-02,USC00519397,63
2010-01-03,USC00519397,74
2010-01-04,USC00519397,76
2010-01-06,USC00519397,73


### Compare June and December data across all years 

In [104]:
from scipy import stats

In [105]:
# Keep only the June and December observations, selected by the calendar
# month of the datetime index
month_of_obs = data.index.month
june = data.loc[month_of_obs == 6]
dec = data.loc[month_of_obs == 12]

# Exclude June 2017 so that both months have an equal chance of being chosen
# from a random sample
june = june.loc[june.index.year != 2017]
june.tail()

Unnamed: 0_level_0,station,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-06-25,USC00516128,73
2016-06-26,USC00516128,74
2016-06-27,USC00516128,72
2016-06-28,USC00516128,74
2016-06-29,USC00516128,72


In [106]:
# Number of June observations, followed by their mean temperature
print(june.shape[0])
june.loc[:, 'tobs'].mean()

1509


74.65606361829026

In [107]:
# Number of December observations, followed by their mean temperature
print(dec.shape[0])
dec.loc[:, 'tobs'].mean()

1517


71.04152933421226

In [108]:
# Display temperature averages by year

# Group each month's observations by the year of the 'date' index and average
# only the numeric 'tobs' column. The frames also carry the string 'station'
# column; recent pandas releases (2.0+) raise a TypeError when .mean() is asked
# to aggregate it, whereas older releases silently dropped it.
yrsJ = june.groupby(june.index.strftime('%Y'))[['tobs']].mean()
yrsD = dec.groupby(dec.index.strftime('%Y'))[['tobs']].mean()

# Join the two yearly tables on the shared 'date' (year) index level; the
# overlapping 'tobs' column names receive the default _x / _y suffixes.
yrs = yrsJ.merge(yrsD, on='date')
years = yrs.rename(columns={'tobs_x':'June Temps', 'tobs_y':'Dec Temps'})

years

Unnamed: 0_level_0,June Temps,Dec Temps
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,74.92562,70.208511
2011,73.938326,70.820628
2012,74.0,71.188073
2013,74.599078,71.094017
2014,75.027907,69.896861
2015,74.990148,73.423913
2016,75.175258,71.13


In [109]:
# Create collections of temperature data

# Fixed seed so the samples (and therefore the t-test result below) are
# reproducible on Restart & Run All
SEED = 42

# Drop missing temperature readings BEFORE sampling so both samples are
# guaranteed to contain exactly 500 usable values; filtering after sampling
# could leave the two lists with different lengths.
samp = june.dropna(subset=['tobs']).sample(n=500, random_state=SEED)
samp2 = dec.dropna(subset=['tobs']).sample(n=500, random_state=SEED + 1)

# Plain Python lists of the sampled temperatures
june_list = samp['tobs'].tolist()
dec_list = samp2['tobs'].tolist()


In [110]:
# Paired (related-samples) t-test on the two equal-length temperature lists.
# NOTE(review): the lists come from two independent random samples, so the
# element-wise pairs are formed by arbitrary position -- confirm that a paired
# test is the intended design here.
stats.ttest_rel(a=june_list, b=dec_list)

Ttest_relResult(statistic=15.767104625469415, pvalue=9.669406165507983e-46)

### Analysis

Above, we have collected a random sample of 500 temperature observations each from the months of June and December across the years 2010-2016 and from the same nine measurement stations in the state of Hawaii.  We have here used this information to run a paired t-Test to determine if there is a significant difference in the average temperature in Hawaii throughout different times of year.  We chose to use a paired rather than unpaired t-Test because we are testing the same stations at different points in time, so the samples are dependent and related to each other.  We have also removed the June 2017 data points to be sure both months have a relatively equal chance of being chosen in a random sampling.

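Results consistently show an extremely low p-value (about 9.7e-46 in the run above), far below the conventional 0.05 threshold, so we reject the null hypothesis that the mean of the paired differences is equal to zero in the population.  This means that there is a statistically significant difference between the temperatures in Hawaii in June vs December across all the years and stations in our dataset.  In practical terms, however, the gap is small: June averages about 74.7 degrees versus about 71.0 degrees in December, a difference of roughly 3.6 degrees.  This is still good news for vacation plans!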