# Bonus: Temperature Analysis I

In [1]:
import pandas as pd
from datetime import datetime as dt
from scipy import stats

In [2]:
# "tobs" is "temperature observations"
df = pd.read_csv('Resources/hawaii_measurements.csv')
df = df.dropna()
df

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.00,63
2,USC00519397,2010-01-03,0.00,74
3,USC00519397,2010-01-04,0.00,76
5,USC00519397,2010-01-07,0.06,70
...,...,...,...,...
19543,USC00516128,2017-08-17,0.13,72
19545,USC00516128,2017-08-19,0.09,71
19547,USC00516128,2017-08-21,0.56,76
19548,USC00516128,2017-08-22,0.50,76


In [3]:
# Convert the date column format from string to datetime
df['date']= pd.to_datetime(df['date'])

df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
5,USC00519397,2010-01-07,0.06,70


In [4]:
# Set the date column as the DataFrame index
df.set_index('date', inplace=True)
df.head()

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.0,63
2010-01-03,USC00519397,0.0,74
2010-01-04,USC00519397,0.0,76
2010-01-07,USC00519397,0.06,70


In [5]:
# Drop the date column

### Compare June and December data across all years 

In [6]:
# Filter data for June
june_df = df.loc[(df.index.month == 6)]
june_df = june_df[['tobs']]
june_df

Unnamed: 0_level_0,tobs
date,Unnamed: 1_level_1
2010-06-01,78
2010-06-02,76
2010-06-03,78
2010-06-04,76
2010-06-05,77
...,...
2017-06-26,79
2017-06-27,74
2017-06-28,74
2017-06-29,76


In [7]:
# Filter data for December
dec_df = df.loc[(df.index.month == 12)]
dec_df = dec_df[['tobs']]
dec_df

Unnamed: 0_level_0,tobs
date,Unnamed: 1_level_1
2010-12-01,76
2010-12-03,74
2010-12-04,74
2010-12-06,64
2010-12-07,64
...,...
2016-12-27,71
2016-12-28,71
2016-12-29,69
2016-12-30,65


In [8]:
# Identify the average temperature for June
avg_june_temp = round(june_df['tobs'].mean(), 2)
print(f'Average temperature in June: {avg_june_temp}')

Average temperature in June: 74.89


In [9]:
# Identify the average temperature for December
avg_dec_temp = round(dec_df['tobs'].mean(), 2)
print(f'Average temperature in December: {avg_dec_temp}')

Average temperature in December: 70.93


In [10]:
# Run paired t-test
t_test = stats.ttest_1samp(june_df, dec_df.mean())

t_test = t_test[1][0]
print(f'p-value = {t_test}')

p-value = 2.52608096318e-312


### Analysis

A paired t-test is used in this situation because the data is related, in that it is collected from the same set of weather stations through two different periods of time.

This test is used to compare the differences between, in this case, the mean temperature in Hawaii in June and December. So, the null hypothesis of this test is that there is no significant statistical difference between the two means. Because the p-value is below .05, we can reject the null hypthesis and conclude there is a significant difference between the mean temperature in June and the mean temperature in December.