# Bonus: Temperature Analysis I

In [2]:
import pandas as pd
from datetime import datetime as dt

In [3]:
# "tobs" is "temperature observations"
# Read the Hawaii measurements CSV into a DataFrame and preview its first rows
df = pd.read_csv(filepath_or_buffer='../Resources/hawaii_measurements.csv')
df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [4]:
# Parse the string-typed `date` column into datetime values, then list the
# column dtypes to confirm the conversion took effect
df = df.assign(date=pd.to_datetime(df['date']))
df.dtypes

station            object
date       datetime64[ns]
prcp              float64
tobs                int64
dtype: object

In [5]:
# Promote `date` from an ordinary column to the row index; the result is kept
# in a new frame so `df` itself is left unchanged
df2 = df.set_index(keys='date')
df2.head(n=5)

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.0,63
2010-01-03,USC00519397,0.0,74
2010-01-04,USC00519397,0.0,76
2010-01-06,USC00519397,,73


In [6]:
# `date` is already the index rather than a column, so there is no date column
# left to drop. Discard the precipitation column instead, since it is not used
# in this analysis.
data = df2.drop(columns=['prcp'])
data.head(n=5)

Unnamed: 0_level_0,station,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,USC00519397,65
2010-01-02,USC00519397,63
2010-01-03,USC00519397,74
2010-01-04,USC00519397,76
2010-01-06,USC00519397,73


### Compare June and December data across all years 

In [7]:
from scipy import stats

In [8]:
# Keep only the rows whose index date falls in June (month 6) or December (month 12)
june = data.loc[data.index.month == 6]
dec = data.loc[data.index.month == 12]

In [9]:
# Print the number of June observations, then show their mean temperature
print(june.shape[0])
june.loc[:, 'tobs'].mean()

1700


74.94411764705882

In [10]:
# Print the number of December observations, then show their mean temperature
print(dec.shape[0])
dec.loc[:, 'tobs'].mean()

1517


71.04152933421226

In [75]:
# Display temperature averages by year

# Average only the numeric `tobs` column. The month frames still carry the
# string `station` column, and pandas >= 2.0 raises a TypeError when asked to
# take the mean of a non-numeric column instead of silently dropping it.
yrs = june[['tobs']].groupby(june.index.strftime('%Y')).mean()
# 2017 is excluded from the June averages before the comparison
yrsJ = yrs[yrs.index != '2017']
yrsD = dec[['tobs']].groupby(dec.index.strftime('%Y')).mean()

# Both frames are indexed by the year string (index name 'date'); joining on it
# puts the June and December averages side by side. The overlapping `tobs`
# column receives pandas' default `_x` / `_y` suffixes, renamed below.
y = yrsJ.merge(yrsD, on='date')
years = y.rename(columns={'tobs_x':'June Temps', 'tobs_y':'Dec Temps'})

years

Unnamed: 0_level_0,June Temps,Dec Temps
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,74.92562,70.208511
2011,73.938326,70.820628
2012,74.0,71.188073
2013,74.599078,71.094017
2014,75.027907,69.896861
2015,74.990148,73.423913
2016,75.175258,71.13


In [63]:
# Create collections of temperature data

# Draw equally sized random samples so both collections have the same length.
# A fixed seed keeps the samples (and every statistic derived from them)
# reproducible across kernel restarts.
SAMPLE_SIZE = 500
SAMPLE_SEED = 42

# Rows with a missing temperature are removed *before* sampling, so each sample
# is guaranteed to contain exactly SAMPLE_SIZE valid readings. (Filtering after
# sampling could leave the two lists with different lengths, and checking
# `type(x) == int` would discard every value if the column were float-typed.)
samp = june.dropna(subset=['tobs']).sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)
samp2 = dec.dropna(subset=['tobs']).sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)

# Plain Python lists of the sampled temperature observations
june_list = samp['tobs'].tolist()
dec_list = samp2['tobs'].tolist()


In [62]:
# Paired (related-samples) t-test on the June and December temperature lists.
# NOTE(review): the two lists come from independent random samples, so the
# element-wise pairing is arbitrary; an unpaired test such as
# `stats.ttest_ind(..., equal_var=False)` may be the better fit — confirm
# against the analysis write-up before changing.
stats.ttest_rel(a=june_list, b=dec_list)

Ttest_relResult(statistic=17.51322054132918, pvalue=6.999953044691871e-54)

### Analysis