# Bonus: Temperature Analysis I

In [275]:
import pandas as pd
from datetime import datetime as dt

In [276]:
# "tobs" is "temperature observations"
# Read the raw station measurements from the project's Resources folder
measurements_csv = 'Resources/hawaii_measurements.csv'
df = pd.read_csv(measurements_csv)
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [277]:
# Parse the 'date' strings into datetime64 values so the .dt accessors work later
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   station  19550 non-null  object        
 1   date     19550 non-null  datetime64[ns]
 2   prcp     18103 non-null  float64       
 3   tobs     19550 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 611.1+ KB


In [278]:
# Show the latest and the earliest observation dates in the dataset
observation_dates = df['date']
display(observation_dates.max(), observation_dates.min())

Timestamp('2017-08-23 00:00:00')

Timestamp('2010-01-01 00:00:00')

In [279]:
# The data ends in August 2017, so 2017 has June readings but no December ones.
# Keep only the years before 2017 to avoid unequal samples for a paired t-test.
is_before_2017 = df['date'].dt.year < 2017
df_filtered = df.loc[is_before_2017]
df_filtered

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.00,63
2,USC00519397,2010-01-03,0.00,74
3,USC00519397,2010-01-04,0.00,76
4,USC00519397,2010-01-06,,73
...,...,...,...,...
19323,USC00516128,2016-12-27,0.14,71
19324,USC00516128,2016-12-28,0.14,71
19325,USC00516128,2016-12-29,1.03,69
19326,USC00516128,2016-12-30,2.37,65


In [280]:
# Set the date column as the DataFrame index.
# Reassign instead of using inplace=True (df_filtered is a slice of df), and
# skip the step when 'date' has already been moved into the index so that
# re-running this cell does not raise a KeyError.
if 'date' in df_filtered.columns:
    df_filtered = df_filtered.set_index('date')
df_filtered.head()

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.0,63
2010-01-03,USC00519397,0.0,74
2010-01-04,USC00519397,0.0,76
2010-01-06,USC00519397,,73


In [281]:
# June has only 30 days, so drop the December 31 readings to keep the two
# months aligned for a paired t-test.
obs_dates = df_filtered.index
keep_rows = (obs_dates.month != 12) | (obs_dates.day != 31)
df_clean = df_filtered[keep_rows]
df_clean

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.00,63
2010-01-03,USC00519397,0.00,74
2010-01-04,USC00519397,0.00,76
2010-01-06,USC00519397,,73
...,...,...,...
2016-12-26,USC00516128,1.48,71
2016-12-27,USC00516128,0.14,71
2016-12-28,USC00516128,0.14,71
2016-12-29,USC00516128,1.03,69


### Compare June and December data across all years 

In [282]:
from scipy import stats

In [283]:
# Keep only the June observations from the cleaned data
is_june = df_clean.index.month == 6
df_june = df_clean[is_june]
df_june.info()
df_june

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1509 entries, 2010-06-01 to 2016-06-29
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   station  1509 non-null   object 
 1   prcp     1403 non-null   float64
 2   tobs     1509 non-null   int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 47.2+ KB


Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-06-01,USC00519397,0.00,78
2010-06-02,USC00519397,0.01,76
2010-06-03,USC00519397,0.00,78
2010-06-04,USC00519397,0.00,76
2010-06-05,USC00519397,0.00,77
...,...,...,...
2016-06-25,USC00516128,,73
2016-06-26,USC00516128,0.70,74
2016-06-27,USC00516128,0.30,72
2016-06-28,USC00516128,0.25,74


In [284]:
# Average June temperature over every station and every year in the cleaned data
june_tobs = df_june['tobs']
df_june_mean = june_tobs.mean(skipna=True)
df_june_mean

74.65606361829026

In [285]:
# Keep only the December observations from the cleaned data
is_december = df_clean.index.month == 12
df_december = df_clean[is_december]
# .info() reports the number of rows and the non-null count of each column
df_december.info()
df_december

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1474 entries, 2010-12-01 to 2016-12-30
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   station  1474 non-null   object 
 1   prcp     1365 non-null   float64
 2   tobs     1474 non-null   int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 46.1+ KB


Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-12-01,USC00519397,0.04,76
2010-12-03,USC00519397,0.00,74
2010-12-04,USC00519397,0.00,74
2010-12-06,USC00519397,0.00,64
2010-12-07,USC00519397,0.00,64
...,...,...,...
2016-12-26,USC00516128,1.48,71
2016-12-27,USC00516128,0.14,71
2016-12-28,USC00516128,0.14,71
2016-12-29,USC00516128,1.03,69


In [286]:
# Average December temperature over every station and every year in the cleaned data
december_tobs = df_december['tobs']
df_dec_mean = december_tobs.mean(skipna=True)
df_dec_mean

71.12211668928087

In [290]:
# Run an unpaired (independent two-sample) t-test on June vs. December temperatures.
# NOTE: the samples are taken from df_filtered, not df_clean, so the December 31
# readings are included here.
obs_month = df_filtered.index.month
df_jun = df_filtered[obs_month == 6]
df_dec = df_filtered[obs_month == 12]

# equal_var=True requests the standard pooled-variance Student's t-test
res = stats.ttest_ind(df_jun['tobs'], df_dec['tobs'], equal_var=True)
display(res)


Ttest_indResult(statistic=28.468024094083606, pvalue=3.827174178322451e-158)

In [None]:
# A paired t-test can't be performed: stats.ttest_rel needs two samples of the
# same length, but df_june has 1509 rows and df_december has 1474.
# res2 = stats.ttest_rel(df_june['tobs'], df_december['tobs'])
# display(res2)


# Analysis

 * June temp Count:1509
 * December temp Count: 1474
 
June and December do not have the same number of observations (1509 vs. 1474), so the readings cannot be matched into pairs. A paired t-test therefore does not make statistical sense in this scenario.

## Unpaired T-Test

* Hawaii is reputed to enjoy mild weather all year. This statistical test compares the average temperatures in June and December and tells us whether there is a meaningful difference between the two months' temperatures.

* The null hypothesis is that there is no difference between the mean June temperature and the mean December temperature in Hawaii.

## Interpretation

The p-value of the test is approximately 3.83e-158, which is far below the commonly used significance level of 0.05, so the null hypothesis is rejected and it can be concluded that average June temperatures in Hawaii are statistically different from average December temperatures (about 74.7 vs. 71.1). In other words, the temperature difference is very unlikely to have arisen by chance alone.
