# Bonus: Temperature Analysis I

In [1]:
import pandas as pd
from datetime import datetime as dt

In [2]:
# "tobs" is "temperature observations"
df = pd.read_csv('Resources/hawaii_measurements.csv')
df.head()


Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [3]:
df.dtypes

station     object
date        object
prcp       float64
tobs         int64
dtype: object

In [4]:
# Convert the date column format from string to datetime
df[["date"]] = df[["date"]].apply(pd.to_datetime)
df


Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.00,63
2,USC00519397,2010-01-03,0.00,74
3,USC00519397,2010-01-04,0.00,76
4,USC00519397,2010-01-06,,73
...,...,...,...,...
19545,USC00516128,2017-08-19,0.09,71
19546,USC00516128,2017-08-20,,78
19547,USC00516128,2017-08-21,0.56,76
19548,USC00516128,2017-08-22,0.50,76


In [5]:
df.dtypes

station            object
date       datetime64[ns]
prcp              float64
tobs                int64
dtype: object

In [6]:
df['convertdate'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df

Unnamed: 0,station,date,prcp,tobs,convertdate
0,USC00519397,2010-01-01,0.08,65,2010-01-01
1,USC00519397,2010-01-02,0.00,63,2010-01-02
2,USC00519397,2010-01-03,0.00,74,2010-01-03
3,USC00519397,2010-01-04,0.00,76,2010-01-04
4,USC00519397,2010-01-06,,73,2010-01-06
...,...,...,...,...,...
19545,USC00516128,2017-08-19,0.09,71,2017-08-19
19546,USC00516128,2017-08-20,,78,2017-08-20
19547,USC00516128,2017-08-21,0.56,76,2017-08-21
19548,USC00516128,2017-08-22,0.50,76,2017-08-22


In [7]:
#df['year'] = df['convertdate'].dt.year
#df['month'] = df['convertdate'].dt.month
#df['day'] = df['convertdate'].dt.day
#df

In [9]:
# Set the date column as the DataFrame index
df.set_index('date')

Unnamed: 0_level_0,station,prcp,tobs,convertdate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,USC00519397,0.08,65,2010-01-01
2010-01-02,USC00519397,0.00,63,2010-01-02
2010-01-03,USC00519397,0.00,74,2010-01-03
2010-01-04,USC00519397,0.00,76,2010-01-04
2010-01-06,USC00519397,,73,2010-01-06
...,...,...,...,...
2017-08-19,USC00516128,0.09,71,2017-08-19
2017-08-20,USC00516128,,78,2017-08-20
2017-08-21,USC00516128,0.56,76,2017-08-21
2017-08-22,USC00516128,0.50,76,2017-08-22


In [10]:
# Drop the date column
df.drop(df.columns[1], axis=1, inplace=True)
df


Unnamed: 0,station,prcp,tobs,convertdate
0,USC00519397,0.08,65,2010-01-01
1,USC00519397,0.00,63,2010-01-02
2,USC00519397,0.00,74,2010-01-03
3,USC00519397,0.00,76,2010-01-04
4,USC00519397,,73,2010-01-06
...,...,...,...,...
19545,USC00516128,0.09,71,2017-08-19
19546,USC00516128,,78,2017-08-20
19547,USC00516128,0.56,76,2017-08-21
19548,USC00516128,0.50,76,2017-08-22


### Compare June and December data across all years 

In [11]:
from scipy import stats

In [12]:
# Filter data for desired months
df[df['convertdate'].dt.month_name() == 'month']
df


Unnamed: 0,station,prcp,tobs,convertdate
0,USC00519397,0.08,65,2010-01-01
1,USC00519397,0.00,63,2010-01-02
2,USC00519397,0.00,74,2010-01-03
3,USC00519397,0.00,76,2010-01-04
4,USC00519397,,73,2010-01-06
...,...,...,...,...
19545,USC00516128,0.09,71,2017-08-19
19546,USC00516128,,78,2017-08-20
19547,USC00516128,0.56,76,2017-08-21
19548,USC00516128,0.50,76,2017-08-22


In [13]:
# Identify the average temperature for June

june = df[df['convertdate'].dt.month_name() == 'June']

june



Unnamed: 0,station,prcp,tobs,convertdate
133,USC00519397,0.00,78,2010-06-01
134,USC00519397,0.01,76,2010-06-02
135,USC00519397,0.00,78,2010-06-03
136,USC00519397,0.00,76,2010-06-04
137,USC00519397,0.00,77,2010-06-05
...,...,...,...,...
19492,USC00516128,0.02,79,2017-06-26
19493,USC00516128,0.10,74,2017-06-27
19494,USC00516128,0.02,74,2017-06-28
19495,USC00516128,0.04,76,2017-06-29


In [14]:
# Identify the average temperature for December
December = df[df['convertdate'].dt.month_name() == 'December']
December

Unnamed: 0,station,prcp,tobs,convertdate
305,USC00519397,0.04,76,2010-12-01
306,USC00519397,0.00,74,2010-12-03
307,USC00519397,0.00,74,2010-12-04
308,USC00519397,0.00,64,2010-12-06
309,USC00519397,0.00,64,2010-12-07
...,...,...,...,...
19323,USC00516128,0.14,71,2016-12-27
19324,USC00516128,0.14,71,2016-12-28
19325,USC00516128,1.03,69,2016-12-29
19326,USC00516128,2.37,65,2016-12-30


In [15]:
# Create collections of temperature data


In [16]:
# Run paired t-test
#cat1 = my_data[my_data['Category']=='cat1']
#cat2 = my_data[my_data['Category']=='cat2']
#ttest_ind(cat1['values'], cat2['values'])


from scipy.stats import ttest_ind

cat1 = june['tobs']

cat2 = December['tobs']

ttest_ind(cat1, cat2)


Ttest_indResult(statistic=31.60372399000329, pvalue=3.9025129038616655e-191)

### Analysis

In [17]:
#Higher values of the t-value, also called t-score,
#indicate that a large difference exists between the two sample sets.
#The smaller the t-value, the more similarity exists between the two sample sets. 
#A large t-score indicates that the groups are different. A small t-score indicates that the groups are similar.

#Below 0.05, significant. Over 0.05, not significant.

#pvalue is 3.9 > 0.05 ------> not significant