In [6]:
#import relevant packages
import numpy as np
import pandas as pd
import os

# Read the Hirosaki temperature / cherry-bloom CSV (path is relative to the
# notebook's working directory), then wrap the result in a DataFrame.
# NOTE(review): pd.read_csv already returns a DataFrame, so the wrapper below
# looks redundant; it is kept so both names stay defined for later cells.
sakura_data = pd.read_csv(filepath_or_buffer='hirosaki_temp_cherry_bloom.csv')
sakura_df = pd.DataFrame(data=sakura_data)

In [7]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
sakura_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9131 entries, 0 to 9130
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           9131 non-null   object 
 1   temperature    9131 non-null   float64
 2   flower_status  75 non-null     object 
dtypes: float64(1), object(2)
memory usage: 214.1+ KB


In [8]:
sakura_df.head()

# There are only 3 columns, and "flower_status" is missing most of its values
# (only 75 of 9131 rows are non-null), because the flowers only bloom for a
# short period of time each year.

Unnamed: 0,date,temperature,flower_status
0,1997/1/1,2.9,
1,1997/1/2,2.2,
2,1997/1/3,-1.6,
3,1997/1/4,0.2,
4,1997/1/5,-0.4,


In [11]:
# Break each 'date' string (formatted like '1997/1/1') on '/' into three pieces.
dates = sakura_df['date'].str.split(pat='/', expand=True)

# Attach the pieces as new columns. They remain strings (object dtype), so a
# month is '4', not 4.
sakura_df['year'] = dates[0]
sakura_df['month'] = dates[1]
sakura_df['day'] = dates[2]

# Confirm that the three new columns were added.
sakura_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9131 entries, 0 to 9130
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           9131 non-null   object 
 1   temperature    9131 non-null   float64
 2   flower_status  75 non-null     object 
 3   year           9131 non-null   object 
 4   month          9131 non-null   object 
 5   day            9131 non-null   object 
dtypes: float64(1), object(5)
memory usage: 428.1+ KB


In [17]:
#filling in flower status column
# bloom: 0, full bloom: 1, scatter: 2, before/after/not blooming: 4
FLOWER_STATUS_CODES = {'bloom': 0, 'full': 1, 'scatter': 2}
NOT_BLOOMING_CODE = 4  # used for missing values and any unrecognised label

# Translate the text labels to integer codes in one vectorised pass instead of
# looping over rows. Labels absent from the mapping (including NaN) come back
# as NaN from .map(), so they are filled with the "not blooming" code.
status_codes = (
    sakura_df['flower_status']
    .map(FLOWER_STATUS_CODES)
    .fillna(NOT_BLOOMING_CODE)
    .astype('int64')
)

# Build the new frame from the date parts and temperature plus the encoded
# status. The raw 'date' column is left out, and the index is renumbered from 0.
sakura_df2 = (
    sakura_df[['year', 'month', 'day', 'temperature']]
    .assign(flower_status=status_codes)
    .reset_index(drop=True)
)

sakura_df2.head()

Unnamed: 0,year,month,day,temperature,flower_status
0,1997,1,1,2.9,4
1,1997,1,2,2.2,4
2,1997,1,3,-1.6,4
3,1997,1,4,0.2,4
4,1997,1,5,-0.4,4


In [23]:
# Count how many rows fall into each (month, flower_status) combination.
# Status 4 marks days with no bloom, so only statuses 0, 1 and 2 indicate a bloom.
sakura_df2.value_counts(subset=['month', 'flower_status'])

# From these counts, the bloom statuses (0, 1, 2) only occur in months 4 and 5
# (April and May) in this dataset.

month  flower_status
5      1                  7
       2                 12
4      2                 13
       1                 18
       0                 25
       4                694
2      4                706
11     4                750
9      4                750
6      4                750
5      4                756
3      4                775
7      4                775
12     4                775
8      4                775
10     4                775
1      4                775
dtype: int64

In [25]:
#creating a new dataframe to encapsulate only the blooming months
BLOOM_MONTHS = [4, 5]  # April and May

# Compare months numerically so that '4', '04' and 4 are all recognised.
# Values that cannot be read as a number become NaN and are excluded.
month_numbers = pd.to_numeric(sakura_df2['month'], errors='coerce')

# Select the April/May rows with a boolean mask rather than a row-by-row loop,
# and renumber the result from 0.
# NOTE(review): 'year' is not carried over (same as before), so rows from
# different years cannot be told apart in this frame - confirm that is intended.
sakura_bloom_df = (
    sakura_df2.loc[
        month_numbers.isin(BLOOM_MONTHS),
        ['month', 'day', 'temperature', 'flower_status'],
    ]
    .reset_index(drop=True)
)
sakura_bloom_df.head()


Unnamed: 0,month,day,temperature,flower_status
0,4,1,6.1,4
1,4,2,5.7,4
2,4,3,7.5,4
3,4,4,7.4,4
4,4,5,6.6,4
