#### Library

In [1]:
import pandas as pd

#### Import Data and Set Parameter

In [2]:
# Load the full measurement table from CSV.
df = pd.read_csv("complete_data_sorted.csv")
# Column name of the measurement that is paired with PM_Measurement below.
particle = 'RH_Measurement'

#### Functions

In [3]:
def create_particle_df(df, target):
    """Return the rows of ``df`` holding a ``target`` or PM reading, indexed by timestamp.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Year', 'Date', 'Time', 'AQS_Code',
        'Method Name', 'PM_Measurement' and ``target``.
    target : str
        Name of the measurement column to keep next to 'PM_Measurement'.

    Returns
    -------
    pandas.DataFrame
        Columns 'AQS_Code', 'Method Name', ``target`` and 'PM_Measurement',
        indexed by 'datetime' (parsed from ``Date + ' ' + Time``). Rows where
        both ``target`` and 'PM_Measurement' are null are excluded. The input
        frame is not modified.
    """
    has_reading = df[target].notna() | df['PM_Measurement'].notna()
    wanted = ['Year', 'Date', 'Time', 'AQS_Code', 'Method Name', target, 'PM_Measurement']
    subset = df.loc[has_reading, wanted]

    # Combine the separate date and time strings into one timestamp per row.
    stamps = pd.to_datetime(subset['Date'] + ' ' + subset['Time'])
    indexed = subset.assign(datetime=stamps).set_index('datetime')

    # The raw date parts are redundant once the datetime index exists.
    return indexed.drop(columns=['Year', 'Date', 'Time'])

def method_count_per_particle(df):
    """Print how many fully populated rows each 'Method Name' contributes.

    'PM_Measurement' is dropped first, so a row counts when every remaining
    column (including the particle measurement) is non-null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns 'PM_Measurement' and 'Method Name'.

    Returns
    -------
    None
        The counts are printed, not returned.
    """
    complete_rows = df.drop(columns='PM_Measurement').dropna()
    method_counts = complete_rows['Method Name'].value_counts()
    print(method_counts)

#### Get DataFrame where PM OR Particle is not null

In [4]:
# Build the datetime-indexed frame of rows that have either the chosen measurement or a PM value.
a = create_particle_df(df, particle)
a

Unnamed: 0_level_0,AQS_Code,Method Name,RH_Measurement,PM_Measurement
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01 08:00:00,1103,Met-one BAM-1020 W/PM2.5 SCC - Beta Attenuation,,46.0
2015-01-01 08:00:00,1103,Instrumental - Rotronic HC2-S3,49.0,
2015-01-01 08:00:00,1201,Met-One BAM-1020 W/PM2.5 SCC - Beta Attenuation,,95.0
2015-01-01 09:00:00,1103,Met-one BAM-1020 W/PM2.5 SCC - Beta Attenuation,,59.0
2015-01-01 09:00:00,1103,Instrumental - Rotronic HC2-S3,52.0,
...,...,...,...,...
2022-07-01 05:00:00,1201,Met-One BAM-1020 W/PM2.5 SCC - Beta Attenuation,,15.4
2022-07-01 06:00:00,1103,Met-one BAM-1020 W/PM2.5 SCC - Beta Attenuation,,13.2
2022-07-01 06:00:00,1201,Met-One BAM-1020 W/PM2.5 SCC - Beta Attenuation,,15.9
2022-07-01 07:00:00,1103,Met-one BAM-1020 W/PM2.5 SCC - Beta Attenuation,,13.7


#### Given particle, get number of occurrences per method

In [5]:
# Print, per method, the number of rows that are complete once PM_Measurement is ignored.
method_count_per_particle(a)

Instrumental - Met One 083D       95416
Instrumental - Rotronic HC2-S3    63013
Name: Method Name, dtype: int64


In [6]:
# Summary statistics for the rows recorded with the 'Instrumental - Met One 083D' method.
a[a['Method Name']=='Instrumental - Met One 083D'].describe()

Unnamed: 0,AQS_Code,RH_Measurement,PM_Measurement
count,95416.0,95416.0,0.0
mean,1606.337218,60.104605,
std,296.865719,24.242855,
min,1201.0,1.0,
25%,1201.0,42.0,
50%,1602.0,62.0,
75%,2005.0,80.0,
max,2005.0,100.0,


In [7]:
# Summary statistics for the rows recorded with the 'Instrumental - Rotronic HC2-S3' method.
a[a['Method Name']=='Instrumental - Rotronic HC2-S3'].describe()

Unnamed: 0,AQS_Code,RH_Measurement,PM_Measurement
count,63013.0,63013.0,0.0
mean,1154.331503,58.697221,
std,184.649177,23.142274,
min,1103.0,1.0,
25%,1103.0,42.0,
50%,1103.0,61.0,
75%,1103.0,78.0,
max,2005.0,100.0,


In [6]:
# Keep only the rows whose AQS_Code is 1103.
is_site_1103 = a['AQS_Code'].eq(1103)
a_1103 = a[is_site_1103]


In [7]:
# Rows sharing a timestamp are merged into one row by averaging each
# measurement column; mean() skips NaN, so each column keeps whichever
# reading was present for that hour.
aggregation_functions = dict.fromkeys(['RH_Measurement', 'PM_Measurement'], 'mean')
a_1103_new = a_1103.groupby(level=0).agg(aggregation_functions)


In [10]:
# Display the per-timestamp means for AQS_Code 1103.
a_1103_new

Unnamed: 0_level_0,RH_Measurement,PM_Measurement
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-01 08:00:00,49.0,46.0
2015-01-01 09:00:00,52.0,59.0
2015-01-01 10:00:00,55.0,66.0
2015-01-01 11:00:00,59.0,54.0
2015-01-01 12:00:00,56.0,50.0
...,...,...
2022-07-01 03:00:00,,13.5
2022-07-01 04:00:00,,13.2
2022-07-01 05:00:00,,11.8
2022-07-01 06:00:00,,13.2


In [11]:
# Earliest timestamp present for AQS_Code 1103.
a_1103_new.index.min()

Timestamp('2015-01-01 08:00:00')

In [25]:
# Number of hourly timestamps in the inclusive range 2015-01-01 08:00 .. 2022-07-01 07:00.
# date_range values are already unique, so no set() is needed; the lowercase "h"
# alias replaces "H", which is deprecated in recent pandas releases.
print(len(pd.date_range('2015-01-01 08:00:00', '2022-07-01 07:00:00', freq="h")))

65712


In [12]:
# Latest timestamp present for AQS_Code 1103.
a_1103_new.index.max()

Timestamp('2022-07-01 07:00:00')

In [12]:
# Count the hourly timestamps that are absent from the AQS_Code 1103 index.
# "h" replaces the deprecated "H" frequency alias.
expected = set(pd.date_range('2015-01-01 08:00:00', '2022-07-01 07:00:00', freq="h"))
actual = set(a_1103_new.index)
diff = expected - actual
print(len(diff))

0


In [13]:
# Count the missing values in RH_Measurement for AQS_Code 1103
# (vectorised, instead of a Python-level sum over every element).
a_1103_new['RH_Measurement'].isna().sum()

9170

In [11]:
# Put the frame on a regular hourly grid; hours without a row become all-NaN rows.
# '1h' replaces the deprecated '1H' frequency alias.
a_1103_new = a_1103_new.resample('1h').asfreq()

In [16]:
# Number of missing RH_Measurement values currently in the AQS_Code 1103 frame.
a_1103_new['RH_Measurement'].isna().sum()

0

In [15]:
# Forward fill down the rows: every NaN takes the last valid value above it.
# This applies to all columns of the frame, PM_Measurement included.
a_1103_new=a_1103_new.ffill(axis = 0) # forward fill (not a rolling window)


In [17]:
# Re-attach the site code, which the per-timestamp aggregation did not carry over.
a_1103_new = a_1103_new.assign(AQS_Code=1103)

In [26]:
# Same preparation as for AQS_Code 1103: select the site's rows, then average
# rows that share a timestamp into a single row.
is_site_1201 = a['AQS_Code'].eq(1201)
a_1201 = a[is_site_1201]
a_1201_new = a_1201.groupby(level=0).agg(aggregation_functions)


In [19]:
# Earliest timestamp present for AQS_Code 1201.
a_1201_new.index.min()

Timestamp('2015-01-01 08:00:00')

In [20]:
# Latest timestamp present for AQS_Code 1201.
a_1201_new.index.max()

Timestamp('2022-07-01 07:00:00')

In [27]:
# Count the hourly timestamps that are absent from the AQS_Code 1201 index.
# "1h" replaces the deprecated "1H" frequency alias.
expected = set(pd.date_range('2015-01-01 08:00:00', '2022-07-01 07:00:00', freq="1h"))
actual = set(a_1201_new.index)
diff = expected - actual
print(len(diff))

1762


In [36]:
# Number of missing RH_Measurement values currently in the AQS_Code 1201 frame.
a_1201_new['RH_Measurement'].isna().sum()

0

In [29]:
# Put the frame on a regular hourly grid; hours without a row become all-NaN rows.
# '1h' replaces the deprecated '1H' frequency alias.
a_1201_new = a_1201_new.resample('1h').asfreq()

In [33]:
# Forward fill down the rows: every NaN takes the last valid value above it.
# Applies to all columns; NaNs before the first valid value of a column stay NaN.
a_1201_new=a_1201_new.ffill(axis = 0)

In [35]:
# Replace the remaining missing RH values with the column mean.
# The result is assigned back explicitly: calling fillna(inplace=True) on a
# selected column is chained assignment, which emits a warning in pandas 2.x
# and does not update the parent frame once Copy-on-Write is enabled.
a_1201_new['RH_Measurement'] = a_1201_new['RH_Measurement'].fillna(a_1201_new['RH_Measurement'].mean())

In [37]:
# Re-attach the site code, which the per-timestamp aggregation did not carry over.
a_1201_new = a_1201_new.assign(AQS_Code=1201)

In [38]:
# Stack the two per-site frames row-wise (1103 first, then 1201), keeping each
# frame's datetime index. DataFrame.append was removed in pandas 2.0; pd.concat
# is the supported equivalent and gives the same result.
RH = pd.concat([a_1103_new, a_1201_new])

In [39]:
# Display the combined frame for both AQS codes.
RH

Unnamed: 0_level_0,RH_Measurement,PM_Measurement,AQS_Code
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-01-01 08:00:00,49.0,46.0,1103
2015-01-01 09:00:00,52.0,59.0,1103
2015-01-01 10:00:00,55.0,66.0,1103
2015-01-01 11:00:00,59.0,54.0,1103
2015-01-01 12:00:00,56.0,50.0,1103
...,...,...,...
2022-07-01 03:00:00,18.0,11.2,1201
2022-07-01 04:00:00,18.0,13.0,1201
2022-07-01 05:00:00,18.0,15.4,1201
2022-07-01 06:00:00,18.0,15.9,1201


In [40]:
# Sanity check: number of missing RH_Measurement values left in the combined frame.
RH['RH_Measurement'].isna().sum()

0

In [1]:
# Persist the combined frame. to_csv writes the index by default, so the
# 'datetime' index becomes the first column of the CSV.
RH.to_csv('RH.csv')

NameError: name 'RH' is not defined

In [2]:
# Reload the combined frame from disk (presumably after a kernel restart, hence the re-import).
import pandas as pd
# No index_col/parse_dates is passed, so 'datetime' is read back as a regular
# column and the frame gets a default integer index.
RH=pd.read_csv('RH.csv')

In [8]:
# Summary statistics of the filled data for AQS_Code 1103.
RH[RH['AQS_Code']==1103].describe()

Unnamed: 0,RH_Measurement,PM_Measurement,AQS_Code
count,65712.0,65712.0,65712.0
mean,62.938063,15.35802,1103.0
std,23.283054,11.157344,0.0
min,1.0,-6.0,1103.0
25%,47.0,9.0,1103.0
50%,67.0,14.0,1103.0
75%,84.0,19.0,1103.0
max,100.0,541.0,1103.0


In [9]:
# Summary statistics of the filled data for AQS_Code 1201.
RH[RH['AQS_Code']==1201].describe()

Unnamed: 0,RH_Measurement,PM_Measurement,AQS_Code
count,65712.0,65712.0,65712.0
mean,43.697962,14.506716,1201.0
std,21.002751,10.350917,0.0
min,1.0,-3.7,1201.0
25%,27.0,8.3,1201.0
50%,43.697962,12.8,1201.0
75%,47.0,19.1,1201.0
max,100.0,995.6,1201.0


In [4]:
# Select the rows for a single timestamp; 'datetime' is matched against a string literal here.
RH[RH['datetime']=='2015-01-01 08:00:00']


RH

Unnamed: 0,datetime,RH_Measurement,PM_Measurement,AQS_Code
0,2015-01-01 08:00:00,49.0,46.0,1103
65712,2015-01-01 08:00:00,43.697962,95.0,1201
