In [70]:
%%file database_operations.py

# The %%file cell magic at the top of this cell writes everything below it to database_operations.py, the script that a later notebook cell runs.

import re
import json
from mrjob.job import MRJob
# QUALITY_RE is a compiled character class: it matches one character that is
# 0, 1, 4, 5 or 9. The mapper applies it to the one-character quality flag of
# each record and keeps the record only when the flag is one of those digits.
QUALITY_RE = re.compile(r"[01459]")

class TemperatureData(MRJob):
    """MapReduce job that summarises temperature readings per wind value.

    For every distinct wind value found in the input records, the job emits
    the lowest temperature, the highest temperature and the number of
    accepted records that carried that wind value.
    """

    # Shortest line that still contains the complete temperature field,
    # which is read from characters 87..91 (slice end 92).
    _MIN_RECORD_LENGTH = 92

    def mapper(self, _, line):
        """Extract one (wind, temperature) observation from a fixed-width line.

        Args:
            _: Input key supplied by the framework; unused.
            line: One raw text record from the input file.

        Yields:
            ``(wind, {"temp": temp, "count": 1})`` where ``wind`` and ``temp``
            are the integer values parsed from the record. ``count`` is always
            1 because a line holds at most one observation. Nothing is yielded
            for records that are too short, carry the "missing" sentinels
            (``+9999`` for temperature, ``999`` for wind) or have a quality
            flag outside ``QUALITY_RE``.
        """
        record = line.strip()

        # A truncated line would make int() fail on an empty temperature
        # field, or silently parse a shortened one; skip it instead.
        if len(record) < self._MIN_RECORD_LENGTH:
            return

        wind = record[60:63]
        temp = record[87:92]
        # NOTE(review): this flag is read from the column immediately after
        # the wind field, so it presumably grades the wind reading rather
        # than the temperature - confirm against the record layout.
        quality = record[63:64]

        if temp == "+9999" or wind == "999":
            return
        if not QUALITY_RE.match(quality):
            return

        # The temperature is passed on as the raw integer encoded in the
        # record; no unit scaling is applied here.
        yield int(wind), {"temp": int(temp), "count": 1}

    def reducer(self, key, values):
        """Combine all observations that share one wind value.

        Args:
            key: The wind value emitted by the mapper.
            values: Iterable of ``{"temp": int, "count": int}`` dictionaries.

        Yields:
            ``(key, {"low": min_temp, "high": max_temp, "count": total})``.
            Nothing is yielded if ``values`` is empty.
        """
        count = 0
        # Start from "nothing seen yet" instead of fixed sentinels, so that
        # groups containing only negative temperatures, or only temperatures
        # above an arbitrary ceiling, still report their true extremes.
        low_temp = None
        high_temp = None

        for observation in values:
            count += observation["count"]
            temp = observation["temp"]

            # Two independent checks: a single value can be both the new
            # maximum and the new minimum (e.g. the first value of a group).
            if high_temp is None or temp > high_temp:
                high_temp = temp
            if low_temp is None or temp < low_temp:
                low_temp = temp

        if low_temp is None:
            return

        yield key, {"low": low_temp, "high": high_temp, "count": count}
    
            
        
        
    
# Start the job only when this file is executed as a script (for example
# `python database_operations.py -r local 1901 1902`), not when it is imported.
if __name__ == '__main__':
    TemperatureData.run()

Overwriting database_operations.py


In [1]:
!python database_operations.py -r local 1901 1902

No configs found; falling back on auto-configuration
No configs specified for local runner
Creating temp directory /var/folders/fx/spwsyt2x2t7fkh0qrj_tt39r0000gn/T/database_operations.alexsanna.20230224.084904.188229
Running step 1 of 1...
job output is in /var/folders/fx/spwsyt2x2t7fkh0qrj_tt39r0000gn/T/database_operations.alexsanna.20230224.084904.188229/output
Streaming final output from /var/folders/fx/spwsyt2x2t7fkh0qrj_tt39r0000gn/T/database_operations.alexsanna.20230224.084904.188229/output...
290	{"low":-328,"high":306,"count":379}
320	{"low":-311,"high":306,"count":1152}
20	{"low":-272,"high":317,"count":582}
200	{"low":-183,"high":300,"count":688}
230	{"low":-228,"high":283,"count":1488}
250	{"low":-222,"high":311,"count":604}
270	{"low":-211,"high":278,"count":931}
70	{"low":-333,"high":278,"count":502}
90	{"low":-267,"high":272,"count":567}
160	{"low":-239,"high":289,"count":647}
180	{"low":-250,"high":294,"count":879}
340	{"low":-300,"high":311,"count":427}
360	{"low":-267