In [10]:
#%pip install pyarrow
import sys
from pathlib import Path
sys.path.append(str(Path().absolute().parent / "src"))
from IPython.display import display, HTML
# This CSS forces the table to stay wide and provides a horizontal scrollbar
style = """
<style>
    .rendered_html table {
        display: block;
        overflow-x: auto;
        white-space: nowrap !important;
    }
</style>
"""
display(HTML(style))



In [11]:
import numpy as np
import pandas as pd
import io
from openweather_pipeline.s3_operations import S3Operations
from openweather_pipeline.config_manager import get_config


In [12]:
config = get_config().config
source_bucket = config.get("s3", {}).get("buckets", {}).get("source_bucket")
processed_prefix = config.get("s3", {}).get("buckets", {}).get("processed_prefix")
processed_file_name = (config.get("s3", {}).get("buckets", {}).get("processed_file_name"))
region = config.get("aws", {}).get("region", "us-east-1")


In [13]:
# Read processed parquet file
process_file_key =f"{processed_prefix}/{processed_file_name}"
s3_operations = S3Operations(bucket=source_bucket, region=region)
content = s3_operations.read_file_as_bytes(process_file_key)
df= pd.read_parquet(io.BytesIO(content))
display(df.head())

Unnamed: 0,date,cloud_cover,humidity,precipitation,pressure,temperature,temperature_min,temperature_max,wind_speed,wind_direction,zipcode
0,2020-01-01,75.0,41.0,0.0,1008.0,38.35,30.74,39.87,14.99,270.0,zipcode=10002
1,2020-01-01,75.0,41.0,0.25,1008.0,38.37,30.83,39.83,18.34,270.0,zipcode=11101
2,2020-01-02,1.0,45.0,1.01,1015.0,45.82,27.99,47.03,13.96,247.0,zipcode=10001
3,2020-01-02,1.0,45.0,1.01,1015.0,45.82,27.99,47.03,13.96,247.0,zipcode=10001
4,2020-01-02,1.0,45.0,1.01,1015.0,45.82,27.99,47.03,13.96,247.0,zipcode=10001


In [26]:
#clean zipcode data
mask = df['zipcode'].str.startswith('zipcode', na=False)
df.loc[mask, 'zipcode_clean']= df.loc[mask,'zipcode'].str.split('=').str[1]

In [28]:
df = df.drop(columns=['zipcode'])
df= df.rename(columns={'zipcode_clean':'zipcode'})

In [29]:
df

Unnamed: 0,date,cloud_cover,humidity,precipitation,pressure,temperature,temperature_min,temperature_max,wind_speed,wind_direction,zipcode
0,2020-01-01,75.0,41.0,0.00,1008.0,38.35,30.74,39.87,14.99,270.0,10002
1,2020-01-01,75.0,41.0,0.25,1008.0,38.37,30.83,39.83,18.34,270.0,11101
2,2020-01-02,1.0,45.0,1.01,1015.0,45.82,27.99,47.03,13.96,247.0,10001
3,2020-01-02,1.0,45.0,1.01,1015.0,45.82,27.99,47.03,13.96,247.0,10001
4,2020-01-02,1.0,45.0,1.01,1015.0,45.82,27.99,47.03,13.96,247.0,10001
...,...,...,...,...,...,...,...,...,...,...,...
799,2020-06-06,1.0,74.0,6.36,1009.0,82.92,68.54,87.15,18.23,333.0,11101
800,2020-06-06,20.0,69.0,5.88,1009.0,83.50,70.09,87.40,18.95,334.0,11201
801,2020-06-06,75.0,51.0,0.00,1007.0,80.71,62.82,83.39,23.04,270.0,12084
802,2020-06-07,20.0,44.0,0.25,1013.0,73.89,64.22,77.50,15.43,326.0,10003


In [31]:
# check for missing values
df.isnull().sum()

date               0
cloud_cover        0
humidity           0
precipitation      0
pressure           0
temperature        0
temperature_min    0
temperature_max    0
wind_speed         0
wind_direction     0
zipcode            0
dtype: int64

In [63]:
#define ranges for weather parameters
weather_ranges = {
    'cloud_cover':{'max':100,'min':0}, 
    'humidity':{'max':100,'min':0}, 
    'precipitation':{'max':120,'min':-20}, 
    'pressure':{'max':1100,'min':900}, 
    'temperature':{'max':120,'min':-20}, 
    'temperature_min':{'max':120,'min':-20}, 
    'temperature_max':{'max':120,'min':-20}, 
    'wind_speed':{'max':100,'min':0}, 
    'wind_direction':{'max':360,'min':0} 
} 
weather_ranges

{'cloud_cover': {'max': 100, 'min': 0},
 'humidity': {'max': 100, 'min': 0},
 'precipitation': {'max': 120, 'min': -20},
 'pressure': {'max': 1100, 'min': 900},
 'temperature': {'max': 120, 'min': -20},
 'temperature_min': {'max': 120, 'min': -20},
 'temperature_max': {'max': 120, 'min': -20},
 'wind_speed': {'max': 100, 'min': 0},
 'wind_direction': {'max': 360, 'min': 0}}

In [64]:
#check for outliers
outlier_counts=[]
col_count=0
for col in df.columns:
    if col in weather_ranges:
        col_count = len(df[(df[col] > weather_ranges.get(col,'{}').get('max')) | (df[col] < weather_ranges.get(col,'{}').get('min'))])
    if col_count > 0:
        outlier_counts.append({col:col_count})
        col_count=0
display(outlier_counts)

[]

In [68]:
# check for duplicates
df[df.duplicated(subset=["date","zipcode"], keep=False)]
#remove duplicates
df_non_dups= df.drop_duplicates(subset=["date","zipcode"], keep='first')
            
            
            

In [75]:
#Verify dups removed
check_dups = df_non_dups[df_non_dups.duplicated(subset=["date", "zipcode"], keep=False)]
check_dups

Unnamed: 0,date,cloud_cover,humidity,precipitation,pressure,temperature,temperature_min,temperature_max,wind_speed,wind_direction,zipcode
