In [1]:
# packages
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the weather CSV into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
csv_path = 'resources/weather_burbank_airport.csv'
df = pd.read_csv(csv_path)

Get a basic overview of the data:

In [3]:
# Preview the first four rows, followed by the (rows, columns) shape.
print(df.head(4), df.shape, sep='\n')

      city            timestamp  temperature  cloud_cover  \
0  Burbank  2018-01-01 08:53:00          9.0         33.0   
1  Burbank  2018-01-01 09:53:00          9.0         33.0   
2  Burbank  2018-01-01 10:53:00          9.0         21.0   
3  Burbank  2018-01-01 11:53:00          9.0         29.0   

  cloud_cover_description  pressure  windspeed  precipitation  \
0                    Fair    991.75        9.0            0.0   
1                    Fair    992.08        0.0            0.0   
2                    Haze    992.08        0.0            0.0   
3           Partly Cloudy    992.08        0.0            0.0   

   felt_temperature  
0               8.0  
1               9.0  
2               9.0  
3               9.0  
(29244, 9)


First, we check for duplicates:

In [4]:
# Count the rows that repeat an earlier row in every column.
df.duplicated().sum()

0

There are no duplicate rows. Next, we check for missing values:

In [5]:
# Count the missing values in each column.
df.isnull().sum()

city                        0
timestamp                   0
temperature                25
cloud_cover                20
cloud_cover_description    20
pressure                    8
windspeed                  86
precipitation               0
felt_temperature           26
dtype: int64

Compared to the size of the full dataset, the number of missing values is negligible: no column has more than 86 missing entries among 29,244 rows (under 0.3%). We therefore don't lose much if we simply drop the corresponding rows.

In [6]:
# Discard every row that has a missing value in at least one column.
df = df.dropna(axis='index', how='any')


Now we check the datatypes and transform into appropriate datatypes, if necessary:

In [7]:
# Show the dtype pandas inferred for each column before any conversion.
df.dtypes

city                        object
timestamp                   object
temperature                float64
cloud_cover                float64
cloud_cover_description     object
pressure                   float64
windspeed                  float64
precipitation              float64
felt_temperature           float64
dtype: object

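These will be our changes:  

    timestamp -> timestamp(datetime)
    temperature -> float (already float64, no conversion needed)
    cloud_cover -> float (already float64; the cast only makes the intended type explicit)
    cloud_cover_description -> one-hot-encoding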


In [8]:
# Parse the timestamp strings into datetime64 values.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# cloud_cover was already read as float64, so this cast changes nothing; it
# only makes the intended dtype explicit.
df['cloud_cover'] = df['cloud_cover'].astype(float)

# One-hot encode the textual description. The indicator dtype is pinned
# because the get_dummies default changed from uint8 to bool in pandas 2.0;
# uint8 keeps the indicators as 0/1 integers on every pandas version.
df = pd.get_dummies(df, columns=['cloud_cover_description'], dtype='uint8')

After inspecting the data, we decided on two final changes:  

    Drop city column, since there is only one unique entry  

    cloud_cover_description has many possible values. We neglect those that occur fewer than 300 times in the dataset, but we don't want to drop the affected rows, since their other columns may still contain valuable information. Instead, we flag these rows with a column 'cloud_cover_to_neglect'.

In [9]:
# Remove the 'city' column from the frame.
df = df.drop('city', axis='columns')


In [10]:
# One-hot indicator columns that were generated from cloud_cover_description.
prefix = 'cloud_cover_description_'
cloud_cover_cols = df.columns[df.columns.str.startswith(prefix)].tolist()

# Number of rows carrying each description (column sums of the indicators).
category_counts = df[cloud_cover_cols].sum()

# Descriptions that occur in fewer than `threshold` rows count as negligible.
threshold = 300
categories_to_neglect = [
    name for name, count in category_counts.items() if count < threshold
]

# Flag rows whose description is one of the rare categories: 1 if so, else 0.
mask_to_neglect = df.loc[:, categories_to_neglect].any(axis='columns')
df['cloud_cover_to_neglect'] = mask_to_neglect.astype(int)

# The rare indicator columns themselves are kept. If only the summary flag is
# wanted, they can be removed with df.drop(columns=categories_to_neglect).

# Check the result: distribution of the flag, then a sample next to the
# indicator columns.
print("Distribution of 'cloud_cover_to_neglect':",
      df['cloud_cover_to_neglect'].value_counts(),
      sep='\n')

print("\nSample DataFrame with 'cloud_cover_to_neglect':",
      df[['cloud_cover_to_neglect', *cloud_cover_cols]].head(),
      sep='\n')


Distribution of 'cloud_cover_to_neglect':
0    28566
1      539
Name: cloud_cover_to_neglect, dtype: int64

Sample DataFrame with 'cloud_cover_to_neglect':
   cloud_cover_to_neglect  cloud_cover_description_Blowing Dust  \
0                       0                                     0   
1                       0                                     0   
2                       0                                     0   
3                       0                                     0   
4                       0                                     0   

   cloud_cover_description_Cloudy  cloud_cover_description_Cloudy / Windy  \
0                               0                                       0   
1                               0                                       0   
2                               0                                       0   
3                               0                                       0   
4                               0                       

In [11]:
# Display the first 15 rows of the frame after all transformations above.
df.head(15)

Unnamed: 0,timestamp,temperature,cloud_cover,pressure,windspeed,precipitation,felt_temperature,cloud_cover_description_Blowing Dust,cloud_cover_description_Cloudy,cloud_cover_description_Cloudy / Windy,...,cloud_cover_description_Mostly Cloudy / Windy,cloud_cover_description_Partly Cloudy,cloud_cover_description_Partly Cloudy / Windy,cloud_cover_description_Rain,cloud_cover_description_Rain / Windy,cloud_cover_description_Smoke,cloud_cover_description_T-Storm,cloud_cover_description_Thunder,cloud_cover_description_Thunder in the Vicinity,cloud_cover_to_neglect
0,2018-01-01 08:53:00,9.0,33.0,991.75,9.0,0.0,8.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2018-01-01 09:53:00,9.0,33.0,992.08,0.0,0.0,9.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2018-01-01 10:53:00,9.0,21.0,992.08,0.0,0.0,9.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2018-01-01 11:53:00,9.0,29.0,992.08,0.0,0.0,9.0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,2018-01-01 12:53:00,8.0,33.0,992.08,0.0,0.0,8.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2018-01-01 13:53:00,8.0,33.0,992.08,0.0,0.0,8.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,2018-01-01 14:53:00,7.0,30.0,992.08,0.0,0.0,7.0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7,2018-01-01 15:53:00,8.0,34.0,992.41,0.0,0.0,8.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2018-01-01 16:53:00,12.0,34.0,993.39,0.0,0.0,12.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,2018-01-01 17:53:00,16.0,34.0,994.05,0.0,0.0,16.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
