Step1: for this assignment you can provide your own data. The data must be pulled/scraped from online resources (as covered in other courses) to obtain full credit of this step. For example, you can scrape tweeter data or Kijiji data etc.

In [62]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


### Step 1

In [42]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1548892800,2019-01-31 00:00:00 +0000 UTC,-18000,Toronto,43.653226,-79.383184,-17.57,1207.0,-21.6,-24.57,...,20.06,,,,,40,600,Snow,light snow,13n
1,1548896400,2019-01-31 01:00:00 +0000 UTC,-18000,Toronto,43.653226,-79.383184,-18.42,402.0,-22.58,-25.42,...,19.0,,,,,40,600,Snow,light snow,13n
2,1548900000,2019-01-31 02:00:00 +0000 UTC,-18000,Toronto,43.653226,-79.383184,-19.24,3218.0,-23.07,-26.24,...,20.6,,,,,100,600,Snow,light snow,13n
3,1548903600,2019-01-31 03:00:00 +0000 UTC,-18000,Toronto,43.653226,-79.383184,-19.4,10000.0,-23.83,-26.4,...,18.0,,,,,20,801,Clouds,few clouds,02n
4,1548907200,2019-01-31 04:00:00 +0000 UTC,-18000,Toronto,43.653226,-79.383184,-19.71,10000.0,-24.44,-26.71,...,17.0,,,,,0,800,Clear,sky is clear,01n


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44256 entries, 0 to 44255
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   dt                   44256 non-null  int64  
 1   dt_iso               44256 non-null  object 
 2   timezone             44256 non-null  int64  
 3   city_name            44256 non-null  object 
 4   lat                  44256 non-null  float64
 5   lon                  44256 non-null  float64
 6   temp                 44256 non-null  float64
 7   visibility           35094 non-null  float64
 8   dew_point            44256 non-null  float64
 9   feels_like           44256 non-null  float64
 10  temp_min             44256 non-null  float64
 11  temp_max             44256 non-null  float64
 12  pressure             44256 non-null  int64  
 13  sea_level            0 non-null      float64
 14  grnd_level           0 non-null      float64
 15  humidity             44256 non-null 

### Step 2

Data Cleaning

In [44]:
#Dropping Irrelevant Columns
df.drop(columns=['sea_level', 'grnd_level'], inplace=True)
# Drop timezone columns
df.drop(columns=['timezone'], inplace=True)
# Drop location columns as all the data is for the same location
df.drop(columns=['city_name', 'lat', 'lon', 'weather_id', 'weather_icon'], inplace=True)

In [45]:
# Count missing values per column
missing_values = df.isnull().sum()
print(missing_values)

dt                         0
dt_iso                     0
temp                       0
visibility              9162
dew_point                  0
feels_like                 0
temp_min                   0
temp_max                   0
pressure                   0
humidity                   0
wind_speed                 0
wind_deg                   0
wind_gust              23716
rain_1h                36961
rain_3h                43924
snow_1h                42993
snow_3h                44206
clouds_all                 0
weather_main               0
weather_description        0
dtype: int64


In [46]:
# Fill rain and snow columns with 0 since it likely means no rain/snow occurred.
df['rain_1h'] = df['rain_1h'].fillna(0)
df['rain_3h'] = df['rain_3h'].fillna(0)
df['snow_1h'] = df['snow_1h'].fillna(0)
df['snow_3h'] = df['snow_3h'].fillna(0)

# For wind gust, fill with 0 assuming no gusts if no data was recorded
df['wind_gust'] = df['wind_gust'].fillna(0)  # or df['wind_gust'].interpolate(method='linear')

# For visibility fill with median 
df['visibility'] = df['visibility'].fillna(df['visibility'].median())


Formatting and Structuring

In [47]:
# Renaming Columns to make column names more descriptive and standardized.

df.rename(columns={
    'main.temp': 'temp',
    'main.feels_like': 'feels_like_temp',
    'main.pressure': 'pressure',
    'main.humidity': 'humidity',
    'main.temp_min': 'temp_min',
    'main.temp_max': 'temp_max',
    'clouds.all': 'cloud_coverage',
    'weather.main': 'weather_type',
    'weather.description': 'weather_description'
}, inplace=True)

Validation

In [49]:
# wind speed should not be negative, and humidity should range between 0 and 100.
assert df['humidity'].between(0, 100).all(), "Invalid humidity values"


In [50]:
assert (df['wind_speed'] >= 0).all(), "Negative wind speed values"

In [51]:
# Data Consistency: Ensure that related columns are consistent.the temp_min column should always be less than or equal to temp_max.
df = df[df['temp_min'] <= df['temp_max']]

Feature Enginneering

In [54]:
df['dt_iso'] = pd.to_datetime(df['dt_iso'], format='%Y-%m-%d %H:%M:%S +0000 UTC')  # Specify format
df['year'] = df['dt_iso'].dt.year
df['month'] = df['dt_iso'].dt.month
df['day'] = df['dt_iso'].dt.day
df['hour'] = df['dt_iso'].dt.hour

In [59]:
# Drop the 'dt' and 'weather_description' columns
df.drop(columns=['dt', 'weather_description', 'dt_iso'], inplace=True)

Normalization

In [63]:
# List of numerical columns to normalize
num_cols = [
    'temp', 'visibility', 'dew_point', 'feels_like',
    'temp_min', 'temp_max', 'pressure', 'humidity',
    'wind_speed', 'wind_deg', 'wind_gust', 
    'rain_1h', 'rain_3h', 'snow_1h', 'snow_3h',
    'clouds_all', 'year', 'month', 'day'
]

# Initialize the scaler
scaler = MinMaxScaler()

# Normalize the numerical features
df[num_cols] = scaler.fit_transform(df[num_cols])

In [65]:
# Encode categorical data
df = pd.get_dummies(df, columns=['weather_main'], drop_first=True)

In [66]:
df.head()

Unnamed: 0,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,...,month,day,hour,weather_main_Clouds,weather_main_Fog,weather_main_Haze,weather_main_Mist,weather_main_Rain,weather_main_Snow,weather_main_Thunderstorm
0,0.053697,0.120612,0.099453,0.045545,0.081729,0.053928,0.552239,0.623529,0.690741,0.666667,...,0.0,1.0,0,False,False,False,False,False,True,False
1,0.038329,0.040104,0.080305,0.03251,0.064228,0.035952,0.567164,0.611765,0.712963,0.694444,...,0.0,1.0,1,False,False,False,False,False,True,False
2,0.023504,0.321732,0.070731,0.019936,0.046727,0.02265,0.567164,0.635294,0.787037,0.666667,...,0.0,1.0,2,False,False,False,False,False,True,False
3,0.020611,1.0,0.055881,0.017482,0.046727,0.017976,0.567164,0.588235,0.712963,0.694444,...,0.0,1.0,3,True,False,False,False,False,False,False
4,0.015006,1.0,0.043962,0.012728,0.029226,0.017976,0.567164,0.564706,0.62037,0.694444,...,0.0,1.0,4,False,False,False,False,False,False,False
