In [2]:
import pandas as pd

# Load the raw weather observations from a CSV file located next to the notebook.
file_path = 'weather.csv'
weather_data = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Handling missing values: propagate the last valid observation forward in every column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is deprecated
# in pandas >= 2.1. Rebinding the name (rather than inplace=True) keeps the step explicit.
# Note: forward-fill cannot fill gaps in the very first row(s); those remain NaN.
weather_data = weather_data.ffill()

In [4]:
# Handling outliers: cap each numeric column at its IQR fences.
# Values outside [Q1 - k*IQR, Q3 + k*IQR] are clipped to the nearest fence; no rows
# are dropped, so the frame keeps its original length.
outlier_threshold = 3  # NOTE(review): not read by this cell; kept in case another cell uses it
iqr_multiplier = 1.5   # k: fence distance from the quartiles, in units of IQR
numeric_columns = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
                    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
                    'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
                    'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RISK_MM']

# NOTE(review): the recorded output shows Rainfall capped at 0.5, which suggests the
# fences are very tight for mostly-zero columns — confirm this is intended.
for column in numeric_columns:
    # Quartiles and inter-quartile range of the current column
    Q1 = weather_data[column].quantile(0.25)
    Q3 = weather_data[column].quantile(0.75)
    IQR = Q3 - Q1

    # Fences derived from the single multiplier constant above
    lower_fence = Q1 - iqr_multiplier * IQR
    upper_fence = Q3 + iqr_multiplier * IQR

    # Clip (do not remove) values lying outside the fences
    weather_data[column] = weather_data[column].clip(lower=lower_fence, upper=upper_fence)

In [5]:
# Encode the two Yes/No rain indicators as 0/1 using one shared lookup table.
# Series.map turns any value that is not a key of the table into NaN.
yes_no_codes = {'No': 0, 'Yes': 1}
for rain_flag in ('RainToday', 'RainTomorrow'):
    weather_data[rain_flag] = weather_data[rain_flag].map(yes_no_codes)

In [6]:
# Show the frame after forward-filling, IQR clipping and Yes/No encoding
print(weather_data)

     MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine WindGustDir  \
0        8.0     24.3       0.0          3.4       6.3          NW   
1       14.0     26.9       0.5          4.4       9.7         ENE   
2       13.7     23.4       0.5          5.8       3.3          NW   
3       13.3     15.5       0.5          7.2       9.1          NW   
4        7.6     16.1       0.5          5.6      10.6         SSE   
..       ...      ...       ...          ...       ...         ...   
361      9.0     30.7       0.0          7.6      12.1         NNW   
362      7.1     28.4       0.0         11.6      12.7           N   
363     12.5     19.9       0.0          8.4       5.3         ESE   
364     12.5     26.9       0.0          5.0       7.1          NW   
365     12.3     30.2       0.0          6.0      12.6          NW   

     WindGustSpeed WindDir9am WindDir3pm  WindSpeed9am  ...  Humidity3pm  \
0             30.0         SW         NW           6.0  ...         29.0   
1      

In [7]:
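# # Feature Scaling (disabled)
# # NOTE: if re-enabled at this point, the np.log1p step in a later cell would receive
# # standardized values, and any value below -1 would become NaN; scale after the log
# # transform instead.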
# from sklearn.preprocessing import StandardScaler

# # Extract numerical columns for scaling
# numeric_columns = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
#                     'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
#                     'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
#                     'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RISK_MM']

# # Initialize the scaler
# scaler = StandardScaler()

# # Scale numerical features
# weather_data[numeric_columns] = scaler.fit_transform(weather_data[numeric_columns])

In [8]:
# One-hot encode the three wind-direction columns. drop_first=True omits the first
# category of each column so the indicator columns are not perfectly collinear.
wind_direction_columns = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
weather_data = pd.get_dummies(
    weather_data,
    columns=wind_direction_columns,
    drop_first=True,
)

In [9]:
import numpy as np

# Compress the listed columns with log(1 + x). MinTemp, MaxTemp and WindGustSpeed
# are not in the list and are left on their original scale.
skewed_columns = [
    'Rainfall', 'Evaporation', 'Sunshine', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
    'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RISK_MM',
]

# np.log1p is a ufunc, so it can be applied to the whole sub-frame element-wise
weather_data[skewed_columns] = np.log1p(weather_data[skewed_columns])

In [10]:
# Show the frame after one-hot encoding the wind directions and applying log1p
print(weather_data)

     MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  WindGustSpeed  \
0        8.0     24.3  0.000000     1.481605  1.987874           30.0   
1       14.0     26.9  0.405465     1.686399  2.370244           39.0   
2       13.7     23.4  0.405465     1.916923  1.458615           68.5   
3       13.3     15.5  0.405465     2.104134  2.312535           54.0   
4        7.6     16.1  0.405465     1.887070  2.451005           50.0   
..       ...      ...       ...          ...       ...            ...   
361      9.0     30.7  0.000000     2.151762  2.572612           68.5   
362      7.1     28.4  0.000000     2.533697  2.617396           48.0   
363     12.5     19.9  0.000000     2.240710  1.840550           43.0   
364     12.5     26.9  0.000000     1.791759  2.091864           46.0   
365     12.3     30.2  0.000000     1.945910  2.610070           68.5   

     WindSpeed9am  WindSpeed3pm  Humidity9am  Humidity3pm  ...  \
0        1.945910      3.044522     4.234107     3.401197

In [11]:
# Persist the preprocessed frame as CSV; the row index is not written to the file.
output_file_path = 'preprocessed_weather_data.csv'
weather_data.to_csv(path_or_buf=output_file_path, index=False)