# PREPROCESSING DATA
## 1. Missing values
## 2. Outliers

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import plotly.figure_factory as ff

# Route every fig.show() call in this notebook through plotly's "vscode" renderer.
# NOTE(review): presumably chosen because the notebook is edited in VS Code —
# pick a different renderer when running it in another front end.
pio.renderers.default = "vscode" 

In [2]:
# Load the raw weather table; the path is relative to the working directory.
data = pd.read_csv("weather_data.csv")

In [3]:
# Preview the first rows to confirm the file parsed into the expected columns.
data.head()

Unnamed: 0,City,Year,Month,Max Temperature,Avg Temperature,Min Temperature,Dew Point,Precipitation,Snowdepth,Wind,Gust Wind,Sea Level Pressure
0,Barcelona,2013,January,58.26,50.55,44.61,38.14,0.0,0.0,9.79,0.86,29.95
1,Munich,2013,January,35.81,32.02,26.61,28.17,0.0,0.0,8.59,1.61,28.38
2,Lisbon,2013,January,58.23,53.68,49.87,48.03,0.0,0.0,6.85,11.34,29.87
3,Athens,2013,January,56.97,49.75,41.26,38.46,0.0,0.0,8.09,0.85,29.59
4,Prague,2013,January,33.32,29.84,25.9,25.0,0.0,0.0,9.95,0.62,28.61


In [4]:
# Dtypes and non-null counts per column: shows which columns contain gaps.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10783 entries, 0 to 10782
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   City                10783 non-null  object 
 1   Year                10783 non-null  int64  
 2   Month               10783 non-null  object 
 3   Max Temperature     8694 non-null   float64
 4   Avg Temperature     8694 non-null   float64
 5   Min Temperature     8694 non-null   float64
 6   Dew Point           8694 non-null   float64
 7   Precipitation       8694 non-null   float64
 8   Snowdepth           8694 non-null   float64
 9   Wind                8694 non-null   float64
 10  Gust Wind           8694 non-null   float64
 11  Sea Level Pressure  8694 non-null   float64
dtypes: float64(9), int64(1), object(2)
memory usage: 1011.0+ KB


In [5]:
# Summary statistics for the numeric columns; scan min/max for implausible values.
# NOTE(review): a minimum of 0.0 for "Sea Level Pressure" looks like a placeholder
# rather than a real measurement — confirm against the data source.
data.describe()

Unnamed: 0,Year,Max Temperature,Avg Temperature,Min Temperature,Dew Point,Precipitation,Snowdepth,Wind,Gust Wind,Sea Level Pressure
count,10783.0,8694.0,8694.0,8694.0,8694.0,8694.0,8694.0,8694.0,8694.0,8694.0
mean,2018.029213,73.350536,65.183983,56.520622,52.639294,0.008515,0.00079,7.934275,1.282349,28.926942
std,3.167959,16.709957,16.010861,15.722277,15.262473,0.041427,0.017355,2.494362,2.361955,2.476377
min,2013.0,11.48,3.27,-5.17,-6.75,0.0,0.0,0.78,0.0,0.0
25%,2015.0,62.9475,54.68,45.6625,41.9,0.0,0.0,6.25,0.06,29.25
50%,2018.0,75.94,66.94,56.97,52.775,0.0,0.0,7.71,0.32,29.69
75%,2021.0,86.6575,78.67,69.365,64.6875,0.0,0.0,9.41,1.31,29.87
max,2023.0,112.75,100.84,94.47,80.78,0.7,0.72,19.28,17.12,30.43


In [6]:
# Dtype of every column (the same information as the Dtype column of data.info()).
data.dtypes

City                   object
Year                    int64
Month                  object
Max Temperature       float64
Avg Temperature       float64
Min Temperature       float64
Dew Point             float64
Precipitation         float64
Snowdepth             float64
Wind                  float64
Gust Wind             float64
Sea Level Pressure    float64
dtype: object

# **Duplicate Values**

In [7]:
# Count rows that repeat an earlier row in every column.
data.duplicated().sum()

501

In [8]:
# Remove the exact duplicates, keeping the first occurrence of each row.
# The index is not reset, so the surviving rows keep their original labels.
data = data.drop_duplicates()

In [9]:
# Sanity check: no duplicated rows should remain after the drop.
data.duplicated().sum()

0

# **FILLING MISSING VALUES**

In [10]:
# Number of missing values in each column before imputation.
data.isnull().sum()

City                     0
Year                     0
Month                    0
Max Temperature       2043
Avg Temperature       2043
Min Temperature       2043
Dew Point             2043
Precipitation         2043
Snowdepth             2043
Wind                  2043
Gust Wind             2043
Sea Level Pressure    2043
dtype: int64

In [11]:
# (rows, columns) of the de-duplicated frame.
data.shape

(10282, 12)

In [12]:
# One histogram per float column, to judge the shape of each distribution
# before deciding how to impute its missing values.
float_column_names = data.select_dtypes(include=['float']).columns

for feature in float_column_names:
    histogram = px.histogram(
        data,
        x=feature,
        nbins=30,
        title=f"Distribution of {feature}",
        labels={feature: feature, 'count': 'Count'},
    )
    histogram_layout = dict(
        xaxis_title=feature,
        yaxis_title='Count',
        template='plotly_dark',
    )
    histogram.update_layout(**histogram_layout)
    histogram.show()


In [13]:
# Skewness of every float column; the imputation cell uses these values to
# choose between median and mean.
float_features = data.select_dtypes(include=['float'])
skewness = float_features.skew()
print(skewness)

Max Temperature       -0.544310
Avg Temperature       -0.467828
Min Temperature       -0.308807
Dew Point             -0.233307
Precipitation          7.866226
Snowdepth             26.464129
Wind                   0.438768
Gust Wind              3.170674
Sea Level Pressure    -6.812398
dtype: float64


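Columns whose |skewness| exceeds 0.5 are imputed with the median; the remaining columns use the mean:

1. Sea Level Pressure - median
2. Gust Wind - median
3. Snowdepth - median
4. Precipitation - median
5. Max Temperature - median (|skewness| ≈ 0.54, just above the 0.5 threshold)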

In [14]:
# Choose one fill value per float column:
#   |skewness| > 0.5  -> median (less sensitive to the long tail)
#   otherwise         -> mean   (distribution is close enough to symmetric)
float_cols = data.select_dtypes(include=['float']).columns

fill_values = {
    name: data[name].median() if abs(skewness[name]) > 0.5 else data[name].mean()
    for name in float_cols
}

# fillna with a dict applies each value only to the column it is keyed by.
data[float_cols] = data[float_cols].fillna(fill_values)


In [15]:
# Confirm the imputation left no missing values behind.
data.isnull().sum()

City                  0
Year                  0
Month                 0
Max Temperature       0
Avg Temperature       0
Min Temperature       0
Dew Point             0
Precipitation         0
Snowdepth             0
Wind                  0
Gust Wind             0
Sea Level Pressure    0
dtype: int64

# **DETECTING OUTLIERS**

In [16]:
# One boxplot per float column, to eyeball where the extreme values sit.
boxplot_columns = data.select_dtypes(include=['float']).columns

for feature in boxplot_columns:
    boxplot = px.box(
        data,
        y=feature,
        title=f'Boxplot of {feature}',
        template='plotly_dark',
        labels={feature: feature, 'y': 'Value'},
    )
    boxplot.update_layout(yaxis_title=feature, title_font_size=16)
    boxplot.show()


In [17]:
# Count outliers per float column with the 1.5 * IQR rule: a value is an
# outlier when it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
numeric_features = data.select_dtypes(include=['float'])

for name, values in numeric_features.items():
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread

    # Summing the boolean mask gives the same count as slicing the frame.
    is_outlier = (values < low_fence) | (values > high_fence)
    print(f"Number of outliers in {name}: {int(is_outlier.sum())}")

Number of outliers in Max Temperature: 326
Number of outliers in Avg Temperature: 195
Number of outliers in Min Temperature: 145
Number of outliers in Dew Point: 160
Number of outliers in Precipitation: 601
Number of outliers in Snowdepth: 28
Number of outliers in Wind: 630
Number of outliers in Gust Wind: 1304
Number of outliers in Sea Level Pressure: 1747


#### Since our project focuses on global warming trends, the outliers in the data could represent extreme weather events or unusual environmental conditions, which may be very important for understanding the impact of global warming. We will therefore keep the outliers.

##### creating a date column using year and month

In [18]:
# English month name -> calendar month number (1-12).
month_mapping = {
    "January": 1, "February": 2, "March": 3, "April": 4, "May": 5, "June": 6,
    "July": 7, "August": 8, "September": 9, "October": 10, "November": 11, "December": 12
}

# Convert only while the column still holds month names. Re-running this cell on
# the already-numeric column would otherwise look up integers in a dict keyed by
# strings and turn every Month into NaN.
if not pd.api.types.is_numeric_dtype(data['Month']):
    month_numbers = data['Month'].map(month_mapping)

    # Series.map yields NaN for any value missing from the dict; fail loudly
    # instead of letting those NaNs flow into the Date column later on.
    unmapped = data.loc[month_numbers.isna(), 'Month'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unrecognised month name(s) in 'Month': {list(unmapped)}")

    data['Month'] = month_numbers.astype('int64')

In [19]:
# Assemble a timestamp for the first day of each (Year, Month) pair;
# pd.to_datetime builds dates from year/month/day columns of a frame.
date_parts = data[['Year', 'Month']].assign(Day=1)
data['Date'] = pd.to_datetime(date_parts)

# Order the rows chronologically.
data = data.sort_values('Date')

In [20]:
# Inspect the frame after adding 'Date' and sorting by it.
data.head()

Unnamed: 0,City,Year,Month,Max Temperature,Avg Temperature,Min Temperature,Dew Point,Precipitation,Snowdepth,Wind,Gust Wind,Sea Level Pressure,Date
0,Barcelona,2013,1,58.26,50.55,44.61,38.14,0.0,0.0,9.79,0.86,29.95,2013-01-01
59,Nairobi,2013,1,78.52,68.86,56.71,56.54,0.0,0.0,6.81,0.0,24.47,2013-01-01
57,Lima,2013,1,76.03,65.290147,56.635367,52.916315,0.0,0.0,7.9219,0.32,29.7,2013-01-01
56,Dubai,2013,1,76.61,69.46,62.61,53.27,0.0,0.0,7.8,0.11,30.04,2013-01-01
55,Madrid,2013,1,50.74,42.35,33.65,30.48,0.0,0.0,7.31,1.88,28.08,2013-01-01


#### encoding of cities

In [21]:
# Replace the city name with an integer code. The name <-> code mapping lives
# only in label_encoder.classes_, so keep the encoder around if the names are
# needed again.
label_encoder = LabelEncoder()
city_codes = label_encoder.fit_transform(data['City'])

# Append the codes as 'City_Encoded' and drop the original text column.
data = data.assign(City_Encoded=city_codes).drop(columns='City')

In [22]:
# Inspect the frame after 'City' was replaced by 'City_Encoded'.
data.head()

Unnamed: 0,Year,Month,Max Temperature,Avg Temperature,Min Temperature,Dew Point,Precipitation,Snowdepth,Wind,Gust Wind,Sea Level Pressure,Date,City_Encoded
0,2013,1,58.26,50.55,44.61,38.14,0.0,0.0,9.79,0.86,29.95,2013-01-01,9
59,2013,1,78.52,68.86,56.71,56.54,0.0,0.0,6.81,0.0,24.47,2013-01-01,52
57,2013,1,76.03,65.290147,56.635367,52.916315,0.0,0.0,7.9219,0.32,29.7,2013-01-01,38
56,2013,1,76.61,69.46,62.61,53.27,0.0,0.0,7.8,0.11,30.04,2013-01-01,25
55,2013,1,50.74,42.35,33.65,30.48,0.0,0.0,7.31,1.88,28.08,2013-01-01,42


In [23]:
# Cyclic encoding: place each month on a circle so that December and January
# end up adjacent, then keep the sine/cosine coordinates instead of the number.
month_angle = 2 * np.pi * data['Month'] / 12

data = data.assign(
    Month_sin=np.sin(month_angle),
    Month_cos=np.cos(month_angle),
).drop(columns='Month')

In [24]:
# Inspect the frame after 'Month' was replaced by 'Month_sin' / 'Month_cos'.
data.head()

Unnamed: 0,Year,Max Temperature,Avg Temperature,Min Temperature,Dew Point,Precipitation,Snowdepth,Wind,Gust Wind,Sea Level Pressure,Date,City_Encoded,Month_sin,Month_cos
0,2013,58.26,50.55,44.61,38.14,0.0,0.0,9.79,0.86,29.95,2013-01-01,9,0.5,0.866025
59,2013,78.52,68.86,56.71,56.54,0.0,0.0,6.81,0.0,24.47,2013-01-01,52,0.5,0.866025
57,2013,76.03,65.290147,56.635367,52.916315,0.0,0.0,7.9219,0.32,29.7,2013-01-01,38,0.5,0.866025
56,2013,76.61,69.46,62.61,53.27,0.0,0.0,7.8,0.11,30.04,2013-01-01,25,0.5,0.866025
55,2013,50.74,42.35,33.65,30.48,0.0,0.0,7.31,1.88,28.08,2013-01-01,42,0.5,0.866025


### Correlation of features

In [25]:
# Pairwise correlation of the climate variables and the cyclic month features.
corr_cols = [
    'Avg Temperature',
    'Precipitation',
    'Snowdepth',
    'Dew Point',
    'Sea Level Pressure',
    'Max Temperature',
    'Min Temperature',
    'Month_sin',
    'Month_cos',
]
correlation_matrix = data[corr_cols].corr()
print(correlation_matrix)

# Cell annotations show the coefficients rounded to two decimals.
rounded_labels = correlation_matrix.round(2).values

fig = ff.create_annotated_heatmap(
    z=correlation_matrix.values,
    x=corr_cols,
    y=corr_cols,
    annotation_text=rounded_labels,
    colorscale='RdBu',
    showscale=True,
    hoverinfo='z',
    font_colors=['white'] * len(corr_cols),
)

# Both axes share the same tick layout: one labelled tick per variable.
tick_positions = list(range(len(corr_cols)))
axis_ticks = dict(tickmode='array', tickvals=tick_positions, ticktext=corr_cols)

fig.update_layout(
    title='Correlation Between Climate Variables',
    xaxis_title='Climate Variables',
    yaxis_title='Climate Variables',
    xaxis=dict(axis_ticks),
    yaxis=dict(axis_ticks),
    template='plotly_dark',
)

fig.show()


                    Avg Temperature  Precipitation  Snowdepth  Dew Point  \
Avg Temperature            1.000000       0.013151  -0.086368   0.872517   
Precipitation              0.013151       1.000000   0.103998   0.022234   
Snowdepth                 -0.086368       0.103998   1.000000  -0.096660   
Dew Point                  0.872517       0.022234  -0.096660   1.000000   
Sea Level Pressure        -0.013261      -0.018253   0.018508   0.100949   
Max Temperature            0.986938      -0.002614  -0.090476   0.826020   
Min Temperature            0.965688       0.036983  -0.078854   0.883640   
Month_sin                 -0.234209      -0.009184   0.037843  -0.255950   
Month_cos                 -0.312805       0.010628   0.035539  -0.251300   

                    Sea Level Pressure  Max Temperature  Min Temperature  \
Avg Temperature              -0.013261         0.986938         0.965688   
Precipitation                -0.018253        -0.002614         0.036983   
Snowdepth  

In [26]:
# Persist the preprocessed frame; index=False keeps the row index out of the file.
# NOTE: by this point 'City' has been replaced by 'City_Encoded', so the CSV
# holds only the integer city codes, not the city names.
data.to_csv('cleaned_data.csv', index=False)