In [3]:
import pandas as pd
import numpy as np
import random

# Generate example data: 100 hourly synthetic weather observations.
#
# Both random sources are seeded so the cell is reproducible on a fresh
# kernel: NumPy drives the numeric columns, the stdlib `random` module drives
# the categorical ones (seeding NumPy alone leaves `random.choice` unseeded).
N_SAMPLES = 100
SEED = 0
np.random.seed(SEED)
random.seed(SEED)

# Lowercase 'h' is the hourly frequency alias; the uppercase 'H' spelling is
# deprecated in recent pandas releases.
dates = pd.date_range('2006-04-01', periods=N_SAMPLES, freq='h')
formatted_dates = [date.strftime('%Y-%m-%d %H:%M:%S.000 +0200') for date in dates]

# The category list gets its own name so that the per-row sample below does
# not overwrite it; both categorical columns then draw from the same four
# categories rather than the summary drawing from the already-sampled rows.
WEATHER_CATEGORIES = ['Partly cloudy', 'Sunny', 'Rainy', 'Cloudy']
weather_conditions = [random.choice(WEATHER_CATEGORIES) for _ in range(N_SAMPLES)]

daily_summary = [
    ' '.join([random.choice(WEATHER_CATEGORIES), 'throughout the day.'])
    for _ in range(N_SAMPLES)
]

# Numeric columns: uniform random integers in [low, high). The order of these
# calls determines the values produced under the fixed seed, so keep it stable.
temperature = np.random.randint(50, 100, size=N_SAMPLES)
humidity = np.random.randint(40, 90, size=N_SAMPLES)
wind_speed = np.random.randint(0, 15, size=N_SAMPLES)
pressure = np.random.randint(980, 1050, size=N_SAMPLES)
visibility = np.random.randint(0, 15, size=N_SAMPLES)
apparent_temperature = np.random.randint(50, 100, size=N_SAMPLES)

# Create DataFrame
df = pd.DataFrame({
    'Formatted Date': formatted_dates,
    'Weather Conditions': weather_conditions,
    'Temperature (C)': temperature,
    'Humidity': humidity,
    'Wind Speed (km/h)': wind_speed,
    'Pressure (mbar)': pressure,
    'Visibility (km)': visibility,
    'Apparent Temperature (C)': apparent_temperature,
    'Daily Summary': daily_summary
})
df.head()


Unnamed: 0,Formatted Date,Weather Conditions,Temperature (C),Humidity,Wind Speed (km/h),Pressure (mbar),Visibility (km),Apparent Temperature (C),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly cloudy,94,45,8,1016,2,80,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Cloudy,97,81,8,1028,13,58,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Rainy,50,75,9,1005,7,70,Rainy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly cloudy,53,40,2,1047,8,57,Sunny throughout the day.
4,2006-04-01 04:00:00.000 +0200,Sunny,53,71,8,1015,4,53,Cloudy throughout the day.


In [5]:
from sklearn.preprocessing import PowerTransformer

In [6]:
# Compare the sample variance of 'Temperature (C)' before and after a
# Yeo-Johnson power transform.
#
# NOTE: the transformed values are written back into the same column, so the
# raw temperatures are replaced in `df`; re-running this cell without
# rebuilding `df` would transform the already-transformed values.
temperature_col = 'Temperature (C)'

# Variance of the column as it stands before the transform
original_variance = df[temperature_col].var()

# PowerTransformer standardizes its output by default (standardize=True), so
# the transformed variance is expected to land close to 1.
pt = PowerTransformer(method='yeo-johnson')
df[temperature_col] = pt.fit_transform(df[temperature_col].to_frame())

# Variance of the column after the transform
transformed_variance = df[temperature_col].var()

print("Original Variance:", original_variance)
print("Transformed Variance:", transformed_variance)

Original Variance: 217.82828282828282
Transformed Variance: 1.0101010101010097


In [8]:
from statsmodels.tsa.stattools import adfuller

# First-order difference of 'Humidity' (each row minus the previous row),
# stored as a new column. The first row has no predecessor and is NaN.
humidity_diff = df['Humidity'].diff()
df['Humidity difference'] = humidity_diff

# Augmented Dickey-Fuller test on the differenced series; the leading NaN is
# dropped first because the test cannot handle missing values.
result = adfuller(humidity_diff.dropna())

# Positions used from the result tuple: 0 = test statistic, 1 = p-value,
# 4 = mapping of significance level -> critical value.
adf_statistic = result[0]
adf_p_value = result[1]
adf_critical_values = result[4]

print("Humidity difference ADF Statistic:", adf_statistic)
print("Humidity difference p-value:", adf_p_value)
print("Humidity difference Critical Values:")
for key, value in adf_critical_values.items():
    print(f"   {key}: {value}")


Humidity difference ADF Statistic: -6.594772523405528
Humidity difference p-value: 6.969838186303788e-09
Humidity difference Critical Values:
   1%: -3.50434289821397
   5%: -2.8938659630479413
   10%: -2.5840147047458037


In [9]:
from sklearn.preprocessing import StandardScaler

# Standardize 'Humidity' and 'Pressure (mbar)' into new columns.
# A single StandardScaler instance is refit for each source column in turn,
# so after the loop `scaler` holds the statistics of the last column only.
scaler = StandardScaler()

standardized_targets = {
    'Humidity': 'Humidity standardized',
    'Pressure (mbar)': 'Pressure standardized',
}
for source_col, target_col in standardized_targets.items():
    df[target_col] = scaler.fit(df[[source_col]]).transform(df[[source_col]])

# Show the first rows of the two standardized columns
standardized_preview = df[list(standardized_targets.values())].head()
print(standardized_preview)


   Humidity standardized  Pressure standardized
0              -1.264019               0.045409
1               1.151303               0.664619
2               0.748750              -0.522201
3              -1.599480               1.645036
4               0.480381              -0.006192


In [10]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale 'Humidity' into a new 'Humidity normalized' column.
# NOTE: this rebinds `scaler`, replacing the StandardScaler from the
# previous cell.
humidity_frame = df[['Humidity']]
scaler = MinMaxScaler().fit(humidity_frame)
df['Humidity normalized'] = scaler.transform(humidity_frame)

# Show the first rows of the full DataFrame, including the new column
print(df.head())


                  Formatted Date Weather Conditions  Temperature (C)  \
0  2006-04-01 00:00:00.000 +0200      Partly cloudy         1.399284   
1  2006-04-01 01:00:00.000 +0200             Cloudy         1.568565   
2  2006-04-01 02:00:00.000 +0200              Rainy        -1.644508   
3  2006-04-01 03:00:00.000 +0200      Partly cloudy        -1.390345   
4  2006-04-01 04:00:00.000 +0200              Sunny        -1.390345   

   Humidity  Wind Speed (km/h)  Pressure (mbar)  Visibility (km)  \
0        45                  8             1016                2   
1        81                  8             1028               13   
2        75                  9             1005                7   
3        40                  2             1047                8   
4        71                  8             1015                4   

   Apparent Temperature (C)                      Daily Summary  \
0                        80  Partly cloudy throughout the day.   
1                        5