In [1]:
import pandas as pd
import numpy as np
from hmm import HiddenMarkovModel

In [35]:
data = pd.read_csv('solar_weather.csv')
data.head()

Unnamed: 0,Time,Energy delta[Wh],GHI,temp,pressure,humidity,wind_speed,rain_1h,snow_1h,clouds_all,isSun,sunlightTime,dayLength,SunlightTime/daylength,weather_type,hour,month
0,2017-01-01 00:00:00,0,0.0,1.6,1021,100,4.9,0.0,0.0,100,0,0,450,0.0,4,0,1
1,2017-01-01 00:15:00,0,0.0,1.6,1021,100,4.9,0.0,0.0,100,0,0,450,0.0,4,0,1
2,2017-01-01 00:30:00,0,0.0,1.6,1021,100,4.9,0.0,0.0,100,0,0,450,0.0,4,0,1
3,2017-01-01 00:45:00,0,0.0,1.6,1021,100,4.9,0.0,0.0,100,0,0,450,0.0,4,0,1
4,2017-01-01 01:00:00,0,0.0,1.7,1020,100,5.2,0.0,0.0,100,0,0,450,0.0,4,1,1


In [36]:
data.drop(columns=['Time'],inplace=True)

In [37]:
# correlation of data with weather_type in decreasing order
data.corr()['weather_type'].sort_values(ascending=False)

weather_type              1.000000
clouds_all                0.910932
rain_1h                   0.330671
humidity                  0.286863
wind_speed                0.193744
snow_1h                   0.143205
month                     0.047874
SunlightTime/daylength    0.034283
isSun                     0.024333
sunlightTime              0.006140
hour                     -0.013900
temp                     -0.088711
GHI                      -0.163616
Energy delta[Wh]         -0.170046
dayLength                -0.194644
pressure                 -0.319925
Name: weather_type, dtype: float64

It can be interpreted that clouds,rain,humidity and pressure have the highest correlation with weather type column

In [38]:
# use only clouds,rain,humidity and pressure column in data
data = data[['clouds_all', 'humidity', 'pressure', 'weather_type']]

In [39]:
ranges = data[['clouds_all', 'humidity', 'pressure', 'weather_type']].agg(['min', 'max'])
print(ranges)


     clouds_all  humidity  pressure  weather_type
min           0        22       977             1
max         100       100      1047             5


In [40]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196776 entries, 0 to 196775
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   clouds_all    196776 non-null  int64
 1   humidity      196776 non-null  int64
 2   pressure      196776 non-null  int64
 3   weather_type  196776 non-null  int64
dtypes: int64(4)
memory usage: 6.0 MB


In [41]:
# Define bins and labels for each column
clouds_bins = [-1, 20, 50, 80, 100]
clouds_labels = ['Clear', 'Partly Cloudy', 'Mostly Cloudy', 'Overcast']

humidity_bins = [21, 40, 60, 80, 100]
humidity_labels = ['Low', 'Moderate', 'High', 'Very High']

pressure_bins = [976, 990, 1010, 1030, 1047]
pressure_labels = ['Low', 'Normal', 'High', 'Very High']



# Create new labeled columns
data['clouds_all_label'] = pd.cut(data['clouds_all'], bins=clouds_bins, labels=clouds_labels, right=True)
data['humidity_label'] = pd.cut(data['humidity'], bins=humidity_bins, labels=humidity_labels, right=True)
data['pressure_label'] = pd.cut(data['pressure'], bins=pressure_bins, labels=pressure_labels, right=True)
# Example: Mapping textual weather_type to A, B, C, D, E
weather_mapping = {
    1: 'A',
    2: 'B',
    3: 'C',
    4: 'D',
    5: 'E'
}

data['weather_type'] = data['weather_type'].map(weather_mapping)


In [42]:
data.drop(columns=['clouds_all', 'humidity', 'pressure'], inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196776 entries, 0 to 196775
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   weather_type      196776 non-null  object  
 1   clouds_all_label  196776 non-null  category
 2   humidity_label    196776 non-null  category
 3   pressure_label    196776 non-null  category
dtypes: category(3), object(1)
memory usage: 2.1+ MB


In [43]:
# Extract unique elements as lists
weather = data['weather_type'].unique().tolist()
clouds = data['clouds_all_label'].unique().tolist()
humidity = data['humidity_label'].unique().tolist()
pressure = data['pressure_label'].unique().tolist()

# Print the unique elements
print("Weather Types:", weather)
print("Clouds Labels:", clouds)
print("Humidity Labels:", humidity)
print("Pressure Labels:", pressure)

Weather Types: ['D', 'E', 'C', 'B', 'A']
Clouds Labels: ['Overcast', 'Mostly Cloudy', 'Partly Cloudy', 'Clear']
Humidity Labels: ['Very High', 'High', 'Moderate', 'Low']
Pressure Labels: ['High', 'Normal', 'Low', 'Very High']


In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196776 entries, 0 to 196775
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   weather_type      196776 non-null  object  
 1   clouds_all_label  196776 non-null  category
 2   humidity_label    196776 non-null  category
 3   pressure_label    196776 non-null  category
dtypes: category(3), object(1)
memory usage: 2.1+ MB


In [46]:
hidden_col = 'weather_type'
observed_col = 'clouds_all_label'

# Split data into training and testing sets
split_index = int(len(data) * 0.95)
train_data = data.iloc[:split_index]  # First 95% for training
test_data = data.iloc[split_index:]  # Last 5% for testing

# Extract observations and hidden states
train_observations = train_data[observed_col].values
train_hidden_states = train_data[hidden_col].values

test_observations = test_data[observed_col].values
test_hidden_states = test_data[hidden_col].values

# Map unique values to indices
hidden_states = np.unique(train_hidden_states)
observations = np.unique(train_observations)

hidden_map = {hidden: idx for idx, hidden in enumerate(hidden_states)}
observation_map = {obs: idx for idx, obs in enumerate(observations)}

train_obs_idx = [observation_map[obs] for obs in train_observations]
train_hidden_idx = [hidden_map[state] for state in train_hidden_states]

# HMM parameters
N = len(hidden_states)  # Number of hidden states
M = len(observations)   # Number of observations

# Initialize HMM parameters
start_probability = np.ones(N) / N
transition_probability = np.ones((N, N)) / N
emission_probability = np.zeros((N, M))

# Populate emission probabilities
for i in range(len(train_obs_idx)):
    emission_probability[train_hidden_idx[i], train_obs_idx[i]] += 1

# Normalize emission probabilities
emission_probability = emission_probability / (emission_probability.sum(axis=1, keepdims=True) + 1e-12)

# Create and train the HMM
hmm = HiddenMarkovModel(hidden_states, observations, start_probability, transition_probability, emission_probability)
iterations = 1
hmm.BaumWelchAlgorithm(train_observations, iterations)

# Test the model
test_obs_idx = [observation_map.get(obs, -1) for obs in test_observations]
test_obs_idx = [idx if idx != -1 else np.random.choice(M) for idx in test_obs_idx]  # Handle unseen observations

test_predicted_states = hmm.predict(test_observations)
test_predicted_states = [hidden_states[i] for i in test_predicted_states]

# Calculate accuracy
accuracy = np.sum(np.array(test_predicted_states) == test_hidden_states) / len(test_hidden_states)
print(f"Accuracy on test data: {accuracy * 100:.2f}%")
print("Predicted states for test data:", test_predicted_states)


Accuracy on test data: 10.41%
Predicted states for test data: ['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A