In [None]:
! pip install ucimlrepo

In [2]:
import pandas as pd
import numpy as np

In [15]:
# Destination directory for the exported dataset. Keep the trailing slash:
# the output file name is appended to this string by plain concatenation.
output_loc = "/repos/smote_msfb/public_datasets/solar/"

In [4]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset 89 from the UCI ML Repository
solar_flare = fetch_ucirepo(id=89)

# Features and targets (as pandas dataframes)
X, y = solar_flare.data.features, solar_flare.data.targets

In [None]:
# Metadata of the fetched dataset. This uses the `solar_flare` object returned
# by fetch_ucirepo; the name `yeast` is not defined anywhere in this notebook.
print(solar_flare.metadata)

# Variable information of the fetched dataset
print(solar_flare.variables)

In [5]:
# Inspect the dtype of every feature column
X.dtypes

modified Zurich class              object
largest spot size                  object
spot distribution                  object
activity                            int64
evolution                           int64
previous 24 hour flare activity     int64
historically-complex                int64
became complex on this pass         int64
area                                int64
area of largest spot                int64
dtype: object

In [6]:
# Preview the first three rows of the raw features
X.head(3)

Unnamed: 0,modified Zurich class,largest spot size,spot distribution,activity,evolution,previous 24 hour flare activity,historically-complex,became complex on this pass,area,area of largest spot
0,C,S,O,1,2,1,1,2,1,2
1,D,S,O,1,3,1,1,2,1,2
2,C,S,O,1,3,1,1,2,1,1


In [7]:
# Split the feature columns by dtype: numeric columns are thresholded,
# object/category columns are one-hot encoded.
continuous_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# 1. Binarize continuous variables: 1 when the value is strictly above the
#    column's 0.65 quantile, else 0. Work on a copy so X itself is not modified.
X_bin = X.copy()
for col in continuous_cols:
    cutoff = X_bin[col].quantile(0.65)
    X_bin[col] = (X_bin[col] > cutoff).astype(int)

# 2. One-hot encode categorical columns, keeping every level (drop_first=False).
#    dtype=int pins the indicator columns to 0/1 integers; without it,
#    pandas >= 2.0 emits booleans, which would end up as True/False in the
#    exported CSV.
X_bin = pd.get_dummies(X_bin, columns=categorical_cols, drop_first=False, dtype=int)

X_bin.head(3)

Unnamed: 0,activity,evolution,previous 24 hour flare activity,historically-complex,became complex on this pass,area,area of largest spot,modified Zurich class_B,modified Zurich class_C,modified Zurich class_D,...,largest spot size_A,largest spot size_H,largest spot size_K,largest spot size_R,largest spot size_S,largest spot size_X,spot distribution_C,spot distribution_I,spot distribution_O,spot distribution_X
0,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,0,1,0
1,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,1,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0


In [8]:
# Step 3: Replace the column labels with positional names f_0, f_1, ...
X_bin.columns = [f'f_{pos}' for pos, _ in enumerate(X_bin.columns)]

In [9]:
# Preview the first rows after the columns were renamed to f_*
X_bin.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22
0,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,0,1,0
1,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,1,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
3,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,1,0
4,0,0,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,1,0


In [10]:
# Count how often each combination of target values occurs
y.value_counts()

common flares  moderate flares  severe flares
0              0                0                1130
1              0                0                 123
2              0                0                  31
0              1                0                  30
3              0                0                  18
1              1                0                  11
4              0                0                   7
2              1                0                   5
0              2                0                   4
1              0                1                   4
6              0                0                   3
5              0                0                   3
0              1                1                   3
3              1                0                   2
2              2                0                   2
               4                0                   2
5              1                0                   1
4              4                0   

In [12]:
# Binary response: 1 when 'moderate flares' is at least 1, otherwise 0
moderate_flare_seen = y['moderate flares'].ge(1)
target = moderate_flare_seen.astype(int).rename('target')

# Class counts, to confirm how imbalanced the response is
class_counts = target.value_counts()
print(class_counts)

# Attach the response to the encoded features; both indexes are reset first
# so the rows are paired up by position.
frames = [X_bin.reset_index(drop=True), target.reset_index(drop=True)]
final_df = pd.concat(frames, axis=1)

0    1321
1      68
Name: target, dtype: int64


In [13]:
# Preview the final frame (features plus the target column)
final_df.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,target
0,0,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,1,0,0
1,0,0,0,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
3,0,0,0,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [16]:
# Write the combined frame, without the index, as a zip archive whose single
# member is named final_data.csv.
archive_path = output_loc + "final_data.csv.zip"
zip_options = {'method': 'zip', 'archive_name': 'final_data.csv'}
final_df.to_csv(archive_path, index=False, compression=zip_options)