In [2]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp38-cp38-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Downloading scipy-1.10.1-cp38-cp38-win_amd64.whl.metadata (58 kB)
     ---------------------------------------- 0.0/59.0 kB ? eta -:--:--
     -------------------- ------------------- 30.7/59.0 kB 1.4 MB/s eta 0:00:01
     -------------------------------------- 59.0/59.0 kB 786.1 kB/s eta 0:00:00
Collecting joblib>=1.1.1 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.3.2-cp38-cp38-win_amd64.whl (9.3 MB)
   ---------------------------------------- 0.0/9.3 MB ? eta -:--:--
   ---------------------------------------- 0.1/9.3 MB 2.6 MB/s eta 0:00:04
    --------------------------------------- 0.2/9.3 MB 2.0 MB/s eta 0:00:05
   - -------------------------------------- 0.

In [2]:
# Part 0: Setup
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os
import csv

# Directory that holds the raw dataset and every intermediate pipeline file
data_dir = r"D:\Yüksek Lisans\datasets\Feni Solar-Wind Data"

# Raw input followed by the output of each processing step
input_file = rf"{data_dir}\Feni Solar and Wind 2017-2019.csv"
step1_file = rf"{data_dir}\step1_no_nan_columns.csv"
step2_file = rf"{data_dir}\step2_time_utc.csv"
step3_file = rf"{data_dir}\step3_scaled.csv"
step4_file = rf"{data_dir}\step4_unscaled.csv"

# Measurement columns selected for min-max scaling (order is significant)
columns_to_scale = [
    # DHI / DNI / GHI columns
    'DHI_ThPyra2_Wm-2_avg',
    'DNI_ThPyrh1_Wm-2_avg',
    'GHI_ThPyra1_Wm-2_avg',
    # Precip / Pres / RH columns
    'Precip_Pluvio1_mm_sum',
    'Pres_Logger1_hPa_avg',
    'RH_ThHyg1_per100_avg',
    # Temp_* columns
    'Temp_Logger1_degC_avg',
    'Temp_RefCell1_degC_avg',
    'Temp_RefCell2_degC_avg',
    'Temp_RefCell3_degC_avg',
    'Temp_ThHyg1_degC_avg',
    'Temp_ThPyra1_degC_avg',
    'Temp_ThPyra2_degC_avg',
    'Temp_ThPyrh1_degC_avg',
    # WindDir / WindSpeed columns
    'WindDir_Wvane1_deg_avg360',
    'WindSpeed_Anemo1_ms_avg',
    'WindSpeed_Anemo1_ms_max',
    'WindSpeed_Anemo2_ms_avg',
    'WindSpeed_Anemo2_ms_max',
    # GTI_RefCell* columns
    'GTI_RefCell1_Wm-2_avg',
    'GTI_RefCell2_Wm-2_avg',
    'GTI_RefCell3_Wm-2_avg',
]

print("Setup complete.")


Setup complete.


In [47]:
# Part 1: Remove a single column while keeping comment lines
#
# The input file stores its metadata in '#'-prefixed lines. The column names
# are read from the comment line that starts with "# JulianTime"; every line
# that does not start with '#' is treated as a data row.
from io import StringIO

print("Running Part 1: Removing single column...")

# Read CSV as text
with open(input_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Separate comment lines and data
comment_lines = [line for line in lines if line.startswith("#")]
data_lines = [line for line in lines if not line.startswith("#")]

# Locate the header among the comment lines (first match wins)
header_index = next(
    (i for i, line in enumerate(comment_lines) if line.startswith("# JulianTime")),
    None,
)
if header_index is None:
    raise ValueError("Cannot find header line.")

header_line = comment_lines[header_index].lstrip("#").strip()
columns = header_line.split(",")

# Load dataframe using these columns
data_str = "".join(data_lines)
df = pd.read_csv(StringIO(data_str), names=columns, header=None)

# Column to remove
col_to_remove = "DHI_ThPyrh2_Wm-2_avg_flag"

# Comment lines are written back unchanged, except for the header when the
# column is actually dropped.
new_comment_lines = list(comment_lines)

if col_to_remove in df.columns:
    df = df.drop(columns=[col_to_remove])

    # Rebuild the header from the remaining column names. Matching whole
    # names (instead of substring replacement on every comment line) cannot
    # damage other columns whose names merely contain `col_to_remove`, nor
    # free-text comment lines that happen to mention it.
    original_header = comment_lines[header_index]
    line_end = "\n" if original_header.endswith("\n") else ""
    new_comment_lines[header_index] = "# " + ",".join(df.columns) + line_end

    print(f"Removed '{col_to_remove}' from dataframe.")
else:
    print(f"Column '{col_to_remove}' not found in dataframe.")

# Save back: comment lines + dataframe.
# newline="" is required when handing a text-mode handle to DataFrame.to_csv;
# without it Windows translates pandas' "\r\n" terminator into "\r\r\n".
with open(step1_file, "w", encoding="utf-8", newline="") as f:
    f.writelines(new_comment_lines)
    df.to_csv(f, index=False, header=False)

print(f"Part 1 complete. Saved to: {step1_file}")


Running Part 1: Removing single column...
Removed 'DHI_ThPyrh2_Wm-2_avg_flag' from dataframe.
Part 1 complete. Saved to: D:\Yüksek Lisans\datasets\Feni Solar-Wind Data\step1_no_nan_columns.csv


In [3]:
# Part 2: Convert JulianTime to Unix timestamp
#
# Reads the Part 1 output, replaces the "JulianTime" column by whole seconds
# since 1970-01-01 UTC and writes comments + data to `step2_file`.
print("Running Part 2: Converting JulianTime to Unix timestamp...")

import pandas as pd
from io import StringIO

# Column to convert
time_col = "JulianTime"

# Read CSV as text
with open(step1_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Separate comment lines and data
comment_lines = [line for line in lines if line.startswith("#")]
data_lines = [line for line in lines if not line.startswith("#")]

# Extract header: first comment line that starts with "# JulianTime"
header_line = next(
    (line.lstrip("#").strip() for line in comment_lines if line.startswith("# JulianTime")),
    None,
)
if header_line is None:
    raise ValueError("Cannot find header line.")

columns = header_line.split(",")

# Load dataframe
data_str = "".join(data_lines)
df = pd.read_csv(StringIO(data_str), names=columns, header=None)

if time_col not in df.columns:
    raise ValueError(f"Column '{time_col}' not found in dataframe.")

# Parse to timezone-aware datetimes; unparsable entries become NaT
parsed_times = pd.to_datetime(df[time_col], errors='coerce', utc=True)

# Casting NaT straight to int64 would silently store a huge negative sentinel
# as if it were a real timestamp, so report such rows and keep them missing.
unparsed_count = int(parsed_times.isna().sum())
if unparsed_count:
    print(f"Warning: {unparsed_count} '{time_col}' value(s) could not be parsed and are left empty.")

# Seconds since the Unix epoch, independent of the datetime resolution in use.
# The nullable "Int64" dtype keeps integers even when some rows are missing.
unix_epoch = pd.Timestamp("1970-01-01", tz="UTC")
epoch_seconds = (parsed_times - unix_epoch) // pd.Timedelta(seconds=1)
df[time_col] = epoch_seconds.astype("Int64")
print(f"Converted '{time_col}' to Unix timestamp.")

# Save back to CSV: comments + updated dataframe.
# newline="" is required when handing a text-mode handle to DataFrame.to_csv;
# without it Windows translates pandas' "\r\n" terminator into "\r\r\n".
with open(step2_file, "w", encoding="utf-8", newline="") as f:
    f.writelines(comment_lines)
    df.to_csv(f, index=False, header=False)

print(f"Part 2 complete. Saved to: {step2_file}")


Running Part 2: Converting JulianTime to Unix timestamp...
Converted 'JulianTime' to Unix timestamp.
Part 2 complete. Saved to: D:\Yüksek Lisans\datasets\Feni Solar-Wind Data\step2_time_utc.csv


In [50]:
# Part 3: Min-Max Scaling numeric columns
#
# Reads the Part 2 output, min-max scales every column except "JulianTime",
# "data-filled" and the "*_flag*" columns, and writes comments + data to
# `step3_file`.
from io import StringIO  # imported here so the cell does not depend on Part 1 having run

print("Running Part 3: Min-Max scaling numeric columns...")

# Read CSV as text
with open(step2_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Separate comment lines and data
comment_lines = [line for line in lines if line.startswith("#")]
data_lines = [line for line in lines if not line.startswith("#")]

# Extract header from comment line (first line starting with "# JulianTime")
header_line = next(
    (line.lstrip("#").strip() for line in comment_lines if line.startswith("# JulianTime")),
    None,
)
if header_line is None:
    # Same explicit failure as Parts 1 and 2 instead of an AttributeError on None
    raise ValueError("Cannot find header line.")

columns = header_line.split(",")

# Load dataframe
data_str = "".join(data_lines)
df = pd.read_csv(StringIO(data_str), names=columns, header=None)

# Columns to exclude from scaling: the time axis, flag columns and "data-filled"
exclude_cols = [
    col for col in df.columns
    if col == "JulianTime" or "_flag" in col or col == "data-filled"
]

# Columns to scale: everything that is not excluded
scale_cols = [col for col in df.columns if col not in exclude_cols]

# Apply Min-Max scaling (each column is mapped to the scaler's default range)
scaler = MinMaxScaler()
df[scale_cols] = scaler.fit_transform(df[scale_cols])

# Save back: comments + scaled dataframe.
# newline="" is required when handing a text-mode handle to DataFrame.to_csv;
# without it Windows translates pandas' "\r\n" terminator into "\r\r\n".
with open(step3_file, "w", encoding="utf-8", newline="") as f:
    f.writelines(comment_lines)
    df.to_csv(f, index=False, header=False)

print(f"Part 3 complete. Saved to: {step3_file}")


Running Part 3: Min-Max scaling numeric columns...
Part 3 complete. Saved to: D:\Yüksek Lisans\datasets\Feni Solar-Wind Data\step3_scaled.csv


In [38]:
# Part 4: Min-max scale specific columns
def scale_columns(input_path, output_path, columns_to_scale):
    """Min-max scale selected columns of a CSV file and record how to undo it.

    Args:
        input_path: CSV file to read with ``pd.read_csv``.
        output_path: Destination CSV for the scaled data.
        columns_to_scale: Column names to scale. Names missing from the file
            are reported with a warning and skipped.

    Returns:
        A ``(df, scaler_info)`` tuple. ``df`` is the dataframe with the
        requested columns scaled. ``scaler_info`` maps each scaled column to
        ``{'data_min', 'data_max', 'feature_range'}`` where ``data_min`` and
        ``data_max`` are the values of the column *before* scaling. When no
        requested column exists, nothing is written and ``(df, {})`` is
        returned.

    Side effects:
        Writes ``output_path`` and a sibling ``<output stem>_scaler_info.csv``.
    """
    import os

    df = pd.read_csv(input_path)

    # Initialize scaler (re-fitted for every column below)
    scaler = MinMaxScaler()

    scaler_info = {}
    for col in columns_to_scale:
        if col not in df.columns:
            print(f"Warning: Column {col} not found in dataset")
            continue
        if col in scaler_info:
            # Listed twice: keep the min/max captured from the unscaled data
            continue

        # Capture the original range BEFORE scaling; afterwards the column
        # only spans the feature range and the information is lost.
        col_min = df[col].min()
        col_max = df[col].max()

        # Scale the column
        df[col] = scaler.fit_transform(df[[col]])
        scaler_info[col] = {
            'data_min': col_min,
            'data_max': col_max,
            'feature_range': (0, 1)  # MinMaxScaler default range
        }
        print(f"Scaled column: {col} (min: {col_min}, max: {col_max})")

    if not scaler_info:
        print("No columns were scaled")
        return df, {}

    # Save result
    df.to_csv(output_path, index=False)
    print(f"Saved to {output_path}")

    # Derive the info path from the file stem only, so directory names that
    # contain ".csv" are left alone and an output path without a ".csv"
    # extension is never overwritten by the info file.
    path_stem, _ = os.path.splitext(output_path)
    scaler_info_path = f"{path_stem}_scaler_info.csv"
    pd.DataFrame(scaler_info).to_csv(scaler_info_path)
    print(f"Scaler info saved to {scaler_info_path}")

    return df, scaler_info

In [52]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from io import StringIO

# File paths
# NOTE: this rebinds the notebook-level `input_file` to the Part 2 output, so
# re-running Part 1 after this cell requires re-running the setup cell first.
input_file = r"D:\Yüksek Lisans\datasets\Feni Solar-Wind Data\step2_time_utc.csv"
numeric_csv_file = r"D:\Yüksek Lisans\datasets\Feni Solar-Wind Data\cuda_ready.csv"
numeric_npy_file = r"D:\Yüksek Lisans\datasets\Feni Solar-Wind Data\cuda_ready.npy"

# 1. Load CSV ('#'-prefixed lines are comments, everything else is data)
with open(input_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

comment_lines = [line for line in lines if line.startswith("#")]
data_lines = [line for line in lines if not line.startswith("#")]

# Extract header from comments (first line starting with "# JulianTime")
header_line = next(
    (line.lstrip("#").strip() for line in comment_lines if line.startswith("# JulianTime")),
    None,
)
if header_line is None:
    # Fail with a clear message instead of an AttributeError on None.split
    raise ValueError("Cannot find header line.")

columns = header_line.split(",")
data_str = "".join(data_lines)
df = pd.read_csv(StringIO(data_str), names=columns, header=None)

# 2. Convert JulianTime to numeric: minutes since first row
time_col = "JulianTime"
if pd.api.types.is_numeric_dtype(df[time_col]):
    # Part 2 stores this column as whole Unix seconds. Without unit="s"
    # pandas would read the integers as nanoseconds and every elapsed time
    # would come out a factor of 1e9 too small.
    df[time_col] = pd.to_datetime(df[time_col], unit="s", utc=True)
else:
    # Textual timestamps are parsed as before
    df[time_col] = pd.to_datetime(df[time_col], utc=True)
start_time = df[time_col].iloc[0]
# Vectorised elapsed time in minutes relative to the first row
df[time_col] = (df[time_col] - start_time).dt.total_seconds() / 60.0

# 3. Separate numeric vs. flag columns
numeric_cols = [col for col in df.columns if "_flag" not in col and col != time_col]
flag_cols = [col for col in df.columns if "_flag" in col]

# Ensure correct dtypes
df[numeric_cols] = df[numeric_cols].astype(np.float32)
df[flag_cols] = df[flag_cols].astype(np.int32)

# 4. Min-max scale ONLY numeric (not flags, not JulianTime)
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

print(f"Scaled numeric columns: {len(numeric_cols)} | Kept flag columns: {len(flag_cols)}")

# 5. Save numeric CSV (written with a header row of column names)
df.to_csv(numeric_csv_file, index=False)
print(f"Saved numeric CSV to: {numeric_csv_file}")

# 6. Save NumPy binary for fast GPU loading (all columns cast to float32)
np.save(numeric_npy_file, df.to_numpy(dtype=np.float32))
print(f"Saved numeric NumPy array to: {numeric_npy_file}")


Scaled numeric columns: 25 | Kept flag columns: 23
Saved numeric CSV to: D:\Yüksek Lisans\datasets\Feni Solar-Wind Data\cuda_ready.csv
Saved numeric NumPy array to: D:\Yüksek Lisans\datasets\Feni Solar-Wind Data\cuda_ready.npy
