In [None]:
import pandas as pd

# 1) Load the big dataset
df = pd.read_csv("EcoSphereAI_dataset_150.csv")

print("Loaded dataset shape:", df.shape)
print(df.head(5))


Loaded dataset shape: (150, 9)
   greywater_usage_liters  co2_levels_ppm  humidity_percentage  \
0                     202             361                   47   
1                     535             515                   76   
2                     960             592                   44   
3                     370             398                   69   
4                     206             471                   42   

   temperature_celsius  sunlight_hours_average  building_surface_area_sqm  \
0                   19                     8.9                         74   
1                   11                     4.9                         27   
2                   26                     5.5                         58   
3                   17                     4.9                         20   
4                   10                     6.4                         94   

   wind_speed_kmh   algae_type     wall_design  
0            12.4  Scenedesmus  Vertical Tower  
1          

In [None]:
# 2) Columns for input
input_columns = [
    "greywater_usage_liters",
    "co2_levels_ppm",
    "humidity_percentage",
    "temperature_celsius",
    "sunlight_hours_average",
    "building_surface_area_sqm",
    "wind_speed_kmh"
]

# 3) Dataset 1: Predict 'wall_design'
wall_design_df = df[input_columns + ["wall_design"]]

# 4) Dataset 2: Predict 'algae_type'
algae_type_df = df[input_columns + ["algae_type"]]

print("Wall design dataset shape:", wall_design_df.shape)
print("Algae type dataset shape:", algae_type_df.shape)

# 5) Save them to new CSV files
wall_design_df.to_csv("WallDesign_data.csv", index=False)
algae_type_df.to_csv("AlgaeType_data.csv", index=False)

print("✅ Split into 'WallDesign_data.csv' and 'AlgaeType_data.csv' successfully!")


Wall design dataset shape: (150, 8)
Algae type dataset shape: (150, 8)
✅ Split into 'WallDesign_data.csv' and 'AlgaeType_data.csv' successfully!


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib

df_wall = pd.read_csv("WallDesign_data.csv")
print(df_wall.head())


   greywater_usage_liters  co2_levels_ppm  humidity_percentage  \
0                     202             361                   47   
1                     535             515                   76   
2                     960             592                   44   
3                     370             398                   69   
4                     206             471                   42   

   temperature_celsius  sunlight_hours_average  building_surface_area_sqm  \
0                   19                     8.9                         74   
1                   11                     4.9                         27   
2                   26                     5.5                         58   
3                   17                     4.9                         20   
4                   10                     6.4                         94   

   wind_speed_kmh     wall_design  
0            12.4  Vertical Tower  
1            14.7         Tubular  
2            17.4  Vertical Towe

In [None]:
X = df_wall.drop("wall_design", axis=1)  # all input columns
y = df_wall["wall_design"]              # target is wall_design


In [None]:
import pandas as pd

# 1) Load the wall design dataset
wall_df = pd.read_csv("WallDesign_data.csv")
print("Wall Design Dataset:", wall_df.shape)
print(wall_df.head())

# 2) Load the algae type dataset
algae_df = pd.read_csv("AlgaeType_data.csv")
print("Algae Type Dataset:", algae_df.shape)
print(algae_df.head())


Wall Design Dataset: (150, 8)
   greywater_usage_liters  co2_levels_ppm  humidity_percentage  \
0                     202             361                   47   
1                     535             515                   76   
2                     960             592                   44   
3                     370             398                   69   
4                     206             471                   42   

   temperature_celsius  sunlight_hours_average  building_surface_area_sqm  \
0                   19                     8.9                         74   
1                   11                     4.9                         27   
2                   26                     5.5                         58   
3                   17                     4.9                         20   
4                   10                     6.4                         94   

   wind_speed_kmh     wall_design  
0            12.4  Vertical Tower  
1            14.7         Tubular  
2 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X_wall = wall_df.drop("wall_design", axis=1)
y_wall = wall_df["wall_design"]



In [None]:
Xw_train, Xw_test, yw_train, yw_test = train_test_split(
    X_wall, y_wall, test_size=0.2, random_state=42
)
model_wall = RandomForestClassifier(n_estimators=300, random_state=42)
model_wall.fit(Xw_train, yw_train)
score_wall = model_wall.score(Xw_test, yw_test)
print("Wall design model accuracy:", score_wall)


Wall design model accuracy: 0.2


In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
import joblib

# 1) Load your dataset
df = pd.read_csv("WallDesign_data.csv")

# 2) Separate features & target
X_wall = df.drop("wall_design", axis=1)
y_wall = df["wall_design"]

# 3) Train/test split
Xw_train, Xw_test, yw_train, yw_test = train_test_split(
    X_wall, y_wall, test_size=0.2, random_state=42
)

# 4) Check for class imbalance
print("Before oversampling, training set class distribution:")
print(yw_train.value_counts())

# 5) Oversample minority classes using RandomOverSampler
ros = RandomOverSampler(random_state=42)
Xw_train_ros, yw_train_ros = ros.fit_resample(Xw_train, yw_train)

print("\nAfter oversampling, training set class distribution:")
print(yw_train_ros.value_counts())

# 6) Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10]
}

rfc = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1
)

grid_search.fit(Xw_train_ros, yw_train_ros)

print("\nBest Params found:", grid_search.best_params_)

# 7) Evaluate best model
best_model = grid_search.best_estimator_
score_wall = best_model.score(Xw_test, yw_test)
print(f"Wall design model accuracy after oversampling & tuning: {score_wall:.3f}")

# 8) Check classification report
y_pred = best_model.predict(Xw_test)
print("\nClassification Report:")
print(classification_report(yw_test, y_pred))

# 9) (Optional) Save the best model
joblib.dump(best_model, "wall_design_model_best.pkl")
print("✅ Model saved as wall_design_model_best.pkl!")




Before oversampling, training set class distribution:
wall_design
Flat Panel         37
Tubular            36
Vertical Tower     27
Modular Hexagon    20
Name: count, dtype: int64

After oversampling, training set class distribution:
wall_design
Tubular            37
Flat Panel         37
Modular Hexagon    37
Vertical Tower     37
Name: count, dtype: int64

Best Params found: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Wall design model accuracy after oversampling & tuning: 0.133

Classification Report:
                 precision    recall  f1-score   support

     Flat Panel       0.00      0.00      0.00         3
Modular Hexagon       0.00      0.00      0.00        11
        Tubular       0.40      0.44      0.42         9
 Vertical Tower       0.00      0.00      0.00         7

       accuracy                           0.13        30
      macro avg       0.10      0.11      0.11        30
   weighted avg       0.12      0.13      0.13        30

✅ Model sa

In [None]:

import pandas as pd
import numpy as np

# We want 150 rows
num_rows = 150
np.random.seed(42)

# Step 1: Randomly generate environment data
greywater_usage_liters = np.random.randint(100, 1000, num_rows)
co2_levels_ppm = np.random.randint(300, 700, num_rows)
humidity_percentage = np.random.randint(20, 80, num_rows)
temperature_celsius = np.random.randint(10, 35, num_rows)
sunlight_hours_average = np.random.uniform(3, 12, num_rows).round(1)
building_surface_area_sqm = np.random.randint(10, 100, num_rows)
wind_speed_kmh = np.random.uniform(5, 25, num_rows).round(1)

# Step 2: Prepare placeholders for target columns
algae_type = []
wall_design = []

# Step 3: Define logic-based assignment
for i in range(num_rows):
    temp = temperature_celsius[i]
    humid = humidity_percentage[i]
    co2 = co2_levels_ppm[i]
    wind = wind_speed_kmh[i]
    area = building_surface_area_sqm[i]

    # Assign ALGAE TYPE based on temp & humidity
    if temp > 30 and humid > 60:
        # Warm + humid => likely Spirulina
        chosen_algae = "Spirulina"
    elif temp < 20 and humid < 50:
        # Cool + dry => either Ulva or Dunaliella
        chosen_algae = np.random.choice(["Ulva", "Dunaliella"])
    else:
        # moderate conditions => Chlorella or Scenedesmus
        chosen_algae = np.random.choice(["Chlorella", "Scenedesmus"])

    algae_type.append(chosen_algae)

    # Assign WALL DESIGN based on wind speed & CO2
    if wind > 20 and co2 > 550:
        chosen_design = "Tubular"
    elif wind < 10 and co2 < 450:
        chosen_design = "Flat Panel"
    elif area < 30:
        chosen_design = "Vertical Tower"
    else:
        chosen_design = "Modular Hexagon"

    wall_design.append(chosen_design)

# Create DataFrame
df = pd.DataFrame({
    "greywater_usage_liters": greywater_usage_liters,
    "co2_levels_ppm": co2_levels_ppm,
    "humidity_percentage": humidity_percentage,
    "temperature_celsius": temperature_celsius,
    "sunlight_hours_average": sunlight_hours_average,
    "building_surface_area_sqm": building_surface_area_sqm,
    "wind_speed_kmh": wind_speed_kmh,
    "algae_type": algae_type,
    "wall_design": wall_design
})

# Save to CSV
csv_filename = "EcoSphereAI_dataset_150_logic.csv"
df.to_csv(csv_filename, index=False)

print(f"✅ Generated {num_rows} rows with logic-based assignments → {csv_filename}")
print(df.head(15))


✅ Generated 150 rows with logic-based assignments → EcoSphereAI_dataset_150_logic.csv
    greywater_usage_liters  co2_levels_ppm  humidity_percentage  \
0                      202             361                   47   
1                      535             515                   76   
2                      960             592                   44   
3                      370             398                   69   
4                      206             471                   42   
5                      171             659                   50   
6                      800             513                   49   
7                      120             334                   61   
8                      714             526                   54   
9                      221             400                   26   
10                     566             430                   79   
11                     314             556                   35   
12                     430             304 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from collections import Counter

# 1️⃣ Load the dataset
df = pd.read_csv("algae-sustainability-dataset.csv")

# 2️⃣ Encode categorical columns
label_encoder = LabelEncoder()
for column in df.columns:
    if df[column].dtype == "object":
        df[column] = label_encoder.fit_transform(df[column])

# 3️⃣ Separate features and target
X = df.drop("algae_type", axis=1)
y = df["algae_type"]

# 4️⃣ Show original class distribution
print("Original class distribution:", Counter(y))

# 5️⃣ Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 6️⃣ Show new balanced class distribution
print("Balanced class distribution:", Counter(y_resampled))

# 7️⃣ Split the balanced data for training
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

print("✅ Dataset cleaned, encoded, balanced, and split successfully!")



Original class distribution: Counter({0: 898, 2: 57, 1: 45})
Balanced class distribution: Counter({0: 898, 2: 898, 1: 898})
✅ Dataset cleaned, encoded, balanced, and split successfully!


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from collections import Counter

# 1️⃣ Load the dataset
df = pd.read_csv("algae-sustainability-dataset.csv")

# 2️⃣ Encode categorical columns
label_encoder = LabelEncoder()
for column in df.columns:
    if df[column].dtype == "object":
        df[column] = label_encoder.fit_transform(df[column])

# 3️⃣ Separate features and target
X = df.drop("algae_type", axis=1)
y = df["algae_type"]

# 4️⃣ Show original class distribution
print("Original class distribution:", Counter(y))

# 5️⃣ Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 6️⃣ Show new balanced class distribution
print("Balanced class distribution:", Counter(y_resampled))

# 7️⃣ Combine balanced data into a new DataFrame
balanced_df = pd.DataFrame(X_resampled, columns=X.columns)
balanced_df["algae_type"] = y_resampled

# 8️⃣ Save the cleaned dataset
balanced_df.to_csv("cleaned_balanced_algae_dataset.csv", index=False)

print("✅ Cleaned dataset saved as 'cleaned_balanced_algae_dataset.csv'!")


Original class distribution: Counter({0: 898, 2: 57, 1: 45})
Balanced class distribution: Counter({0: 898, 2: 898, 1: 898})
✅ Cleaned dataset saved as 'cleaned_balanced_algae_dataset.csv'!


In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from collections import Counter

# 1️⃣ Load the dataset
df = pd.read_csv("algae-sustainability-dataset.csv")

# 2️⃣ Encode only feature columns (leave the target as is)
label_encoder = LabelEncoder()
for column in df.columns:
    if df[column].dtype == "object" and column != "algae_type":
        df[column] = label_encoder.fit_transform(df[column])

# 3️⃣ Separate features and target
X = df.drop("algae_type", axis=1)
y = df["algae_type"]

# 4️⃣ Show original class distribution
print("Original class distribution:", Counter(y))

# 5️⃣ Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 6️⃣ Show new balanced class distribution
print("Balanced class distribution:", Counter(y_resampled))

# 7️⃣ Combine balanced data into a new DataFrame
balanced_df = pd.DataFrame(X_resampled, columns=X.columns)
balanced_df["algae_type"] = y_resampled  # Keep original algae type names

# 8️⃣ Save the cleaned dataset
balanced_df.to_csv("cleaned_balanced_algae_dataset.csv", index=False)

print("Cleaned dataset saved as 'cleaned_balanced_algae_dataset.csv'!")





Original class distribution: Counter({'Chlorella': 898, 'Ulva': 57, 'Spirulina': 45})
Balanced class distribution: Counter({'Chlorella': 898, 'Ulva': 898, 'Spirulina': 898})
Cleaned dataset saved as 'cleaned_balanced_algae_dataset.csv'!


In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('cleaned_balanced_algae_dataset (1).csv')

# Check the distribution of algae types
print(df['algae_type'].value_counts())


algae_type
Chlorella    898
Ulva         898
Spirulina    898
Name: count, dtype: int64


In [None]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = df.drop(['algae_type', 'wall_design'], axis=1)  # Inputs
y_algae = df['algae_type']  # Target 1 - Algae type
y_wall = df['wall_design']  # Target 2 - Wall design

# Split for algae type prediction
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(X, y_algae, test_size=0.2, random_state=42)

# Split for wall design prediction
X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(X, y_wall, test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Algae type prediction model
algae_model = RandomForestClassifier(n_estimators=500, max_depth=20, random_state=42)
algae_model.fit(X_train_a, y_train_a)

# Wall design prediction model
wall_model = RandomForestClassifier(n_estimators=500, max_depth=20, random_state=42)
wall_model.fit(X_train_w, y_train_w)

# Check accuracy for both models
algae_preds = algae_model.predict(X_test_a)
wall_preds = wall_model.predict(X_test_w)

print("Algae Type Model Accuracy:", accuracy_score(y_test_a, algae_preds))
print("Wall Design Model Accuracy:", accuracy_score(y_test_w, wall_preds))


Algae Type Model Accuracy: 1.0
Wall Design Model Accuracy: 0.9387755102040817


In [None]:
import pandas as pd

df = pd.read_csv('cleaned_balanced_algae_dataset.csv')

# Count unique values per target
print("Algae Type Distribution:")
print(df['algae_type'].value_counts())
print("\nWall Design Distribution:")
print(df['wall_design'].value_counts())


Algae Type Distribution:
algae_type
Chlorella    898
Ulva         898
Spirulina    898
Name: count, dtype: int64

Wall Design Distribution:
wall_design
1    1595
0     570
2     279
3     250
Name: count, dtype: int64


In [None]:
import pandas as pd

# Load dataset
data = pd.read_csv('algae_weather_greywater_dataset.csv')

# Display basic info
print("🔹 Dataset Info:")
print(data.info())
print("\n🔹 First 5 rows:")
print(data.head())

# Check class distribution
print("\n🔹 Algae Type Distribution:")
print(data['algae_type'].value_counts())

print("\n🔹 Wall Design Distribution:")
print(data['wall_design'].value_counts())

# Check for missing values
print("\n🔹 Missing Values:")
print(data.isnull().sum())


🔹 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   temperature     5000 non-null   float64
 1   humidity        5000 non-null   float64
 2   sunlight_hours  5000 non-null   float64
 3   rainfall        5000 non-null   float64
 4   ph              5000 non-null   float64
 5   nitrogen        5000 non-null   float64
 6   phosphorus      5000 non-null   float64
 7   bod             5000 non-null   float64
 8   algae_type      5000 non-null   object 
 9   algae_encoded   5000 non-null   int64  
dtypes: float64(8), int64(1), object(1)
memory usage: 390.8+ KB
None

🔹 First 5 rows:
   temperature   humidity  sunlight_hours  rainfall        ph  nitrogen  \
0    26.839287  51.316993        7.911123  9.497806  4.500000  8.694790   
1    22.003827  49.433191        5.885964  8.958699  7.137713  7.669985   
2    27.878600  65.997958 

KeyError: 'wall_design'

In [None]:
import pandas as pd
import numpy as np

# Define algae types and their optimal temperature ranges
algae_info = {
    'Haematococcus': {'opt_temp': (20, 25), 'opt_sunlight': (6, 8)},
    'Chlorella': {'opt_temp': (25, 30), 'opt_sunlight': (5, 7)},
    'Spirulina': {'opt_temp': (35, 37), 'opt_sunlight': (8, 10)},
    'Scenedesmus': {'opt_temp': (30, 35), 'opt_sunlight': (6, 9)},
    'Dunaliella': {'opt_temp': (25, 35), 'opt_sunlight': (7, 9)}
}

# Define panel types
panel_types = ['Flat Panel', 'Tubular Photobioreactor', 'Vertical Column', 'Open Pond']

# Function to determine the optimal panel type
def determine_panel_type(algae, temp, sunlight):
    opt_temp = algae_info[algae]['opt_temp']
    opt_sunlight = algae_info[algae]['opt_sunlight']

    if opt_temp[0] <= temp <= opt_temp[1] and opt_sunlight[0] <= sunlight <= opt_sunlight[1]:
        return 'Flat Panel'
    elif temp > opt_temp[1] and sunlight > opt_sunlight[1]:
        return 'Open Pond'
    elif temp < opt_temp[0] and sunlight < opt_sunlight[0]:
        return 'Vertical Column'
    else:
        return 'Tubular Photobioreactor'

# Generate dataset
np.random.seed(42)
data = []
for _ in range(2000):
    algae = np.random.choice(list(algae_info.keys()))
    temp = np.random.uniform(15, 40)  # Temperature range
    sunlight = np.random.uniform(4, 12)  # Sunlight hours range
    wall_size = np.random.uniform(10, 100)  # Wall size in m²
    algae_amount = np.random.uniform(0.1, 2.0) * wall_size  # Biomass in kg
    panel_type = determine_panel_type(algae, temp, sunlight)
    data.append([algae, algae_amount, wall_size, temp, sunlight, panel_type])

# Create DataFrame
df = pd.DataFrame(data, columns=['Algae Type', 'Algae Amount (kg)', 'Wall Size (m²)', 'Temperature (°C)', 'Sunlight Hours', 'Optimal Panel Type'])

# Save to CSV
df.to_csv('algae_panel_dataset.csv', index=False)


In [None]:
import pandas as pd
import numpy as np

# Define algae types with optimal conditions
algae_info = {
    'Haematococcus': {'opt_temp': (20, 25), 'opt_sunlight': (6, 8)},
    'Chlorella': {'opt_temp': (25, 30), 'opt_sunlight': (5, 7)},
    'Spirulina': {'opt_temp': (35, 37), 'opt_sunlight': (8, 10)},
    'Scenedesmus': {'opt_temp': (30, 35), 'opt_sunlight': (6, 9)},
    'Dunaliella': {'opt_temp': (25, 35), 'opt_sunlight': (7, 9)}
}

# Define four panel types
panel_types = ['Flat Panel', 'Tubular Photobioreactor', 'Vertical Column', 'Spiral Coil']

# Determine the best panel type scientifically
def determine_panel_type(algae, temp, sunlight):
    opt_temp = algae_info[algae]['opt_temp']
    opt_sunlight = algae_info[algae]['opt_sunlight']

    if opt_temp[0] <= temp <= opt_temp[1] and opt_sunlight[0] <= sunlight <= opt_sunlight[1]:
        return 'Flat Panel'
    elif temp > opt_temp[1] and sunlight > opt_sunlight[1]:
        return 'Tubular Photobioreactor'
    elif temp < opt_temp[0] and sunlight < opt_sunlight[0]:
        return 'Vertical Column'
    else:
        return 'Spiral Coil'

# Generate the dataset
np.random.seed(42)
data = []
for _ in range(2000):
    algae = np.random.choice(list(algae_info.keys()))
    temp = np.random.uniform(15, 40)  # Temperature range
    sunlight = np.random.uniform(4, 12)  # Sunlight hours range
    wall_size = np.random.uniform(10, 100)  # Wall size in m²
    algae_amount = np.random.uniform(0.1, 2.0) * wall_size  # Biomass in kg
    panel_type = determine_panel_type(algae, temp, sunlight)
    data.append([algae, algae_amount, wall_size, temp, sunlight, panel_type])

# Create DataFrame
df = pd.DataFrame(data, columns=['Algae Type', 'Algae Amount (kg)', 'Wall Size (m²)', 'Temperature (°C)', 'Sunlight Hours', 'Optimal Panel Type'])

# Save to CSV
df.to_csv('algae_panel_dataset_v2.csv', index=False)

print("✅ Dataset generated successfully!")


✅ Dataset generated successfully!


In [None]:
import pandas as pd
import numpy as np

# Algae species information
algae_info = {
    'Haematococcus': {'opt_temp': (20, 25), 'opt_sunlight': (6, 8), 'growth_rate': 0.05},
    'Chlorella': {'opt_temp': (25, 30), 'opt_sunlight': (5, 7), 'growth_rate': 0.10},
    'Spirulina': {'opt_temp': (35, 37), 'opt_sunlight': (8, 10), 'growth_rate': 0.12},
    'Scenedesmus': {'opt_temp': (30, 35), 'opt_sunlight': (6, 9), 'growth_rate': 0.08},
    'Dunaliella': {'opt_temp': (25, 35), 'opt_sunlight': (7, 9), 'growth_rate': 0.06}
}

# Panel types
panel_types = ['Flat Panel', 'Tubular', 'Vertical Column', 'V-Shaped']

# Function to determine optimal panel type
def determine_panel_type(algae, temp, sunlight):
    opt_temp = algae_info[algae]['opt_temp']
    opt_sunlight = algae_info[algae]['opt_sunlight']

    if opt_temp[0] <= temp <= opt_temp[1] and opt_sunlight[0] <= sunlight <= opt_sunlight[1]:
        return 'Flat Panel'
    elif temp > opt_temp[1] and sunlight > opt_sunlight[1]:
        return 'Tubular'
    elif temp < opt_temp[0] and sunlight < opt_sunlight[0]:
        return 'Vertical Column'
    else:
        return 'V-Shaped'

# Generate dataset
np.random.seed(42)
data = []
for _ in range(2000):
    algae = np.random.choice(list(algae_info.keys()))
    temp = np.random.uniform(15, 40)
    sunlight = np.random.uniform(4, 12)
    wall_size = np.random.uniform(10, 100)
    growth_rate = algae_info[algae]['growth_rate']
    algae_amount = growth_rate * wall_size * sunlight  # Simplified biomass estimation
    panel_type = determine_panel_type(algae, temp, sunlight)
    data.append([algae, algae_amount, wall_size, temp, sunlight, panel_type])

# Create DataFrame
df = pd.DataFrame(data, columns=['Algae Type', 'Algae Amount (kg)', 'Wall Size (m²)', 'Temperature (°C)', 'Sunlight Hours', 'Optimal Panel Type'])

# Save to CSV
df.to_csv('scientifically_backed_algae_dataset.csv', index=False)

print("Dataset generated successfully!")


Dataset generated successfully!


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
df_p = pd.read_csv('final_scientifically_backed_algae_dataset.csv')  # algae_type, algae_amount, wall_size, temp, sunlight, panel_type

Xp = df_p.drop(['Optimal Panel Type','Algae Type','Algae Type Encoded','Optimal Panel Type Encoded'], axis=1)
yp = df_p['Optimal Panel Type']
smote = SMOTE(random_state=42)
# SMOTE if needed
Xp_bal, yp_bal = smote.fit_resample(Xp, yp)

Xp_train, Xp_test, yp_train, yp_test = train_test_split(Xp_bal, yp_bal, test_size=0.2, random_state=42)

panel_model = LGBMClassifier(n_estimators=200, random_state=42)
panel_model.fit(Xp_train, yp_train)

yp_pred = panel_model.predict(Xp_test)
print("\nPanel Model Accuracy:", accuracy_score(yp_test, yp_pred))
print(classification_report(yp_test, yp_pred))

joblib.dump(panel_model, "panel_model.pkl")
print("✅ panel_model.pkl saved")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000378 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 4070, number of used features: 4
[LightGBM] [Info] Start training from score -1.390727
[LightGBM] [Info] Start training from score -1.385803
[LightGBM] [Info] Start training from score -1.389740
[LightGBM] [Info] Start training from score -1.378950

Panel Model Accuracy: 0.918467583497053
                 precision    recall  f1-score   support

     Flat Panel       0.90      0.98      0.94       259
        Tubular       0.92      0.93      0.92       254
       V-Shaped       0.91      0.79      0.85       258
Vertical Column       0.94      0.98      0.96       247

       accuracy                           0.92      1018
      macro avg       0.92      0.92      0.92      1018
   weighted avg       0.92      0.92      0.92      101

      Algae Type  Algae Amount (kg)  Wall Size (m²)  Temperature (°C)  \
0    Scenedesmus          50.367274       63.879264         38.767858   
1      Chlorella          39.269376       87.955853         18.899863   
2    Scenedesmus          11.105276       15.077042         18.571670   
3    Scenedesmus          31.825185       99.299040         38.463818   
4  Haematococcus          20.034009       48.875052         22.606056   

   Sunlight Hours Optimal Panel Type  
0        9.855952            Tubular  
1        4.464669    Vertical Column  
2        9.207108           V-Shaped  
3        4.006230           V-Shaped  
4        8.198051           V-Shaped  


KeyError: 'algae_type'

In [None]:
import pandas as pd

# Load the dataset
data = pd.read_csv('enhanced_algae_panel_dataset')

# Display the first few rows and column names
print(data.head())
print(data.columns)
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
encoder = LabelEncoder()

# Replace 'algae_type' with the correct column name
data['correct_column_name_for_algae'] = encoder.fit_transform(data['correct_column_name_for_algae'])
data['correct_column_name_for_panel'] = encoder.fit_transform(data['correct_column_name_for_panel'])

# Confirm data is now numeric
print(data.head())


      Algae Type  Algae Amount (kg)  Wall Size (m²)  Temperature (°C)  \
0    Scenedesmus          50.367274       63.879264         38.767858   
1      Chlorella          39.269376       87.955853         18.899863   
2    Scenedesmus          11.105276       15.077042         18.571670   
3    Scenedesmus          31.825185       99.299040         38.463818   
4  Haematococcus          20.034009       48.875052         22.606056   

   Sunlight Hours Optimal Panel Type  
0        9.855952            Tubular  
1        4.464669    Vertical Column  
2        9.207108           V-Shaped  
3        4.006230           V-Shaped  
4        8.198051           V-Shaped  
Index(['Algae Type', 'Algae Amount (kg)', 'Wall Size (m²)', 'Temperature (°C)',
       'Sunlight Hours', 'Optimal Panel Type'],
      dtype='object')


KeyError: 'correct_column_name_for_algae'