In [1]:
!pip install pandas numpy scikit-learn fastapi uvicorn joblib

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

# Data Processing


In [3]:
def load_data():
    df = pd.read_csv("algae-sustainability-dataset.csv")
    return df

In [4]:
def clean_data(df):
    df.dropna(inplace=True)  # Remove missing values
    df.drop_duplicates(inplace=True)  # Remove duplicates
    return df

In [5]:
df = load_data()
df = clean_data(df)
print(df.head())

   greywater_usage_liters  co2_levels_ppm  humidity_percentage  \
0                     202             637                   30   
1                     535             585                   52   
2                     960             634                   25   
3                     370             646                   66   
4                     206             607                   34   

   temperature_celsius  sunlight_hours_average  building_surface_area_sqm  \
0                   29                     8.9                         22   
1                   29                    10.6                         36   
2                   13                     9.2                         16   
3                   15                     6.9                         31   
4                   16                     9.1                         83   

   wind_speed_kmh  wall_design algae_type  
0             7.9            3  Chlorella  
1             5.4            1  Chlorella  
2       

In [6]:
# Split the data into features (X) and target (y)
X = df.drop(["algae_type", "wall_design"], axis=1)
y = df[['algae_type','wall_design']]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# RF model


In [8]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

TypeError: '<' not supported between instances of 'int' and 'str'

In [None]:
y_pred = rf.predict(X_test)

In [None]:
accuracy_algae_type = accuracy_score(y_test['algae_type'], y_pred[:, 0])
accuracy_wall_design = accuracy_score(y_test['wall_design'], y_pred[:, 1])
print("Accuracy of algae type:", accuracy_algae_type)
print("Accuracy for wall_design",accuracy_wall_design)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv('algae_data.csv')

# Separate features and target variables
X = df[['greywater_usage_liters', 'co2_levels_ppm', 'humidity_percentage', 
        'temperature_celsius', 'sunlight_hours_average', 
        'building_surface_area_sqm', 'wind_speed_kmh']]

y_algae = df['algae_type']
y_wall = df['wall_design']

# Encode the target variables
algae_encoder = LabelEncoder()
wall_encoder = LabelEncoder()

y_algae_encoded = algae_encoder.fit_transform(y_algae)
y_wall_encoded = wall_encoder.fit_transform(y_wall)

# Split the data
X_train, X_test, y_algae_train, y_algae_test, y_wall_train, y_wall_test = train_test_split(
    X, y_algae_encoded, y_wall_encoded, test_size=0.2, random_state=42)

# Train Random Forest model for algae type
rf_algae = RandomForestClassifier(n_estimators=100, random_state=42)
rf_algae.fit(X_train, y_algae_train)

# Train Random Forest model for wall design
rf_wall = RandomForestClassifier(n_estimators=100, random_state=42)
rf_wall.fit(X_train, y_wall_train)

# Evaluate the models
algae_pred = rf_algae.predict(X_test)
wall_pred = rf_wall.predict(X_test)

print("Algae Type Prediction Accuracy:", accuracy_score(y_algae_test, algae_pred))
print("\nAlgae Type Classification Report:")
print(classification_report(y_algae_test, algae_pred, 
                           target_names=algae_encoder.classes_))

print("\nWall Design Prediction Accuracy:", accuracy_score(y_wall_test, wall_pred))
print("\nWall Design Classification Report:")
print(classification_report(y_wall_test, wall_pred, 
                           target_names=wall_encoder.classes_))

# Feature importance
plt.figure(figsize=(12, 10))

plt.subplot(2, 1, 1)
feat_importances_algae = pd.Series(rf_algae.feature_importances_, index=X.columns)
feat_importances_algae.sort_values().plot(kind='barh')
plt.title('Feature Importance for Algae Type Prediction')

plt.subplot(2, 1, 2)
feat_importances_wall = pd.Series(rf_wall.feature_importances_, index=X.columns)
feat_importances_wall.sort_values().plot(kind='barh')
plt.title('Feature Importance for Wall Design Prediction')

plt.tight_layout()
plt.savefig('feature_importance.png')
plt.show()

# Create a recommendation function for new data
def recommend_algae_wall(city, greywater, co2, humidity, temperature, sunlight, building_area, wind_speed):
    """
    Recommend the best algae type and wall design based on environmental factors.
    """
    # Format input as a dataframe row
    input_data = pd.DataFrame({
        'greywater_usage_liters': [greywater],
        'co2_levels_ppm': [co2],
        'humidity_percentage': [humidity],
        'temperature_celsius': [temperature],
        'sunlight_hours_average': [sunlight],
        'building_surface_area_sqm': [building_area],
        'wind_speed_kmh': [wind_speed]
    })
    
    # Predict algae type and wall design
    algae_prediction = algae_encoder.inverse_transform(rf_algae.predict(input_data))[0]
    wall_prediction = wall_encoder.inverse_transform(rf_wall.predict(input_data))[0]
    
    # Get prediction probabilities
    algae_probs = rf_algae.predict_proba(input_data)[0]
    wall_probs = rf_wall.predict_proba(input_data)[0]
    
    # Find top 3 algae recommendations with probabilities
    top_algae_indices = np.argsort(algae_probs)[::-1][:3]
    top_algae = [(algae_encoder.inverse_transform([idx])[0], 
                  algae_probs[idx] * 100) for idx in top_algae_indices]
    
    # Find top 3 wall design recommendations with probabilities
    top_wall_indices = np.argsort(wall_probs)[::-1][:3]
    top_wall = [(wall_encoder.inverse_transform([idx])[0], 
                wall_probs[idx] * 100) for idx in top_wall_indices]
    
    print(f"\nRecommendations for {city}:")
    print("Environmental Conditions:")
    print(f"  Greywater usage: {greywater} liters")
    print(f"  CO2 levels: {co2} ppm")
    print(f"  Humidity: {humidity}%")
    print(f"  Temperature: {temperature}°C")
    print(f"  Sunlight: {sunlight} hours/day")
    print(f"  Building surface area: {building_area} sq.m")
    print(f"  Wind speed: {wind_speed} km/h")
    
    print("\nTop Algae Type Recommendations:")
    for algae, prob in top_algae:
        print(f"  {algae}: {prob:.1f}% confidence")
    
    print("\nTop Wall Design Recommendations:")
    for wall, prob in top_wall:
        print(f"  {wall}: {prob:.1f}% confidence")
    
    return algae_prediction, wall_prediction

# Example usage
city = "New_York"
greywater = 240
co2 = 395
humidity = 68
temperature = 19.5
sunlight = 5.2
building_area = 1050
wind_speed = 14.5

recommend_algae_wall(city, greywater, co2, humidity, temperature, sunlight, building_area, wind_speed)

# Save the models for future use
import joblib
joblib.dump(rf_algae, 'algae_recommendation_model.pkl')
joblib.dump(rf_wall, 'wall_recommendation_model.pkl')
joblib.dump(algae_encoder, 'algae_encoder.pkl')
joblib.dump(wall_encoder, 'wall_encoder.pkl')