In [1]:
!pip install pandas scikit-learn fastapi uvicorn joblib catboost xgboost

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# Data Processing


In [4]:
def load_data():
    df = pd.read_csv("algae_weather_greywater_dataset.csv")
    return df

In [5]:
def clean_data(df):
    df.dropna(inplace=True)  # Remove missing values
    df.drop_duplicates(inplace=True)  # Remove duplicates

    return df

In [6]:
df = load_data()
df = clean_data(df)
print(df.head())

   temperature   humidity  sunlight_hours   rainfall        ph  nitrogen  \
0    29.598627  69.315060        5.597853   4.782249  8.515999  5.652022   
1    26.219825  64.910465        6.888061   2.159025  8.084218  9.269282   
2    27.840103  55.736831       11.079371   8.091583  5.943362  6.011351   
3    34.622127  94.334701        6.358203  10.535987  8.281318  8.488923   
4    22.698592  67.965610        7.666561   9.035893  8.722992  6.726856   

   phosphorus        bod   algae_type  algae_encoded  
0    3.486663  37.195444  Scenedesmus              3  
1    3.003581  26.249193  Scenedesmus              3  
2    2.578581  13.398934   Dunaliella              1  
3    2.138962  58.226260  Scenedesmus              3  
4    3.337438  56.047699  Scenedesmus              3  


In [7]:
X = df.drop(["algae_type", "algae_encoded"], axis=1)
y = df['algae_encoded']

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
algae_counts = y.value_counts()
algae_counts


algae_encoded
3    302
1    302
4    302
0    302
2    302
Name: count, dtype: int64

# RF model


In [8]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [9]:
y_pred = rf.predict(X_test)

In [10]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of algae type:", accuracy)

Accuracy of algae type: 0.9039735099337748


In [11]:
xgb_model = XGBClassifier(objective='multi:softmax', num_class=len(y.unique()))
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

In [12]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of algae type (XGBoost):", accuracy)

Accuracy of algae type (XGBoost): 0.9072847682119205


In [13]:
import pickle

In [14]:
with open('model_pkl', 'wb') as files:
    pickle.dump(xgb_model, files)