In [None]:
import pandas as pd
import numpy as np
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the semicolon-delimited CSV into df_raw, show a quick overview of it,
# and report how many values are missing in each column.
DATA_FILE = 'afa2e701598d20110228.csv'

try:
    df_raw = pd.read_csv(DATA_FILE, sep=';')
except FileNotFoundError:
    # Everything below (and every later cell) needs df_raw. Swallowing the
    # error would only defer the failure to a confusing NameError, or worse,
    # silently reuse a stale df_raw left over in the kernel. Report the
    # problem and re-raise so the run stops here.
    print(f"Error: '{DATA_FILE}' not found.")
    raise

# Overview of the loaded frame: first rows, schema, size, summary statistics.
print(df_raw.head())
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df_raw.info()
print(df_raw.shape)
print(df_raw.describe().T)

# Per-column count of missing values and the same expressed as a percentage
# of the total number of rows.
missing_val = df_raw.isnull().sum()
missing_per = (missing_val / len(df_raw)) * 100
print("\nMissing Values : \n", missing_val)
print("\nMissing Value percentage : \n", missing_per)

   id        date    NH4  BSK5  Suspended     O2    NO3    NO2    SO4    PO4  \
0   1  17.02.2000  0.330  2.77       12.0  12.30   9.50  0.057  154.0  0.454   
1   1  11.05.2000  0.044  3.00       51.6  14.61  17.75  0.034  352.0  0.090   
2   1  11.09.2000  0.032  2.10       24.5   9.87  13.80  0.173  416.0  0.200   
3   1  13.12.2000  0.170  2.23       35.6  12.40  17.13  0.099  275.2  0.377   
4   1  02.03.2001  0.000  3.03       48.8  14.69  10.00  0.065  281.6  0.134   

       CL  
0   289.5  
1  1792.0  
2  2509.0  
3  1264.0  
4  1462.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2861 entries, 0 to 2860
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         2861 non-null   int64  
 1   date       2861 non-null   object 
 2   NH4        2858 non-null   float64
 3   BSK5       2860 non-null   float64
 4   Suspended  2845 non-null   float64
 5   O2         2858 non-null   float64
 6   NO3        2860

In [None]:
# Convert the 'date' strings to datetime64. format='mixed' infers the format
# element by element; dayfirst=True makes ambiguous values resolve as
# day-first (the preview rows look like DD.MM.YYYY).
df_raw['date'] = pd.to_datetime(df_raw['date'], format='mixed', dayfirst=True)
# info() prints its own report and returns None; calling it directly avoids
# the stray "None" line that print(df_raw.info()) produced.
df_raw.info()

# Order the rows by 'id' and, within each id, chronologically.
df_raw = df_raw.sort_values(by=['id', 'date'])
print(df_raw.head())

# Derive calendar columns from the parsed date.
df_raw['year'] = df_raw['date'].dt.year
df_raw['month'] = df_raw['date'].dt.month
print(df_raw.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2861 entries, 0 to 2860
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   id         2861 non-null   int64         
 1   date       2861 non-null   datetime64[ns]
 2   NH4        2858 non-null   float64       
 3   BSK5       2860 non-null   float64       
 4   Suspended  2845 non-null   float64       
 5   O2         2858 non-null   float64       
 6   NO3        2860 non-null   float64       
 7   NO2        2858 non-null   float64       
 8   SO4        2812 non-null   float64       
 9   PO4        2833 non-null   float64       
 10  CL         2812 non-null   float64       
dtypes: datetime64[ns](1), float64(9), int64(1)
memory usage: 246.0 KB
None
   id       date    NH4  BSK5  Suspended     O2    NO3    NO2    SO4    PO4  \
0   1 2000-02-17  0.330  2.77       12.0  12.30   9.50  0.057  154.0  0.454   
1   1 2000-05-11  0.044  3.00       51.6  14.61 

## Week 2: Data Cleaning, Model Training, and Evaluation

In [14]:
print(df_raw.columns)

# Columns that must be present in a row for it to be kept.
# Kept as an ordered list rather than a set: it is later passed to
# DataFrame.dropna(subset=...), which is documented to take a column label or
# a sequence of labels, and a list has a deterministic iteration order.
pollutants = ['O2', 'NO3', 'NO2', 'SO4', 'PO4', 'CL']

Index(['id', 'date', 'NH4', 'BSK5', 'Suspended', 'O2', 'NO3', 'NO2', 'SO4',
       'PO4', 'CL', 'year', 'month'],
      dtype='object')


In [15]:
# Discard every row that has a missing value in any of the pollutant columns;
# nulls in the remaining columns do not cause a row to be dropped here.
df_raw = df_raw.dropna(axis=0, how='any', subset=pollutants)
df_raw.head()

Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL,year,month
0,1,2000-02-17,0.33,2.77,12.0,12.3,9.5,0.057,154.0,0.454,289.5,2000,2
1,1,2000-05-11,0.044,3.0,51.6,14.61,17.75,0.034,352.0,0.09,1792.0,2000,5
2,1,2000-09-11,0.032,2.1,24.5,9.87,13.8,0.173,416.0,0.2,2509.0,2000,9
3,1,2000-12-13,0.17,2.23,35.6,12.4,17.13,0.099,275.2,0.377,1264.0,2000,12
4,1,2001-03-02,0.0,3.03,48.8,14.69,10.0,0.065,281.6,0.134,1462.0,2001,3


In [16]:
# Number of missing values remaining in each column of df_raw.
df_raw.isna().sum()

Unnamed: 0,0
id,0
date,0
NH4,2
BSK5,0
Suspended,2
O2,0
NO3,0
NO2,0
SO4,0
PO4,0


In [17]:
# Input columns fed to the model.
features = [
    'year',
    'month',
    'NH4',
    'BSK5',
    'Suspended',
]

# Output columns the model predicts, one regressor output per entry.
targets = [
    'O2',
    'NO3',
    'NO2',
    'SO4',
    'PO4',
    'CL',
]

In [18]:
# Missing-value count for just the columns used by the model
# (inputs first, then outputs).
model_columns = features + targets
print(df_raw[model_columns].isna().sum())

year         0
month        0
NH4          2
BSK5         0
Suspended    2
O2           0
NO3          0
NO2          0
SO4          0
PO4          0
CL           0
dtype: int64


In [19]:
# Replace the remaining gaps in the two incomplete feature columns with that
# column's own mean.
# NOTE(review): the means are computed over every row, i.e. before the
# train/test split further down, so test rows contribute to the imputed
# values. Confirm this is acceptable.
column_means = {name: df_raw[name].mean() for name in ('NH4', 'Suspended')}
for name, mean_value in column_means.items():
    df_raw[name] = df_raw[name].fillna(mean_value)

# Re-check: every model column should now report zero missing values.
model_columns = features + targets
print(df_raw[model_columns].isna().sum())

year         0
month        0
NH4          0
BSK5         0
Suspended    0
O2           0
NO3          0
NO2          0
SO4          0
PO4          0
CL           0
dtype: int64


In [20]:
# Print df_raw's index range, column list, per-column non-null counts and
# dtypes, to inspect the frame after the row drop and imputation above.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2780 entries, 0 to 2860
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   id         2780 non-null   int64         
 1   date       2780 non-null   datetime64[ns]
 2   NH4        2780 non-null   float64       
 3   BSK5       2780 non-null   float64       
 4   Suspended  2780 non-null   float64       
 5   O2         2780 non-null   float64       
 6   NO3        2780 non-null   float64       
 7   NO2        2780 non-null   float64       
 8   SO4        2780 non-null   float64       
 9   PO4        2780 non-null   float64       
 10  CL         2780 non-null   float64       
 11  year       2780 non-null   int32         
 12  month      2780 non-null   int32         
dtypes: datetime64[ns](1), float64(9), int32(2), int64(1)
memory usage: 282.3 KB


In [21]:
# Separate model inputs from model outputs.
X, y = df_raw[features], df_raw[targets]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each partition.
for label, partition in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {label}:", partition.shape)

Shape of X_train: (2224, 5)
Shape of X_test: (556, 5)
Shape of y_train: (2224, 6)
Shape of y_test: (556, 6)


In [22]:
# Base learner: a 100-tree random forest with a fixed seed.
rf_regressor = RandomForestRegressor(random_state=42, n_estimators=100)

# MultiOutputRegressor wraps the base learner so that it is fitted once per
# target column.
multioutput_regressor = MultiOutputRegressor(estimator=rf_regressor)
multioutput_regressor.fit(X_train, y_train)

In [23]:
# Predict all targets for the held-out rows.
y_pred = multioutput_regressor.predict(X_test)

# Aggregate scores over all targets (scikit-learn's default multi-output
# aggregation is used for both metrics).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R2): {r2}",
    sep="\n",
)

Mean Squared Error (MSE): 13824.347185328821
R-squared (R2): 0.020981853173880294


In [26]:
# Score each target separately: column k of y_pred holds the predictions for
# targets[k], and y_test has the same columns in the same order.
separator = "-" * 20
for target, predicted in zip(targets, y_pred.T):
    actual = y_test[target]
    mse = mean_squared_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    print(f"Metrics for {target}:")
    print(f"  Mean Squared Error (MSE): {mse}")
    print(f"  R-squared (R2): {r2}")
    print(separator)

Metrics for O2:
  Mean Squared Error (MSE): 17.28261516826043
  R-squared (R2): 0.20911997091226497
--------------------
Metrics for NO3:
  Mean Squared Error (MSE): 36.471245011430575
  R-squared (R2): 0.02800819437611035
--------------------
Metrics for NO2:
  Mean Squared Error (MSE): 0.33861324760665473
  R-squared (R2): -1.5353065408741178
--------------------
Metrics for SO4:
  Mean Squared Error (MSE): 2630.2442087134987
  R-squared (R2): 0.35865283185869523
--------------------
Metrics for PO4:
  Mean Squared Error (MSE): 0.18551572066780572
  R-squared (R2): 0.6733309433649184
--------------------
Metrics for CL:
  Mean Squared Error (MSE): 80261.56091411147
  R-squared (R2): 0.3920857194054107
--------------------


In [24]:
# Run the fitted model on the held-out inputs and preview the first five
# prediction rows (one column per target).
predictions = multioutput_regressor.predict(X=X_test)
print(predictions[0:5])

[[8.13840e+00 6.77460e+00 2.45780e-01 3.49049e+01 6.46700e-01 3.39254e+01]
 [9.53620e+00 2.28780e+00 8.97300e-02 9.02796e+01 2.05370e-01 5.06413e+01]
 [7.64070e+00 1.83960e+00 1.34580e-01 2.03300e+01 2.26210e-01 1.83390e+01]
 [7.44480e+00 2.32900e+00 5.07700e-02 3.25306e+01 6.20220e-01 2.63612e+01]
 [1.36891e+01 2.01300e+00 2.48800e-02 9.68134e+01 7.48600e-02 5.46788e+01]]


In [27]:
# NOTE(review): consider moving this import into the notebook's top import
# cell so all dependencies are declared in one place.
import joblib

# Persist the fitted model together with the list of input column names, so
# whoever loads the model can rebuild the feature frame in the same order.
artifacts = (
    (multioutput_regressor, 'Predict_Model.pkl'),
    (features, 'Columns.pkl'),
)
for obj, filename in artifacts:
    joblib.dump(obj, filename)

print("Model and column list saved successfully.")

Model and column list saved successfully.
