In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor


import warnings

In [11]:
data = 'data_date.csv'
df = pd.read_csv(data)

In [12]:
df.head()

Unnamed: 0,Date,Country,Status,AQI Value
0,2022-07-21,Albania,Good,14
1,2022-07-21,Algeria,Moderate,65
2,2022-07-21,Andorra,Moderate,55
3,2022-07-21,Angola,Unhealthy for Sensitive Groups,113
4,2022-07-21,Argentina,Moderate,63


In [13]:
print(df.isnull().sum())
df.dropna(subset=['AQI Value'], inplace=True) #Drop  rows with missing AQI or fill as needed
df.fillna(df.mean(numeric_only = True), inplace = True) #Fill other missing values with mean
print(df.columns.tolist())

Date         0
Country      0
Status       0
AQI Value    0
dtype: int64
['Date', 'Country', 'Status', 'AQI Value']


In [14]:
# Print actual column names
print("Columns:", df.columns.tolist())

# Identify the AQI column
aqi_column = None
for col in df.columns:
    if 'aqi' in col.lower():
        aqi_column = col
        break

if aqi_column:
    df.dropna(subset=[aqi_column], inplace=True)
    df.fillna(df.mean(numeric_only=True), inplace=True)
    print("Cleaned Data Columns:", df.columns)
else:
    print("AQI column not found in the dataset.")


Columns: ['Date', 'Country', 'Status', 'AQI Value']
Cleaned Data Columns: Index(['Date', 'Country', 'Status', 'AQI Value'], dtype='object')


In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 1. Extract features from Date
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')  # Convert to datetime
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

# 2. Encode categorical variables: Country and Status
label_enc_country = LabelEncoder()
label_enc_status = LabelEncoder()

df['Country_encoded'] = label_enc_country.fit_transform(df['Country'].astype(str))
df['Status_encoded'] = label_enc_status.fit_transform(df['Status'].astype(str))

# 3. Define target and features
x = df[['Year', 'Month', 'Day', 'Country_encoded', 'Status_encoded']]
y = df['AQI Value']

# 4. Train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# 5. Feature scaling
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

print("Preprocessing completed successfully!")


Preprocessing completed successfully!


In [16]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
#Initialize the models
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(random_state = 42),
    "Decision Tree": DecisionTreeRegressor(random_state = 42)
}
#train and evaluate each model
for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"\n {name}")
    print(f"R2 Score :{r2:.4f}")
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")



 LinearRegression
R2 Score :0.6305
MAE: 16.37
RMSE: 28.57

 RandomForest
R2 Score :0.9184
MAE: 8.58
RMSE: 13.43

 Decision Tree
R2 Score :0.8397
MAE: 10.58
RMSE: 18.82
