In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [None]:
# Read the air-quality CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='data/airquality.csv')

# Preview the first five rows to sanity-check the load.
print("🔹 Original Data:")
print(df.head(5))

In [None]:
# Count missing values in each column before any cleaning is applied.
nulls_before = df.isna().sum()
print("\n🔹 Before Cleaning (Nulls per column):")
print(nulls_before)

In [None]:
# Discard every row that has a missing value in at least one column.
df = df.dropna(axis=0, how='any')

# Re-count missing values; every column should now report zero.
print("\n🔹 After Cleaning (Nulls per column):")
print(df.isna().sum())

In [None]:
# Measurement columns, plus the AQI value and its bucket label, that go into
# the first subset used by the integration step.
columns = [
    'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3',
    'Benzene', 'Toluene', 'Xylene', 'AQI', 'AQI_Bucket',
]

# Second subset: the station identifier as a one-column DataFrame, so it can
# be concatenated column-wise with the first subset.
column_new = df['StationId']
subset_2 = column_new.to_frame(name='StationId')

In [None]:
# Integrate the two subsets side by side; rows are aligned on the shared index.
merged_subset = pd.concat(
    [df.loc[:, columns], subset_2],
    axis='columns',
)

In [None]:
# Show the head of the integrated frame; the bare last expression lets the
# notebook render it with its rich display.
print("\n🔹 Data After Integration:")
merged_subset.head(5)

In [None]:
# Count the negative readings in each particulate column before correction.
negatives_before = df[['PM2.5', 'PM10']].lt(0).sum()
print("\n🔹 Before Error Correction (Negative values count):")
print(negatives_before)

In [None]:
# Replace negative particulate readings (treated as errors by this notebook)
# with the mean of the valid, non-negative readings in the same column.
#
# The mean is computed once per column instead of once per negative element,
# and only from valid values, so the erroneous negatives do not bias the
# replacement. Using the valid-only mean also guarantees that the replacement
# value itself is non-negative.
for pollutant in ('PM2.5', 'PM10'):
    readings = df[pollutant]
    is_negative = readings < 0
    valid_mean = readings[~is_negative].mean()
    # mask() swaps in valid_mean where the condition holds and leaves every
    # other value untouched.
    df[pollutant] = readings.mask(is_negative, valid_mean)

In [None]:
# Re-count negative readings to confirm that the correction step took effect.
pm_columns = df.loc[:, ['PM2.5', 'PM10']]
print("\n🔹 After Error Correction (Negative values count):")
print(pm_columns.lt(0).sum())

In [None]:
# Distribution of the AQI category labels before they are encoded.
print("\n🔹 Before Encoding:")
print(df['AQI_Bucket'].value_counts())

# Map each category label to an integer code.
# NOTE(review): LabelEncoder numbers classes in sorted label order, which may
# not match any severity ordering of the buckets — confirm this is acceptable
# for the linear model trained later in the notebook.
le = LabelEncoder()
le.fit(df['AQI_Bucket'])
df['AQI_Bucket'] = le.transform(df['AQI_Bucket'])

# Same distribution, now keyed by the integer codes.
print("\n🔹 After Encoding:")
print(df['AQI_Bucket'].value_counts())

In [None]:
# Preview the columns that feed the model (features plus the AQI target).
model_preview = df.loc[:, ['PM2.5', 'PM10', 'AQI_Bucket', 'AQI']].head(5)
print("\n🔹 Final Data Before Train-Test Split:")
print(model_preview)

In [None]:
# Feature matrix and regression target.
# NOTE(review): AQI_Bucket looks like a category derived from AQI itself; if
# so, using it as a feature leaks the target — confirm against the dataset.
X = df.loc[:, ['PM2.5', 'PM10', 'AQI_Bucket']]
y = df.loc[:, 'AQI']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scaler for the PM2.5 and PM10 columns; it is fitted in a later cell.
sc = StandardScaler()

In [None]:
# Columns to standardize; the encoded AQI_Bucket column is left unscaled.
scaled_cols = ['PM2.5', 'PM10']

# The frames returned by train_test_split can be flagged by pandas as derived
# from X, so assigning columns on them may raise SettingWithCopyWarning.
# Work on explicit copies so the assignments below are unambiguous.
X_train = X_train.copy()
X_test = X_test.copy()

print("\n🔹 Before Scaling:")
print(X_train[scaled_cols].head())

# Fit the scaler on the training rows only, then reuse those statistics for
# the test rows so that no test-set information enters the fit.
X_train[scaled_cols] = sc.fit_transform(X_train[scaled_cols])
X_test[scaled_cols] = sc.transform(X_test[scaled_cols])

print("\n🔹 After Scaling:")
print(X_train[scaled_cols].head())

In [None]:
# Ordinary least-squares regression of AQI on the prepared features.
lr = LinearRegression()
# Kept as the last expression so the notebook displays the fitted estimator.
lr.fit(X=X_train, y=y_train)

In [None]:
# Predict AQI for the held-out rows and score the predictions.
prediction = lr.predict(X_test)
mse = mean_squared_error(y_test, prediction)
r2 = r2_score(y_test, prediction)

print("\n🔹 Model Evaluation:")
print("MSE:", mse)
print("R2 Score:", r2)