In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Read the city/day air-quality CSV into a DataFrame
DATA_PATH = '/content/city_day.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Dataset dimensions as a (rows, columns) tuple
df.shape

(29531, 16)

In [None]:
# Count the distinct values in each column (NaN is excluded by default),
# which helps tell categorical-like columns apart from continuous ones
df.nunique()

City             26
Date           2009
PM2.5         11716
PM10          12571
NO             5776
NO2            7404
NOx            8156
NH3            5922
CO             1779
SO2            4761
O3             7699
Benzene        1873
Toluene        3608
Xylene         1561
AQI             829
AQI_Bucket        6
dtype: int64

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI
count,24933.0,18391.0,25949.0,25946.0,25346.0,19203.0,27472.0,25677.0,25509.0,23908.0,21490.0,11422.0,24850.0
mean,67.450578,118.127103,17.57473,28.560659,32.309123,23.483476,2.248598,14.531977,34.49143,3.28084,8.700972,3.070128,166.463581
std,64.661449,90.60511,22.785846,24.474746,31.646011,25.684275,6.962884,18.133775,21.694928,15.811136,19.969164,6.323247,140.696585
min,0.04,0.01,0.02,0.01,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.0,13.0
25%,28.82,56.255,5.63,11.75,12.82,8.58,0.51,5.67,18.86,0.12,0.6,0.14,81.0
50%,48.57,95.68,9.89,21.69,23.52,15.85,0.89,9.16,30.84,1.07,2.97,0.98,118.0
75%,80.59,149.745,19.95,37.62,40.1275,30.02,1.45,15.22,45.57,3.08,9.15,3.35,208.0
max,949.99,1000.0,390.68,362.21,467.63,352.89,175.81,193.86,257.73,455.03,454.85,170.37,2049.0


In [None]:
# Column names, non-null counts and dtypes; the non-null counts show how much
# data is missing in each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        29531 non-null  object 
 1   Date        29531 non-null  object 
 2   PM2.5       24933 non-null  float64
 3   PM10        18391 non-null  float64
 4   NO          25949 non-null  float64
 5   NO2         25946 non-null  float64
 6   NOx         25346 non-null  float64
 7   NH3         19203 non-null  float64
 8   CO          27472 non-null  float64
 9   SO2         25677 non-null  float64
 10  O3          25509 non-null  float64
 11  Benzene     23908 non-null  float64
 12  Toluene     21490 non-null  float64
 13  Xylene      11422 non-null  float64
 14  AQI         24850 non-null  float64
 15  AQI_Bucket  24850 non-null  object 
dtypes: float64(13), object(3)
memory usage: 3.6+ MB


In [None]:
# Fill missing values in the target variable with the column mean.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# df['AQI']: the inplace form acts on an intermediate object (chained
# assignment), which raises a FutureWarning in pandas 2.x and leaves df
# unchanged once Copy-on-Write is enabled.
# NOTE(review): mean-imputing the target means the model is trained and scored
# on synthetic labels; consider dropping rows with a missing AQI instead.
df['AQI'] = df['AQI'].fillna(df['AQI'].mean())

In [None]:
# Remove the pollutant columns that are not used as model inputs, together
# with the AQI_Bucket label column
unused_columns = ['NH3', 'Benzene', 'Toluene', 'Xylene', 'AQI_Bucket']
df = df.drop(columns=unused_columns)

In [None]:
# Preview the first rows after imputing AQI and dropping the unused columns
df.head()

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,CO,SO2,O3,AQI
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,0.92,27.64,133.36,166.463581
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,0.97,24.55,34.06,166.463581
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,17.4,29.07,30.7,166.463581
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,1.7,18.59,36.08,166.463581
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,22.1,39.33,39.31,166.463581


In [None]:
# Select the pollutant measurements as predictors and AQI as the response
feature_columns = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'O3']
target_column = 'AQI'
X = df[feature_columns]
y = df[target_column]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Impute missing feature values using statistics learned from the training
# split only. Filling the test split with its own column means would leak
# test-set information into preprocessing and give the two splits different
# fill values, so the training means are applied to both splits.
# Reassigning (rather than fillna(..., inplace=True)) avoids mutating frames
# that were derived from X, which can trigger SettingWithCopyWarning.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

In [None]:
# Report the (rows, columns) shape of each feature split, training set first
for split_features in (X_train, X_test):
    print(split_features.shape)

(23624, 8)
(5907, 8)


In [None]:
# One-hot encode the feature matrix; categories unseen during fit are ignored
# at transform time (handle_unknown='ignore').
# NOTE(review): X holds only the eight numeric pollutant columns selected
# earlier, so there are no categorical variables to encode here, and X_encoded
# is not used by any later cell visible in this notebook — confirm whether
# this cell is still needed.
encoder = OneHotEncoder(handle_unknown='ignore')
X_encoded = encoder.fit_transform(X)

In [None]:
# Instantiate a linear regression model with scikit-learn's default settings
reg = LinearRegression()

In [None]:
# Fit the regression on the training features and the training target
reg.fit(X_train, y_train)

In [None]:
# Predict AQI for the held-out test features
y_pred = reg.predict(X_test)

In [None]:
# R2 (coefficient of determination) of the predictions on the test set
r2_score(y_test,y_pred)

0.79408101368113

In [None]:
# Fitted coefficients, one per feature in the column order of X
# (PM2.5, PM10, NO, NO2, NOx, CO, SO2, O3)
reg.coef_

array([ 0.99276511,  0.26415241, -0.07027722,  0.40674534,  0.1465037 ,
       10.23779984,  0.65622154,  0.19177475])

In [None]:
# Fitted intercept (constant term) of the linear regression model
reg.intercept_

14.211661775436397

In [None]:
# Evaluate the test-set predictions: compute both metrics first, then report them
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE", mse)
print("R2 score", r2)

MSE 3247.2871743817623
R2 score 0.79408101368113
