In [None]:
import pandas as pd

# Read the raw (uncleaned) MPCB CSV from the working directory
df = pd.read_csv('2001_2023_MPCB_Unclean.csv')

# Collect every column whose header contains 'Unnamed', then remove them in one step
unnamed_columns = list(filter(lambda name: 'Unnamed' in name, df.columns))
df = df.drop(columns=unnamed_columns)

In [None]:
# Preview the first rows of the loaded table to check that the columns look as expected
df.head()

In [None]:
# Inspect the last 10 rows as well, to see what the end of the file contains
df.tail(10)

In [None]:
# Summary of the frame: column names, dtypes and non-null counts
df.info()

In [None]:
# (rows, columns) right after loading - the baseline before any rows are removed
df.shape

In [None]:
# Number of missing values in each column of the loaded data
df.isnull().sum()

In [None]:
# Remove the 'Month_name' column; rebinding df avoids mutating the frame in place
df = df.drop(columns=['Month_name'])

In [None]:
# Flag every row in which ANY cell (compared as text) contains one of these labels.
# The match is a substring/regex search over all columns.
# NOTE(review): a legitimate value that merely contains 'Min', 'Max', 'Avg' or 'Total'
# would be removed too - confirm against the data.
summary_pattern = 'Min Max Avg|Max Avg|Min|Max|Avg|Total'
cells_as_text = df.astype(str)
cell_matches = cells_as_text.apply(
    lambda column: column.str.contains(summary_pattern, na=False)
)
condition = cell_matches.any(axis=1)

# Keep only the rows that were not flagged
df = df.loc[~condition]

In [None]:
# Size after removing the rows that matched the Min/Max/Avg/Total pattern
df.shape

In [None]:
# The five right-most columns of the frame (selected by position, not by name)
last_5_columns = df.columns[-5:]

# Keep a row only when at least one of those five columns holds a value
has_any_value = df[last_5_columns].notna().any(axis=1)
df = df[has_any_value]

In [None]:
# Missing values per column after dropping rows that were empty in the last five columns
df.isnull().sum()

In [None]:
# Size after dropping the rows that were empty in the last five columns
df.shape

In [None]:
columns_to_fill = ['SO2', 'Nox', 'RSPM', 'SPM', 'AQI']

# df is the product of earlier row filtering; take an explicit copy so the column
# assignments below write to an independent frame (avoids SettingWithCopyWarning).
df = df.copy()

for col in columns_to_fill:
    # Convert to numeric; entries that cannot be parsed become NaN
    df[col] = pd.to_numeric(df[col], errors='coerce')

    # Mean of this column within each 'Places' group, aligned row-by-row with df
    group_means = df.groupby('Places')[col].transform('mean')

    # Fill the gaps with the group mean and ASSIGN the result back.
    # `df[col].fillna(..., inplace=True)` acts on a temporary column selection:
    # pandas 2.x warns about it and with Copy-on-Write it leaves df unchanged.
    # Values are then rounded and stored as nullable 'Int64', so anything still
    # missing (no group mean available) stays <NA> instead of raising.
    df[col] = df[col].fillna(group_means).round().astype('Int64')

In [None]:
# Re-check missing values per column after the per-place mean imputation
df.isnull().sum()

In [None]:
# Make sure AQI is numeric before aggregating (unparseable entries become NaN)
df['AQI'] = pd.to_numeric(df['AQI'], errors='coerce')

# Average AQI per place, shaped as a two-column table: Place, Mean_AQI
mean_aqi_by_place = (
    df.groupby('Places')['AQI']
    .mean()
    .rename('Mean_AQI')
    .rename_axis('Place')
    .reset_index()
)

# Show the table as plain text
print(mean_aqi_by_place)

In [None]:
# AQI is the prediction target, so keep only the rows where it is present
df = df[df['AQI'].notna()]

In [None]:
# Size after dropping the rows without an AQI value
df.shape

In [None]:
# Missing values still left in each column after dropping rows without AQI
df.isnull().sum()

In [None]:
# Parse 'Date' as day-month-year; values that do not fit the format become NaT
df = df.assign(Date=pd.to_datetime(df['Date'], format='%d-%m-%Y', errors='coerce'))


In [None]:
# Parsing with errors='coerce' only marks bad dates as missing, so no rows are removed yet
df.shape

In [None]:
# Discard the rows whose 'Date' is missing (including dates that failed to parse)
df = df.dropna(subset=['Date'])


In [None]:
# Size after dropping the rows with a missing 'Date'
df.shape

In [None]:
# Print the per-column missing-value counts after the AQI and Date clean-up
print(df.isnull().sum())

In [None]:
# Remove rows that are exact duplicates of an earlier row (first occurrence is kept)
df = df.drop_duplicates()


In [None]:
# Size after removing duplicate rows
df.shape

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One boxplot per pollutant column plus AQI, drawn on a shared axis to expose outliers
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df[['SO2', 'Nox', 'RSPM', 'SPM', 'AQI']], ax=ax)
ax.set_title('Boxplots of Features')
plt.show()

In [None]:
# Shape shown again for reference; the plotting cell above does not modify df
df.shape

In [None]:

# Draw one histogram per column; DataFrame.hist creates its own figure and axes grid
hist_axes = df.hist(bins=30, figsize=(8, 6), edgecolor='black')

# Put the overall title on the figure that owns those axes
hist_axes.flat[0].get_figure().suptitle('Feature Distributions')
plt.show()


In [None]:
# 2x2 grid of scatter plots: each pollutant on the x-axis against AQI on the y-axis
fig, axes = plt.subplots(2, 2, figsize=(8, 6))
for ax, column in zip(axes.flat, ['SO2', 'Nox', 'RSPM', 'SPM']):
    sns.scatterplot(x=df[column], y=df['AQI'], ax=ax)
    ax.set_title(f'{column} vs AQI')
    ax.set_xlabel(column)
    ax.set_ylabel('AQI')
fig.tight_layout()
plt.show()


In [None]:
# Label-encode the 'Places' column: every distinct value is replaced by an integer code
from sklearn.preprocessing import LabelEncoder

# The fitted encoder is kept in a variable; 'Places' is overwritten with the codes
Places_encoder = LabelEncoder()
df['Places'] = Places_encoder.fit_transform(df['Places'])

In [None]:
# Pairwise correlation between the columns, shown as a table
df.corr()

In [None]:
# Same correlation table, rendered as an annotated heatmap
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# 'Date' and 'Places' are not used as model inputs, so remove both columns
df = df.drop(['Date', 'Places'], axis=1)

In [None]:
# AQI is the value to predict; every remaining column is an input feature
target_column = 'AQI'
y = df[target_column]
X = df.drop(columns=[target_column])


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees and a fixed seed, fitted on the training split
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the model on the data it was trained on (baseline for the overfitting check)
y_train_pred = model.predict(X_train)

# Compute both metrics against the same predictions
mse_train, r2_train = (
    metric(y_train, y_train_pred) for metric in (mean_squared_error, r2_score)
)

print(f"Training Mean Squared Error: {mse_train}")
print(f"Training R^2 Score: {r2_train}")

In [None]:
# Score the model on the held-out test split
y_test_pred = model.predict(X_test)

r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

print(
    f"Test Mean Squared Error: {mse_test}",
    f"Test R^2 Score: {r2_test}",
    sep="\n",
)


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predict on the held-out split and report MAE, MSE, RMSE and R^2
y_test_pred = model.predict(X_test)

r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)  # RMSE is the square root of the MSE

report_lines = [
    f"Test Mean Absolute Error (MAE): {mae_test}",
    f"Test Mean Squared Error (MSE): {mse_test}",
    f"Test Root Mean Squared Error (RMSE): {rmse_test}",
    f"Test R^2 Score: {r2_test}",
]
print("\n".join(report_lines))

In [None]:
# Heuristic: flag the model when training R^2 beats test R^2 by more than 0.1
# (adjust threshold as needed)
overfit_suspected = r2_train > r2_test + 0.1
print(
    "The model might be overfitting."
    if overfit_suspected
    else "The model seems to generalize well."
)

In [None]:
# Residual = actual - predicted; plotted against the prediction with a zero reference line
residuals = y_test - y_test_pred

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_test_pred, y=residuals, ax=ax)
ax.axhline(0, color='red', linestyle='--')
ax.set_title('Residuals Plot')
ax.set_xlabel('Predicted AQI')
ax.set_ylabel('Residuals')
plt.show()

In [None]:
# Actual vs predicted AQI; points on the dashed diagonal are perfect predictions
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_test_pred, alpha=0.7)

# Diagonal spanning the observed AQI range
aqi_range = [y_test.min(), y_test.max()]
ax.plot(aqi_range, aqi_range, '--k')

ax.set_title('Actual vs Predicted AQI')
ax.set_xlabel('Actual AQI')
ax.set_ylabel('Predicted AQI')
plt.show()


In [None]:
import pickle

# Serialize the fitted model to 'model.pkl' in the working directory
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)


In [None]:
# Read the model back from 'model.pkl'.
# NOTE: unpickling executes code stored in the file - only load pickles you created or trust.
with open('model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
import numpy as np

# Example feature values for one observation. They must follow the column order of
# the training features - presumably SO2, Nox, RSPM, SPM; verify against X.columns.
input_data = np.array([7., 9., 117., 211.])

# predict() expects a 2D input of shape (n_samples, n_features)
input_data_reshaped = input_data.reshape(1, -1)

# The model was fitted on a DataFrame, so scikit-learn recorded the column names.
# Passing a bare ndarray triggers an "X does not have valid feature names" warning
# and hides the expected column order, so label the input with the same names.
# Falls back to the plain array when the estimator carries no feature names.
feature_names = getattr(loaded_model, 'feature_names_in_', None)
if feature_names is not None:
    input_data_reshaped = pd.DataFrame(input_data_reshaped, columns=feature_names)

# Predict AQI for the single example row
prediction = loaded_model.predict(input_data_reshaped)
print(f"Prediction: {prediction}")
