In [None]:
import pandas as pd
import os
from scipy import stats

csv_files = [
    "AQI.csv", "Climate.csv", "ForestCover.csv", "Population.csv",
    "SoilMoisture.csv", "Waste.csv", "WaterBodies.csv"
]
dataframes = []
for file in csv_files:
    try:
        if not os.path.exists(file):
            print(f"Warning: File '{file}' not found. Skipping this file.")
            continue
        
        df = pd.read_csv(file)
        if 'District' not in df.columns:
            print(f"Warning: 'District' column not found in {file}. Skipping this file.")
            continue
        
        df['District'] = df['District'].astype(str)
        
        dataframes.append(df)
        print(f"Successfully loaded {file}")
    except Exception as e:
        print(f"Error loading {file}: {str(e)}")

if not dataframes:
    print("Error: No valid CSV files were loaded. Exiting.")
    exit(1)

merged_df = dataframes[0]
for df in dataframes[1:]:
    merged_df = pd.merge(merged_df, df, on='District', how='outer')

print(f"Merged data shape: {merged_df.shape}")

def fill_missing(column):
    if column.dtype == 'object':
        return column.fillna(column.mode().iloc[0] if not column.mode().empty else "Unknown")
    else:
        return column.fillna(column.mean())

for col in merged_df.columns:
    if col != 'District':
        merged_df[col] = fill_missing(merged_df[col])

try:
    merged_df.to_csv("raw.csv", index=False)
    print("Merged CSV file 'raw.csv' has been created successfully.")
    print(f"Total rows: {len(merged_df)}, Total columns: {len(merged_df.columns)}")
except Exception as e:
    print(f"Error saving merged CSV: {str(e)}")

missing_info = merged_df.isnull().sum()
if missing_info.sum() > 0:
    print("\nColumns with missing values after filling:")
    print(missing_info[missing_info > 0])
else:
    print("\nNo missing values in the final dataset.")

In [None]:
import pandas as pd
from functools import reduce

file_names = ["AQI.csv", "Climate.csv", "ForestCover.csv", "Population.csv", 
              "SoilMoisture.csv", "Waste.csv", "WaterBodies.csv"]

aqi_df = pd.read_csv("AQI.csv")
climate_df = pd.read_csv("Climate.csv")
forest_df = pd.read_csv("ForestCover.csv")
population_df = pd.read_csv("Population.csv")
soil_df = pd.read_csv("SoilMoisture.csv")
waste_df = pd.read_csv("Waste.csv")
water_df = pd.read_csv("WaterBodies.csv")

dfs = [aqi_df, climate_df, forest_df, population_df, soil_df, waste_df, water_df]

merged_df = reduce(lambda left, right: pd.merge(left, right, on="District", how="outer"), dfs)

for column in merged_df.columns:
    if merged_df[column].dtype == "object":  
        mode_value = merged_df[column].mode()[0] if not merged_df[column].mode().empty else None
        merged_df[column].fillna(mode_value, inplace=True)
    else:  
        mean_value = merged_df[column].mean()
        merged_df[column].fillna(mean_value, inplace=True)

merged_df.to_csv("raw.csv", index=False)

print("Merged CSV file 'raw.csv' has been created.")


In [1]:
import pandas as pd

file_names = ["AQI.csv", "Climate.csv", "ForestCover.csv", "Population.csv", 
              "SoilMoisture.csv", "Waste.csv", "WaterBodies.csv"]

aqi_df = pd.read_csv("AQI.csv")
climate_df = pd.read_csv("Climate.csv")
forest_df = pd.read_csv("ForestCover.csv")
population_df = pd.read_csv("Population.csv")
soil_df = pd.read_csv("SoilMoisture.csv")
waste_df = pd.read_csv("Waste.csv")
water_df = pd.read_csv("WaterBodies.csv")

dfs = [aqi_df, climate_df, forest_df, population_df, soil_df, waste_df, water_df]
for df in dfs:
    df['District'] = df['District'].astype(str)

merged_df = pd.concat(dfs, ignore_index=True).groupby("District", as_index=False).first()

for column in merged_df.columns:
    if merged_df[column].dtype == "object": 
        mode_value = merged_df[column].mode()[0] if not merged_df[column].mode().empty else None
        merged_df[column].fillna(mode_value, inplace=True)
    else: 
        mean_value = merged_df[column].mean()
        merged_df[column].fillna(mean_value, inplace=True)

merged_df.to_csv("raw.csv", index=False)

print("Merged CSV file 'raw.csv' has been created.")


Merged CSV file 'raw.csv' has been created.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df[column].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df[column].fillna(mean_value, inplace=True)
