In [None]:
import pandas as pd
import numpy as np
import datetime
from datetime import date, timedelta
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Shorthand for pandas' IndexSlice helper (not referenced by any cell visible in this notebook).
idx=pd.IndexSlice
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — confirm that is intended.
warnings.filterwarnings("ignore")
# Show all columns when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)
# Apply matplotlib's 'ggplot' style sheet to every figure drawn below.
plt.style.use('ggplot')

In [None]:
data = pd.read_csv('AQI and Lat Long of Countries.csv')

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data.isna().sum()

In [None]:
data.loc[data.isna().any(axis=1)]

In [None]:
# Discard every row that contains at least one missing value.
data = data.dropna()

In [None]:
data.duplicated().sum()

In [None]:
data.duplicated(subset='City').sum()

In [None]:
data.loc[data.duplicated(subset='City')]

In [None]:
data = data.drop_duplicates(subset='City')

In [None]:
# Renumber the rows 0..n-1 after the removals above; the old index is not kept.
data = data.reset_index(drop=True)

In [None]:
# IN Data analysis 

In [None]:
# Number of rows per country, restricted to the 20 most frequent countries.
# The column names are pinned with rename_axis/reset_index(name=...) because the
# defaults produced by value_counts().reset_index() differ between pandas
# versions ('index'/'Country' before 2.0, 'Country'/'count' from 2.0 on).
df = (
    data['Country']
    .value_counts()
    .sort_values(ascending=False)
    .head(20)
    .rename_axis('Country')
    .reset_index(name='Count')
)
plt.figure(figsize=(16, 8))
sns.barplot(data=df, y='Country', x='Count')
plt.title('Top 20 Countries Value Counts in Data');

In [None]:
# One longitude/latitude scatter per category column, points coloured by that category.
for category_col in ['AQI Category', 'CO AQI Category', 'Ozone AQI Category', 'NO2 AQI Category', 'PM2.5 AQI Category']:
    plt.figure(figsize=(16, 8))
    # x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
    # positionally, so the former positional call raises a TypeError there.
    sns.scatterplot(data=data, x='lng', y='lat', hue=category_col)
    plt.title(f'Map Distribution of {category_col}')

In [None]:
# Frequency table of every AQI category column, most frequent label first

category_columns = [
    'AQI Category',
    'CO AQI Category',
    'Ozone AQI Category',
    'NO2 AQI Category',
    'PM2.5 AQI Category',
]
for column in category_columns:
    frequencies = data[column].value_counts().sort_values(ascending=False)
    print(frequencies)
    # Visual separator between two tables
    print('-------')
    print('')

In [None]:
# Distribution of every non-object column, leaving out the coordinates
for column in data.columns:
    if column in ('lat', 'lng') or data[column].dtype == 'O':
        continue
    sns.histplot(data[column])
    plt.show()

In [None]:
# Per-column totals by country: the 15 countries with the largest sum of each value
for column in data.columns:
    if column in ('lat', 'lng') or data[column].dtype == 'O':
        continue
    country_totals = data.groupby('Country')[column].sum()
    df = country_totals.sort_values(ascending=False).reset_index()[:15]
    plt.figure(figsize=(16,8))
    sns.barplot(data=df, x='Country', y=column)
    plt.xticks(rotation=45)
    plt.title(f"Top 15 Countries with {column}")

In [None]:
#Above visualization in pie chart

In [None]:
# Pie chart per non-object column (coordinates excluded) showing how the summed
# value is split between the five countries with the largest totals.
for column in data.columns:
    if column in ('lat', 'lng') or data[column].dtype == 'O':
        continue
    df = data.groupby('Country')[column].sum().sort_values(ascending=False).reset_index()[:5]
    if df.empty:
        # Nothing to draw for this column.
        continue

    fig, ax = plt.subplots(figsize=(4, 4))

    ax.pie(
        x=df[column],
        startangle=90,
        labels=df['Country'],
        autopct="%.0f%%",
        shadow=True,
        # Pull out only the leading wedge. The list is sized from the data so
        # that fewer than five countries no longer raises a length mismatch.
        explode=[0.1] + [0] * (len(df) - 1)
        )

    ax.set_title(f"Top 5 Country of {column} Distribution", fontsize=12)

In [None]:
# The 20 cities with the highest reading for each non-object column (coordinates excluded)
for column in data.columns:
    if column in ('lat', 'lng') or data[column].dtype == 'O':
        continue
    city_values = data.loc[:, ['City', column]]
    df = city_values.sort_values(column, ascending=False)[:20]
    plt.figure(figsize=(16, 4))
    sns.barplot(data=df, x=column, y='City')
    # Dashed reference line at the mean of the whole column, not just the top 20
    overall_mean = data[column].mean()
    plt.axvline(overall_mean, ls='--', color='black', alpha=0.3)
    plt.legend([f'Avg {column}'])
    plt.title(f'Top 20 City with Highest {column}')

In [None]:
#AQI Value Analysis

In [None]:
# Distribution of the overall AQI value with a KDE overlay.
# Reads the column from `data`: the `AQI` frame is only created in a later
# cell, so referencing it here fails when the notebook is run top to bottom.
sns.histplot(data['AQI Value'], kde=True)

In [None]:
#Function for Value Distribution

def AQI_value(data, feature):
    """Label every value of ``data[feature]`` with the AQI range it falls in.

    Parameters
    ----------
    data : DataFrame or other mapping
        Indexing it with ``feature`` must give an iterable of numbers.
    feature : hashable
        Key of the column to bin.

    Returns
    -------
    list of str
        One label per value, in input order: '0-50', '51-100', '101-150',
        '151-200', '201-300' or '301-500'.

    Notes
    -----
    Each band is closed on its upper bound, so a fractional value lying
    between two integer bands (e.g. 50.5) goes to the next band up instead of
    dropping through to '301-500'. Values that are not ``>= 0`` (negative
    numbers, NaN) still receive the final label, as before.
    """
    bands = (
        (50, '0-50'),
        (100, '51-100'),
        (150, '101-150'),
        (200, '151-200'),
        (300, '201-300'),
    )
    overflow = '301-500'

    blanks = []
    for x in data[feature]:
        label = overflow
        if x >= 0:
            # First band whose upper bound is not exceeded wins.
            for upper, name in bands:
                if x <= upper:
                    label = name
                    break
        blanks.append(label)

    return blanks

In [None]:
# Frame restricted to the overall-AQI columns, extended with the range label of each value.
aqi_columns = ['Country', 'City', 'AQI Value', 'AQI Category']
AQI = data.loc[:, aqi_columns]
aqi_range_labels = AQI_value(AQI, 'AQI Value')
AQI['AQI Category Index'] = aqi_range_labels

In [None]:
AQI

In [None]:
def AQI_category(data, feature):
    """Translate AQI range labels in ``data[feature]`` into category names.

    Parameters
    ----------
    data : DataFrame or other mapping
        Indexing it with ``feature`` must give an iterable of range labels.
    feature : hashable
        Key of the column holding the range labels.

    Returns
    -------
    list of str
        One category name per label, in input order. Any label other than the
        five listed ranges is reported as 'Hazardous'.
    """
    names_by_range = {
        '0-50': 'Good',
        '51-100': 'Moderate',
        '101-150': 'Unhealthy for Sensitive Groups',
        '151-200': 'Unhealthy',
        '201-300': 'Very Unhealthy',
    }
    return [names_by_range.get(label, 'Hazardous') for label in data[feature]]

In [None]:
# Overwrite the 'AQI Category' column copied from the dataset with the category
# names derived from the 'AQI Category Index' range labels.
AQI['AQI Category'] = AQI_category(AQI, 'AQI Category Index')

In [None]:
# Number of rows in each AQI category.
# Column names are set explicitly because the defaults produced by
# value_counts().reset_index() changed in pandas 2.0 (there is no 'index'
# column any more), which broke the previous y='index' reference.
df = (
    AQI['AQI Category']
    .value_counts()
    .rename_axis('AQI Category')
    .reset_index(name='Count')
)
plt.figure(figsize=(16,4))
sns.barplot(data=df, y='AQI Category', x='Count')
plt.title('AQI Category Value Count', fontsize=12)

In [None]:
# Mean, total and maximum AQI value per country, ordered by the total (largest first).
AQI_Country = (
    AQI.groupby('Country')['AQI Value']
    .agg(Avg_AQI='mean', Overall_AQI='sum', Max_AQI='max')
    .sort_values('Overall_AQI', ascending=False)
)

AQI_Country

In [None]:
# Bar chart of the highest-ranking countries for each AQI_Country statistic.
# The number of bars and the title share one constant; previously the title
# announced 15 countries while only 10 were plotted.
top_n = 10
for stat in AQI_Country.columns:
    plt.figure(figsize=(16, 8))
    ranked = AQI_Country.loc[:, [stat]].reset_index().sort_values(stat, ascending=False)
    sns.barplot(
        data=ranked[:top_n],
        x='Country',
        y=stat)
    plt.xticks(rotation=45)
    plt.title(f'Top {top_n} Countries with Highest {stat}')
    

In [None]:
# CO AQI Value analysis

In [None]:
data['CO AQI Value'].unique()

In [None]:
data['CO AQI Category'].unique()

In [None]:
# Frame restricted to the location and carbon-monoxide columns.
co_columns = ['Country', 'City', 'CO AQI Value', 'CO AQI Category']
CO_AQI = data.loc[:, co_columns]

CO_AQI

In [None]:
sns.histplot(CO_AQI['CO AQI Value'], bins=80)

In [None]:
def CO(data, feature):
    """Label every value of ``data[feature]`` with the CO AQI range it falls in.

    Parameters
    ----------
    data : DataFrame or other mapping
        Indexing it with ``feature`` must give an iterable of numbers.
    feature : hashable
        Key of the column to bin.

    Returns
    -------
    list of str
        One label per value, in input order: '0-20', '21-50', '51-100' or
        '100+'.

    Notes
    -----
    Each band is closed on its upper bound, so a fractional value lying
    between two integer bands (e.g. 20.5) goes to the next band up instead of
    dropping through to '100+'. Values that are not ``>= 0`` (negative
    numbers, NaN) still receive '100+', as before.
    """
    bands = (
        (20, '0-20'),
        (50, '21-50'),
        (100, '51-100'),
    )
    overflow = '100+'

    blanks = []
    for x in data[feature]:
        label = overflow
        if x >= 0:
            # First band whose upper bound is not exceeded wins.
            for upper, name in bands:
                if x <= upper:
                    label = name
                    break
        blanks.append(label)
    return blanks

In [None]:
CO_AQI['CO AQI Value Index'] = CO(data, 'CO AQI Value')

In [None]:
CO_AQI['CO AQI Value Index'].value_counts()

In [None]:
CO_AQI['CO AQI Category'].value_counts()

In [None]:
# Total, mean and maximum CO AQI value per country, ordered by the total (largest first).
CO_AQI_Country = (
    CO_AQI.groupby('Country')['CO AQI Value']
    .agg(Overall_CO_AQI='sum', Avg_CO_AQI='mean', Max_CO_AQI='max')
    .sort_values('Overall_CO_AQI', ascending=False)
)

CO_AQI_Country

In [None]:
# Bar chart of the highest-ranking countries for each CO_AQI_Country statistic.
# The number of bars and the title share one constant; previously the title
# announced 15 countries while only 10 were plotted.
top_n = 10
for stat in CO_AQI_Country.columns:
    plt.figure(figsize=(16, 8))
    ranked = CO_AQI_Country.loc[:, [stat]].reset_index().sort_values(stat, ascending=False)
    sns.barplot(
        data=ranked[:top_n],
        x='Country',
        y=stat)
    plt.xticks(rotation=45)
    plt.title(f'Top {top_n} Countries with Highest {stat}')

In [None]:
# Ozone AQI Value analysis

In [None]:
# Frame restricted to the location and ozone columns.
ozone_columns = ['Country', 'City', 'Ozone AQI Value', 'Ozone AQI Category']
Ozone_AQI = data.loc[:, ozone_columns]

Ozone_AQI

In [None]:
sns.histplot(Ozone_AQI['Ozone AQI Value'])

In [None]:
Ozone_AQI['Ozone AQI Value'].max()

In [None]:
def Ozone_value(data, feature):
    """Label every value of ``data[feature]`` with the ozone AQI range it falls in.

    Parameters
    ----------
    data : DataFrame or other mapping
        Indexing it with ``feature`` must give an iterable of numbers.
    feature : hashable
        Key of the column to bin.

    Returns
    -------
    list of str
        One label per value, in input order: '0-50', '51-100', '101-150',
        '151-200' or '200+'.

    Notes
    -----
    Each band is closed on its upper bound, so a fractional value lying
    between two integer bands (e.g. 50.5) goes to the next band up instead of
    dropping through to '200+'. Values that are not ``>= 0`` (negative
    numbers, NaN) still receive '200+', as before.
    """
    bands = (
        (50, '0-50'),
        (100, '51-100'),
        (150, '101-150'),
        (200, '151-200'),
    )
    overflow = '200+'

    blanks = []
    for x in data[feature]:
        label = overflow
        if x >= 0:
            # First band whose upper bound is not exceeded wins.
            for upper, name in bands:
                if x <= upper:
                    label = name
                    break
        blanks.append(label)

    return blanks

In [None]:
Ozone_AQI['Ozone AQI Value Index'] = Ozone_value(data, 'Ozone AQI Value')

In [None]:
df=Ozone_AQI['Ozone AQI Value Index'].value_counts().reset_index()

df

In [None]:
# fig, ax = plt.subplots()

# ax.pie(
#     x=df['Ozone AQI Value Index'],
#     startangle=90,
#     labels=df['index'],
#     autopct="%.0f%%",
#     shadow=True,
#     explode=[0.1, 0, 0, 0, 0]
#             );

In [None]:
# Number of rows in each ozone AQI category.
# Column names are set explicitly because the defaults produced by
# value_counts().reset_index() changed in pandas 2.0 (there is no 'index'
# column any more), which broke the previous y='index' reference.
df = (
    Ozone_AQI['Ozone AQI Category']
    .value_counts()
    .rename_axis('Ozone AQI Category')
    .reset_index(name='Count')
)
plt.figure(figsize=(16,4))
sns.barplot(data=df, y='Ozone AQI Category', x='Count')
plt.title("Ozone AQI Category Value Counts")

In [None]:
# Total, mean and maximum ozone AQI value per country, ordered by the total (largest first).
Ozone_AQI_Country = (
    Ozone_AQI.groupby('Country')['Ozone AQI Value']
    .agg(Overall_Ozone_AQI='sum', Avg_Ozone_AQI='mean', Max_Ozone_AQI='max')
    .sort_values('Overall_Ozone_AQI', ascending=False)
)

Ozone_AQI_Country

In [None]:
# Bar chart of the highest-ranking countries for each Ozone_AQI_Country statistic.
# The number of bars and the title share one constant; previously the title
# announced 15 countries while only 10 were plotted.
top_n = 10
for stat in Ozone_AQI_Country.columns:
    plt.figure(figsize=(16, 8))
    ranked = Ozone_AQI_Country.loc[:, [stat]].reset_index().sort_values(stat, ascending=False)
    sns.barplot(
        data=ranked[:top_n],
        x='Country',
        y=stat)
    plt.xticks(rotation=45)
    plt.title(f'Top {top_n} Countries with Highest {stat}')

In [None]:
# A similar analysis can be conducted for the other values, such as ['NO2 AQI Value', 'PM2.5 AQI Value']

In [None]:
# Each category value_counts for each country
def category_count(data, feature):
    """Count, per country, how many rows carry each value of ``feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Country' column and the ``feature`` column.
    feature : str
        Name of the column whose values are counted.

    Returns
    -------
    pandas.DataFrame
        Indexed by country, one column per distinct value of ``feature``;
        combinations that never occur are NaN.
    """
    per_country = data.groupby('Country')[feature]
    counts = per_country.value_counts()
    return counts.unstack()

In [None]:
category_count(data, 'AQI Category')

In [None]:
category_count(data, 'CO AQI Category')

In [None]:
category_count(data, 'Ozone AQI Category')

In [None]:
category_count(data, 'NO2 AQI Category')

In [None]:
category_count(data, 'PM2.5 AQI Category')

In [None]:
#Avg Value for each AQI Value

def avg_AQI(data):
    """Return the per-country mean of every numeric column except the coordinates.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Country' column; 'lat' and 'lng' are ignored if present.

    Returns
    -------
    pandas.DataFrame
        Indexed by country, one column per averaged numeric column, in the
        order those columns appear in ``data``.

    Notes
    -----
    The means are computed in a single grouped aggregation. The previous
    implementation built the result with ``DataFrame.append`` inside a loop,
    a method that no longer exists in pandas 2.0.
    """
    value_columns = [
        column
        for column in data.select_dtypes(include='number').columns
        if column not in ('lat', 'lng')
    ]
    return data.groupby('Country')[value_columns].mean()

In [None]:
avg_AQI_country = avg_AQI(data)

avg_AQI_country

In [None]:
#Correlation

In [None]:
# Correlation of every numeric column with the overall AQI value, strongest first.
# Non-numeric columns are dropped explicitly: from pandas 2.0 on, corr() no
# longer skips them silently and raises on the string columns instead.
data.select_dtypes(include='number').corr()['AQI Value'].sort_values(ascending=False)

In [None]:
sns.pairplot(data)

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
# Non-numeric columns are dropped explicitly: from pandas 2.0 on, corr() no
# longer skips them silently and raises on the string columns instead.
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True)