In [None]:
! pip install plotly
! pip install folium
! pip install fuzzywuzzy
! pip install python-Levenshtein
! pip install pycountry-convert


In [None]:
# import pandas for structuring the data
import pandas as pd

# import numpy for numerical analysis
import numpy as np

# import libs for diagrams inline with the text
import matplotlib.pyplot as plt

import seaborn as sns

# other utilities
from sklearn import datasets, preprocessing, metrics

In [None]:
# for visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.io as pio
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import sklearn.metrics as sm

# for diagramming 
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import seaborn as sns

# For serialization and deserialization of data from/to file
import pickle

In [None]:
import folium

In [None]:
# Load the two source CSV files into data frames:
#   df  - the global air pollution data set (per its file name)
#   df2 - the 2017 city-level community-wide emissions data set (per its file name)
air_pollution_csv = r'C:\Users\chz\Documents\BI Exercise\Datasæt\global_air_pollution_dataset.csv'
city_emissions_csv = r"C:\Users\chz\Documents\BI Exercise\Datasæt\2017_-_Cities_Community_Wide_Emissions.csv"
df = pd.read_csv(filepath_or_buffer=air_pollution_csv)
df2 = pd.read_csv(filepath_or_buffer=city_emissions_csv)

In [None]:
# Normalise the join key in both frames (title case, surrounding whitespace
# removed) so that casing/whitespace differences do not prevent matches.
for frame in (df, df2):
    frame['Country'] = frame['Country'].str.title().str.strip()

# Outer-join the two frames on the normalised country name: rows without a
# partner in the other frame are kept and padded with NaN.
# NOTE(review): 'Country' is presumably not unique in either frame, in which
# case this is a many-to-many join that multiplies rows - confirm this is intended.
df_merged = pd.merge(df, df2, on='Country', how='outer')

# Keep a small random sample for eyeballing the merge result. The sample size
# is capped at the number of available rows because DataFrame.sample raises a
# ValueError when asked for more rows than exist.
sampled_df_merged = df_merged.sample(n=min(50, len(df_merged)))


In [None]:
# Number of non-missing entries in every column of the merged frame
df_merged.notna().sum()

In [None]:
# Number of missing entries in every column of the merged frame
df_merged.isna().sum()

In [None]:
# Remove four columns from the merged frame (in place)
columns_to_drop = [
    'Gases included',
    'Comment',
    'Total Scope 1 Emissions (metric tonnes CO2e)',
    'Total Scope 2 Emissions (metric tonnes CO2e)',
]
df_merged.drop(columns=columns_to_drop, inplace=True)




In [None]:
# Reduce 'C40' to the strings 'True' / 'False': a cell becomes 'True' when its
# text form contains "C40"; everything else, including missing values (which
# are first replaced by 'False'), becomes 'False'.
c40_without_gaps = df_merged['C40'].fillna('False')
df_merged['C40'] = [
    'True' if 'C40' in str(cell) else 'False'
    for cell in c40_without_gaps
]


In [None]:
# Keep only the rows that have no missing value in any column
df_merged = df_merged.dropna(axis='index', how='any')

In [None]:
# Sanity check: missing values per column after the clean-up
df_merged.isna().sum()

In [None]:
# Non-missing entries per column of the frame after the clean-up
df_merged.notna().sum()

In [None]:
# Show up to 20 random rows. The size is capped at the number of rows left
# after dropping incomplete rows, because DataFrame.sample raises a ValueError
# when asked for more rows than the frame contains.
df_merged.sample(n=min(20, len(df_merged)))

In [None]:
# Both location columns hold text of the form "(<first>, <second>)".
# Capture the two components and store them as separate latitude / longitude
# columns for the city and for the country.
coordinate_pattern = r'\(([^,]+), ([^)]+)\)'
city_coordinates = df_merged['City Location'].str.extract(coordinate_pattern)
country_coordinates = df_merged['Country Location'].str.extract(coordinate_pattern)
df_merged[['City Latitude', 'City Longitude']] = city_coordinates
df_merged[['Country Latitude', 'Country Longitude']] = country_coordinates

# Quick visual check of the new columns
df_merged.head()

In [None]:
# The extracted coordinates are still text; convert them to numbers.
# errors='coerce' turns anything that cannot be parsed into NaN.
for coordinate_column in ('City Latitude', 'City Longitude',
                          'Country Latitude', 'Country Longitude'):
    df_merged[coordinate_column] = pd.to_numeric(df_merged[coordinate_column], errors='coerce')

In [None]:
# The raw location strings have been split into numeric columns; remove them (in place)
df_merged.drop(columns=['City Location', 'Country Location'], inplace=True)

In [None]:
# Turn 'C40' into a real boolean column. Comparing the text form with 'True'
# works both for the 'True'/'False' strings and for values that are already
# booleans, so running this cell a second time is harmless. (Mapping with
# {'True': True, 'False': False} produced NaN for anything that was not one of
# those two strings - e.g. on a re-run - which made the steps below fail.)
df_merged['C40'] = df_merged['C40'].astype(str).eq('True')

# Indicator columns: 'C40_True' is 1 where 'C40' is True,
# 'C40_False' is 1 where 'C40' is False.
df_merged['C40_True'] = df_merged['C40'].astype(int)
df_merged['C40_False'] = (~df_merged['C40']).astype(int)



In [None]:
# 'C40' is now represented by the two indicator columns; remove the original (in place)
del df_merged['C40']

In [None]:
# From here on `df` refers to the merged frame. This binds a second name to
# the same object (no copy is made), so changes made through `df` are also
# visible through `df_merged`, and the frame originally loaded into `df` is
# no longer reachable under that name.
df = df_merged

In [None]:
# First five rows of the working frame
df.head(n=5)

In [None]:
# Print a summary of the working frame
df.info()

In [None]:
import pycountry_convert as pc

#applying continent to the dataset for future use of folium mapping
def country_to_continent(country_name):
    """Return the continent name for a country name, or None if it is unknown.

    Parameters
    ----------
    country_name : str
        Country name to look up with ``pycountry_convert``.

    Returns
    -------
    str or None
        The continent name, or None when ``country_name`` is not a string or
        one of the lookups does not recognise it.
    """
    # Missing values (None / NaN) and other non-strings can never match a
    # country name, so skip the lookup entirely.
    if not isinstance(country_name, str):
        return None
    try:
        country_alpha2 = pc.country_name_to_country_alpha2(country_name)
        country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        return pc.convert_continent_code_to_continent_name(country_continent_code)
    except (KeyError, TypeError):
        # Only failed lookups are treated as "no match". A bare `except:` also
        # hid unrelated problems (KeyboardInterrupt, a missing import, ...)
        # behind a column full of None.
        return None

# Attach the continent name to every row (None where the lookup found nothing)
df['Continent'] = df['Country'].apply(country_to_continent)

# One sub-frame per continent
continent = df['Continent']
north_american_countries_df = df.loc[continent == 'North America']
south_american_countries_df = df.loc[continent == 'South America']
asian_countries_df = df.loc[continent == 'Asia']
african_countries_df = df.loc[continent == 'Africa']
oceania_countries_df = df.loc[continent == 'Oceania']
Europe_df = df.loc[continent == 'Europe']




In [None]:
# Show up to 20 random rows; the size is capped at the number of rows because
# DataFrame.sample raises a ValueError when asked for more rows than exist.
df.sample(n=min(20, len(df)))

In [None]:
# Correlations can only be computed for numeric columns, so restrict the frame
# to those. 'number' matches every numeric dtype; a ['float64', 'int64'] filter
# silently skips other widths such as int32 (which is what astype(int) yields
# on Windows with NumPy < 2), dropping those columns from the matrix.
numeric_df = df.select_dtypes(include='number')

# Use only rows that are complete across all numeric columns
df_cleaned = numeric_df.dropna()

# Pairwise correlation between the numeric columns
corr_matrix = df_cleaned.corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Grid of pairwise plots for the float64 / int64 columns
numeric_features = df.select_dtypes(include=['float64', 'int64'])
sns.pairplot(data=numeric_features)
plt.show()


In [None]:
# Scatter plot of 'AQI Value' (x) against 'PM2.5 AQI Value' (y)
axes = plt.gca()
axes.scatter(df['AQI Value'], df['PM2.5 AQI Value'], color='green')
axes.set_xlabel('AQI Value')
axes.set_ylabel('PM2.5 AQI Value')
plt.show()

In [None]:
# Distribution of 'AQI Value' as a density-normalised histogram with a KDE
# curve. sns.distplot is deprecated since seaborn 0.11 and scheduled for
# removal; histplot(kde=True, stat='density') is its documented replacement.
sns.histplot(df['AQI Value'], kde=True, stat='density', label='AQI Value')

In [None]:
# Distribution of 'PM2.5 AQI Value' as a density-normalised histogram with a
# KDE curve. sns.distplot is deprecated since seaborn 0.11 and scheduled for
# removal; histplot(kde=True, stat='density') is its documented replacement.
sns.histplot(df['PM2.5 AQI Value'], kde=True, stat='density', label='PM2.5 AQI Value')

In [None]:
# Mean 'AQI Value' per country, sorted ascending for a readable chart
country_aqi_means = df.groupby('Country')['AQI Value'].mean().sort_values()

# Horizontal bar chart: one bar per country
plt.figure(figsize=(15, 25))
plt.barh(y=country_aqi_means.index, width=country_aqi_means.values, color='skyblue')
plt.title('Average AQI Value by Country')
plt.xlabel('Average AQI Value')
plt.ylabel('Country')
plt.tight_layout()  # fit labels and bars into the figure area
plt.show()

In [None]:
# Mean 'PM2.5 AQI Value' per country, sorted ascending for a readable chart
pm25_by_country = df.groupby('Country')['PM2.5 AQI Value']
country_pm25_means = pm25_by_country.mean().sort_values()

# Horizontal bar chart: one bar per country
plt.figure(figsize=(15, 25))
plt.barh(y=country_pm25_means.index, width=country_pm25_means.values, color='skyblue')
plt.title('Average PM2.5 AQI Value by Country')
plt.xlabel('Average PM2.5 AQI Value')
plt.ylabel('Country')
plt.tight_layout()  # fit labels and bars into the figure area
plt.show()


In [None]:
# Regression inputs as single-column 2-D arrays of shape (n_rows, 1):
# X holds 'AQI Value' (predictor), y holds 'PM2.5 AQI Value' (target).
X = df['AQI Value'].to_numpy()[:, np.newaxis]
y = df['PM2.5 AQI Value'].to_numpy()[:, np.newaxis]

In [None]:
# Scatter plot of every (X, y) pair before splitting into train and test
axes = plt.gca()
axes.scatter(X, y, color='blue')
axes.set_xlabel('AQI Value')
axes.set_ylabel('PM2.5 AQI Value')
plt.show()

In [None]:
# Hold out 15 % of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=123,
)

In [None]:
# Shapes of the four subsets, printed in the order X_train, y_train, X_test, y_test
for subset in (X_train, y_train, X_test, y_test):
    print(subset.shape)

In [None]:
# Unfitted ordinary least-squares linear regression model
myreg = linear_model.LinearRegression()

In [None]:
# Fit the model on the training split. fit() returns the estimator itself,
# so the cell output shows the fitted model.
myreg.fit(X=X_train, y=y_train)

In [None]:
# Parameters of the fitted line y = a * x + b
a, b = myreg.coef_, myreg.intercept_

In [None]:
# Display the fitted coefficient(s) (myreg.coef_)
a

In [None]:
# Display the fitted intercept (myreg.intercept_)
b

In [None]:
# Model predictions for the held-out inputs, displayed as the cell output
y_predicted = myreg.predict(X=X_test)
y_predicted

In [None]:
# Display the actual target values of the test split, for comparison with the predictions
y_test

In [None]:
# Visualise the fitted regression line on top of the data
plt.title('Linear Regression')
plt.scatter(X, y, color='green')                 # all observations
plt.plot(X_train, a*X_train + b, color='blue')   # fitted line evaluated on the training inputs
plt.plot(X_test, y_predicted, color='orange')    # model predictions for the test inputs
# The axes show X ('AQI Value') and y ('PM2.5 AQI Value'); the previous labels
# 'length' and 'age' did not correspond to any variable in this notebook.
plt.xlabel('AQI Value')
plt.ylabel('PM2.5 AQI Value')
plt.show()


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the fitted model on the held-out test split
y_pred = myreg.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Coefficient of determination and root mean squared error
print("R^2: ", test_r2)
print("RMSE: ", test_rmse)

In [None]:
# Interactive scatter plot of 'AQI Value' against 'PM2.5 AQI Value',
# with the points coloured by 'Country'
fig = px.scatter(
    data_frame=df,
    x='AQI Value',
    y='PM2.5 AQI Value',
    color='Country',
    title='AQI Value vs PM2.5 AQI Value',
)
fig.show()


In [None]:
# Divide the rows into 5 clusters with k-means, based on 'AQI Value' and
# 'PM2.5 AQI Value', and store each row's cluster label in df['cluster'].
from sklearn.cluster import KMeans

cluster_features = df[['AQI Value', 'PM2.5 AQI Value']]

# k-means starts from random centroids; without a fixed random_state the
# cluster labels (and therefore the colours in later plots) change from run
# to run.
kmeans = KMeans(n_clusters=5, random_state=42)
df['cluster'] = kmeans.fit_predict(cluster_features)

# Show up to 10 random rows (capped so small frames do not raise)
df.sample(n=min(10, len(df)))



In [None]:
# Interactive scatter plot of 'AQI Value' against 'PM2.5 AQI Value',
# with the points coloured by their 'cluster' label
fig = px.scatter(
    data_frame=df,
    x='AQI Value',
    y='PM2.5 AQI Value',
    color='cluster',
    title='AQI Value vs PM2.5 AQI Value',
)
fig.show()


In [None]:
cluster_columns = ['AQI Value', 'PM2.5 AQI Value']

# `df_filtered` was not defined anywhere before this cell, so the cell raised
# a NameError on a fresh kernel. Build it here as an independent copy of the
# rows of `df` that have both clustering features present.
# NOTE(review): if a different filter (e.g. outlier removal) was intended,
# apply it here.
df_filtered = df.dropna(subset=cluster_columns).copy()

# Fit the model and store each row's cluster label
kmeans = KMeans(n_clusters=5, random_state=42).fit(df_filtered[cluster_columns])
df_filtered['cluster'] = kmeans.labels_

# Cluster centres, one row per cluster, columns in the order of `cluster_columns`
centroids = kmeans.cluster_centers_
print("Centroids:\n", centroids)

# Plot the clustered points and overlay the centroids in red
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df_filtered, x='AQI Value', y='PM2.5 AQI Value', hue='cluster', palette='viridis')
plt.scatter(centroids[:, 0], centroids[:, 1], s=100, c='red', label='Centroids')
plt.legend()
plt.show()


In [None]:
# Sum of 'Population' per country.
# NOTE(review): rows are summed as they are; if the same city occurs in
# several rows of `df`, its population is counted several times - confirm.
population_data = df.loc[:, ['Country', 'Population']]
population_by_country = population_data.groupby('Country', as_index=False)['Population'].sum()

# One-column frame indexed by country, which is the layout the heatmap needs
pivot_population = population_by_country.set_index('Country')

# Single-column heatmap: one annotated cell per country
plt.figure(figsize=(12, 8))
sns.heatmap(
    data=pivot_population,
    cmap='YlGnBu',
    annot=True,
    fmt=',.0f',
    linewidths=.5,
)
plt.title('Population by Country')
plt.xlabel('population')
plt.ylabel('Country')
plt.show()

In [None]:
# 'Country Location' has already been split into the numeric columns
# 'Country Latitude' / 'Country Longitude' and then dropped, so parsing it
# again here raised a KeyError. Reuse the numeric columns instead; 'Latitude'
# and 'Longitude' are still created for any code that expects them.
df['Latitude'] = df['Country Latitude']
df['Longitude'] = df['Country Longitude']

# 3D scatter plot: country position on the x/y axes, population on the z axis
scatter_plot = go.Scatter3d(
    x=df['Longitude'],
    y=df['Latitude'],
    # The plot title and z-axis title say 'Population', so plot the population
    # itself rather than 'Population year' (the year the figure refers to).
    z=df['Population'],
    mode='markers',
    marker=dict(
        size=5,
        color='blue',   # one fixed colour for all markers
        opacity=0.8
    )
)

# Titles for the figure and its three axes
layout = go.Layout(
    title='3D Population Map',
    scene=dict(
        xaxis=dict(title='Country Longitude'),
        yaxis=dict(title='Country Latitude'),
        zaxis=dict(title='Population')
    )
)

# Combine data and layout into a figure and show it
fig = go.Figure(data=[scatter_plot], layout=layout)
fig.show()

In [None]:
import folium
import json

# Load the country outlines (GeoJSON) from a local file
with open(r"C:\Users\chz\Documents\BI Exercise\Datasæt\custom.geo.json", 'r', encoding='utf-8') as f:
    geojson_data = json.load(f)

# Centre the map on the mean country position found in the data
m = folium.Map(location=[df['Country Latitude'].mean(), df['Country Longitude'].mean()], zoom_start=3)

# `df` has many rows per country, all with the same country position, so one
# marker per row stacks large numbers of identical markers on top of each
# other and bloats the saved HTML. Use one marker per distinct
# (country, position) instead, and skip rows without a usable position
# because folium.Marker rejects NaN coordinates.
country_points = (
    df[['Country', 'Country Latitude', 'Country Longitude']]
    .dropna(subset=['Country Latitude', 'Country Longitude'])
    .drop_duplicates()
)
for country, latitude, longitude in country_points.itertuples(index=False, name=None):
    folium.Marker([latitude, longitude], popup=country).add_to(m)

# Overlay the country polygons from the GeoJSON data
folium.GeoJson(data=geojson_data).add_to(m)

# Write the map to an HTML file next to the notebook
m.save('map.html')


In [None]:
# Display the folium map as the cell output
m