<a href="https://colab.research.google.com/github/anmar36a/Data-Analysis/blob/main/Predict_Air_Quality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Load main libraries
import os
from google.colab import drive

import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

## Load dataset

In [2]:
drive.mount("/content/drive")
data_dir = "/content/drive/My Drive/Codigo/predict aqi"

df = pd.read_csv(f"{data_dir}/AQI-and-Lat-Long-of-Countries.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Display dataset characteristics
print(f'The dataset has {df.shape} rows and columns')
print(f'The columns are: {df.columns.tolist()}')
print(f'Some statistical values:\n{df.describe()}')
print(f'First rows of the dataset:\n{df.head()}')

The dataset has (16695, 7) rows and columns
The columns are: ['AQI Value', 'CO AQI Value', 'Ozone AQI Value', 'NO2 AQI Value', 'PM2.5 AQI Value', 'lat', 'lng']
Some statistical values:
          AQI Value  CO AQI Value  Ozone AQI Value  NO2 AQI Value  \
count  16695.000000  16695.000000     16695.000000   16695.000000   
mean      62.998682      1.342138        31.767355       3.819647   
std       43.091971      2.371379        22.839343       5.880677   
min        7.000000      0.000000         0.000000       0.000000   
25%       38.500000      1.000000        20.000000       0.000000   
50%       52.000000      1.000000        29.000000       2.000000   
75%       69.000000      1.000000        38.000000       5.000000   
max      500.000000    133.000000       222.000000      91.000000   

       PM2.5 AQI Value           lat           lng  
count     16695.000000  16695.000000  16695.000000  
mean         59.821324     30.267148     -3.944485  
std          43.208298     22.9473

In [4]:
print(f'NULL values: \n {df.isnull().sum()}')
print(f'NA values: \n {df.isna().sum()}')
print(f'INF values: \n {np.isinf(df).any()}')

NULL values: 
 AQI Value          0
CO AQI Value       0
Ozone AQI Value    0
NO2 AQI Value      0
PM2.5 AQI Value    0
lat                0
lng                0
dtype: int64
NA values: 
 AQI Value          0
CO AQI Value       0
Ozone AQI Value    0
NO2 AQI Value      0
PM2.5 AQI Value    0
lat                0
lng                0
dtype: int64
INF values: 
 AQI Value          False
CO AQI Value       False
Ozone AQI Value    False
NO2 AQI Value      False
PM2.5 AQI Value    False
lat                False
lng                False
dtype: bool


# Model Training

In [5]:
# Prepare the dataset for training the model

# Divide dataset into groups for training and testing. The variable to predict is AQUI Value.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(["AQI Value"], axis = 1), df["AQI Value"], test_size=0.33, random_state=42)

print('The resulting datasets have the following shape: ')
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

The resulting datasets have the following shape: 
(11185, 6)
(11185,)
(5510, 6)
(5510,)


In [6]:
# Prepare Forest Regressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

model1 = RandomForestRegressor()

In [7]:
# Train the model
model1.fit(X_train, y_train)

# Check scores
m1_score = model1.score(X_test, y_test)
print(f"The score on the training dataset is: {model1.score(X_train, y_train)}")
print(f"The score on the test dataset is: {m1_score}")

The score on the training dataset is: 0.9997237087745328
The score on the test dataset is: 0.9979556580430697


In [8]:
# Prepare and train Ada Boost Regressor
from sklearn.ensemble import AdaBoostRegressor

model2 = AdaBoostRegressor()

# Train the model
model2.fit(X_train, y_train)

# Check scores
m2_score = model2.score(X_test, y_test)
print(f"The score on the training dataset is: {model2.score(X_train, y_train)}")
print(f"The score on the test dataset is: {m2_score}")

The score on the training dataset is: 0.9120675344276582
The score on the test dataset is: 0.9103198025018331


In [9]:
# Display best model
if m1_score > m2_score:
  best_model = "Random Forest Regressor"
else:
  best_model = "Ada Boost Regressor"

print(f"The model with the highest accuracy is {best_model} with an accuracy "
  f"{round(abs(m1_score - m2_score), 4) * 100}% higher.")

The model with the highest accuracy is Random Forest Regressor with an accuracy 8.76% higher.


# Adresses extraction

In [11]:
# Get the address of a given coordinates
import geopy
from geopy.geocoders import Nominatim

rand_int = np.random.randint(0,len(df))
geolocator = Nominatim(user_agent="Andres")

coordinates_test = df.loc[rand_int, ["lat", "lng"]]
print(f"The info obtained from the geolocator is: \n{geolocator.reverse(coordinates_test).raw}")

def getAddress(coordinates):
  """Get a general address for a given coordinates"""
  location = geolocator.reverse(coordinates)

  area_type = location.raw["type"]
  try:  town = location.raw["address"]["town"]
  except: town = location.raw["address"]["state"]
  country = location.raw["address"]["country"]

  str_address = (f'{area_type} area in {town}, {country}')

  return str_address

print(getAddress(df.loc[rand_int, ["lat", "lng"]]))

The info obtained from the geolocator is: 
{'place_id': 223808794, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'way', 'osm_id': 62190680, 'lat': '55.033290050000005', 'lon': '82.91656361134753', 'class': 'building', 'type': 'apartments', 'place_rank': 30, 'importance': 9.99999999995449e-06, 'addresstype': 'building', 'name': '', 'display_name': '4, Потанинская улица, Депутатский, Центральный район, Новосибирск, городской округ Новосибирск, Новосибирская область, Сибирский федеральный округ, 630099, Россия', 'address': {'house_number': '4', 'road': 'Потанинская улица', 'suburb': 'Депутатский', 'city_district': 'Центральный район', 'city': 'Новосибирск', 'county': 'городской округ Новосибирск', 'state': 'Новосибирская область', 'ISO3166-2-lvl4': 'RU-NVS', 'region': 'Сибирский федеральный округ', 'postcode': '630099', 'country': 'Россия', 'country_code': 'ru'}, 'boundingbox': ['55.0331815', '55.0334096', '82.9161502', '82.9169888']}
apar

In [12]:
# Display zone, town and country of the place with the most dangerous air
high_idx = df["AQI Value"].idxmax()
high_value = max(df["AQI Value"])
high_coordinates = df.loc[high_value, ["lat", "lng"]].tolist()

print(f"Values over 200 are considered hazardous.\n"
f"The place with the most dangerous air in the dataset, "
f"with an AQI value of {high_value} is {getAddress(high_coordinates)}.")

Values over 200 are considered hazardous.
The place with the most dangerous air in the dataset, with an AQI value of 500 is tertiary area in Tamil Nadu, India.


In [13]:
# Display zone, towns and countries of the 10 places with the healthiest air
sorted_df = df.sort_values(by="AQI Value").reset_index()

print("The places with the best air quality for human health in the dataset are:\n")
for i in range(10):
  # Loop to avoid connection warnings
  while True:
    try:
      aqi_value = sorted_df.at[i, "AQI Value"]
      coordinates = sorted_df.loc[i , ["lat", "lng"]].tolist()
    except:
      continue
    break

  address = getAddress(coordinates)
  print(f'{i+1}. {address}, with a value of {aqi_value}.')

The places with the best air quality for human health in the dataset are:

1. unclassified area in Morona Santiago, Ecuador, with a value of 7.
2. aerodrome area in Hela, Papua Niugini, with a value of 8.
3. residential area in Azogues, Ecuador, with a value of 8.
4. residential area in Ancash, Perú, with a value of 9.
5. unclassified area in Papua Barat, Indonesia, with a value of 10.
6. place_of_worship area in Nueva Loja, Ecuador, with a value of 10.
7. pharmacy area in Huancavelica, Perú, with a value of 10.
8. residential area in Correntina, Brasil, with a value of 11.
9. residential area in Huamachuco, Perú, with a value of 11.
10. house area in Chubut, Argentina, with a value of 11.


In [14]:
# Display zone, towns and countries of the 10 places with the most dangerous air
sorted_df = df.sort_values(by="AQI Value", ascending=False).reset_index()

print("The places with the best air quality for health in the dataset are:\n")
for i in range(10):
  while True:
    try:
      aqi_value = sorted_df.at[i, "AQI Value"]
      coordinates = sorted_df.loc[i , ["lat", "lng"]].tolist()
    except:
      continue
    break

  address = getAddress(coordinates)
  print(f'{i+1}. {address}, with a value of {aqi_value}.')

The places with the best air quality for health in the dataset are:

1. unclassified area in Haryana, India, with a value of 500.
2. residential area in Rajasthan, India, with a value of 500.
3. trunk area in Тында, Россия, with a value of 500.
4. yes area in Zürich, Schweiz/Suisse/Svizzera/Svizra, with a value of 500.
5. house area in Colorado, United States, with a value of 500.
6. yes area in Durango, España, with a value of 500.
7. residential area in Durango, México, with a value of 500.
8. unclassified area in Punjab, India, with a value of 500.
9. car area in California, United States, with a value of 500.
10. secondary area in Delhi, India, with a value of 500.


In [None]:
# Find how many metrics there are per city
# Does not work because of latency of geolocator library

countries_column = []

"""for i in range(len(df)):
  coordinates = sorted_df.loc[i , ["lat", "lng"]].tolist()
  while True:
    try:
      location = geolocator.reverse(coordinates)
    except:
      continue
    break
  country = location.address
  countries_column.append(country)"""

df['Country'] = countries_column



In [None]:
print(countries_column).value_counts()