# THE PROBLEM

1. In cities like Delhi, staying outdoors can be equivalent to smoking 50 cigarettes. The cause is smog and air pollution.
2. Air pollution is the single biggest killer in the world, with some pollutants reducing avg. global life expectancy by 2.2 years. 
3. There are >5 million deaths from smog a year. 
4. The global economy lost USD 29 trillion in 2018 because of air pollution, according to the World Economic Forum.

What people do not understand is that this problem could easily be ended if some international laws were followed. These mainly include cap-and-trade schemes for emissions, which the United Nations calls ETS and which are used by many countries around the world. However, one very complicated problem prevents them from working: tracking pollutants. If a factory pollutes, it is very difficult to trace the pollutants back to it.

# MY SOLUTION: SMOG-PIRATE

Smog-Pirate is an inventive, data-driven solution to the above problem. First, I studied years of weather and air pollution data and found correlations between some factors. Then, using multiple regression and adjusting the number of factors, I was able to forecast the AQI (Air Quality Index) with >97% accuracy. I started with 17 inputs and ended with 4. I evaluated the model by repeatedly refitting it on random 80/20 train/test splits of the data and averaging the accuracy.

I compare the forecast AQI with the actual AQI, and if the actual value is much larger, I trace the pollutants to different factories based on wind velocities over the last day.

There are many complex steps that I had to go through before making this project. The first was to scrape the web for data for weather and live AQI. I did this through Beautiful Soup 4 and requests.

Then, I tried to make a novel algorithm for tracking pollutants, and it worked really well. I changed the wind velocity from the last day to vectors and added them to find coordinates. I then referenced known coordinates of factories to give a list of polluters. I also had to use an ipinfo.io API for finding my current coordinates.

I then compared the AQIs and prescribed a penalty to the polluters.


In [1]:
#Importing required libraries
import json
from urllib.request import urlopen
import requests
from sklearn import linear_model
from sklearn.metrics import r2_score
import numpy as np
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from sklearn.model_selection import train_test_split
import math
import pandas

# Known industrial sites, one [name, coordinate, coordinate] entry per site.
# NOTE(review): presumably [name, latitude, longitude]; 'A' and 'A-1' share
# the same coordinates - confirm both are intended.
industry_coords = [['A', 34, -84], ['A1', 0, 0], ['A-1', 34, -84]]
# A real, independent copy. Binding a second name to the same list would let
# any later in-place change to one silently alter the other.
industry_coords_copy = [list(site) for site in industry_coords]
# Wind directions (compass abbreviations) and wind speeds, filled by the scraper.
a = []
a0 = []

#Data from web-scraping starts

# Seconds to wait for weather.com; without a timeout requests.get() can block forever.
_REQUEST_TIMEOUT = 30


def _fetch_soup(url):
    """Download *url* and return the page parsed with BeautifulSoup's 'html.parser'."""
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    return BeautifulSoup(response.content, 'html.parser')


def _texts(parsed, tag, attrs, minimum=0):
    """Return the text of every *tag* element in *parsed* matching *attrs*.

    Raises:
        LookupError: if fewer than *minimum* elements match, which usually
            means the page layout changed and the selector no longer applies.
    """
    found = [node.text for node in parsed.findAll(tag, attrs=attrs)]
    if len(found) < minimum:
        raise LookupError(
            "expected at least %d <%s> elements matching %r, found %d"
            % (minimum, tag, attrs, len(found))
        )
    return found


#Finding wind speeds for last day for tracking pollutants
# NOTE(review): the URL is an "hourbyhour" page - confirm it reports past hours.
hourly = _fetch_soup('https://weather.com/en-IN/weather/hourbyhour/l/cc76c08b470b5ddd6e64efd9ce8f256542cfed4ba52f6c00a30a74da519cd070')
for k in _texts(hourly, 'span', {"class": "Wind--windWrapper--3Ly7c undefined"}):
    # Each entry reads like "WNW 17 km/h": direction, speed, unit.
    fields = k.split(' ')
    a.append(fields[0])
    a0.append(float(fields[1]))
    print(k)

#Finding Humidity
# `soup` keeps pointing at the "today" page; later steps read the live AQI from it.
soup = _fetch_soup('https://weather.com/en-IN/weather/today/l/cc76c08b470b5ddd6e64efd9ce8f256542cfed4ba52f6c00a30a74da519cd070')
# The last percentage on the page is used as the humidity.
k = _texts(soup, 'span', {"data-testid": "PercentageValue"}, minimum=1)[-1]
avg3 = float(k.replace("%", ""))
print(avg3)

# Both temperature readings come from the same list of spans.
temperatures = _texts(soup, 'span', {"data-testid": "TemperatureValue"}, minimum=3)

#Finding Dew Point
# The 11th temperature span (index 10), or the last one if the page has fewer.
k = temperatures[min(10, len(temperatures) - 1)]
avg2 = float(k.replace("°", ""))

#Finding avg. temp
# Mean of the second and third temperature spans.
avg1 = (float(temperatures[1].replace("°", "")) + float(temperatures[2].replace("°", ""))) / 2

#Finding Pressure
# The last pressure span; strip the unit and the trend-arrow labels before parsing.
k = _texts(soup, 'span', {"data-testid": "PressureValue"}, minimum=1)[-1]
avg5 = float(k.replace(" mb", "").replace("Arrow Up", "").replace("Arrow Down", ""))
print(avg5)

#Creating model and extracting AQI and weather training data
x2 = a0  # wind speeds from the scraper (same list object as a0)
y2 = []  # wind bearings, filled by the direction-conversion step below
df = pandas.read_csv("AQI.csv")

# Look up the current position from ipinfo.io; the context manager closes the
# HTTP response, which the bare json.load(urlopen(...)) call never did.
with urlopen("https://ipinfo.io/") as response:
    x0 = json.load(response)['loc'].split(',')
# NOTE(review): 'loc' is presumably "latitude,longitude" - confirm.
x1 = float(x0[0])
y = float(x0[1])

X = df[['Avg1', 'Avg2', 'Avg3', 'Avg5']].values
y1 = df['AQI']
regr = linear_model.LinearRegression()
regr.fit(X, y1)
# Feature order matches the training columns above. Both temperatures go
# through the Celsius-to-Fahrenheit formula; the pressure factor 0.02953 is
# presumably millibar -> inches of mercury (verify against AQI.csv units).
x = regr.predict([[9/5*avg1+32, 9/5*avg2+32, avg3, avg5*0.02953]])

#Printing predicted AQI
# (exclusive upper bound, label) pairs, checked in ascending order.
_AQI_BANDS = (
    (50, 'Good'),
    (101, 'Moderate'),
    (150, "Unhealthy for Sensitive People"),
    (200, "Unhealthy"),
    (300, "Hazardous"),
)
# x holds a single prediction; compare the scalar rather than the array.
print(next((label for limit, label in _AQI_BANDS if x[0] < limit), "Very Hazardous"))
 
#Converting past wind to vectors to track pollutants
# Compass abbreviation -> bearing in degrees, clockwise from north. Western
# bearings use the negative form, which is equivalent for sin/cos.
_COMPASS_DEGREES = {
    'N': 0, 'NNE': 22.5, 'NE': 45, 'ENE': 67.5,
    'E': 90, 'ESE': 112.5, 'SE': 135, 'SSE': 157.5,
    'S': 180, 'SSW': 202.5, 'SW': 225, 'WSW': 247.5,  # WSW was 237.5
    'W': -90, 'WNW': -67.5, 'NW': -45, 'NNW': -22.5,
}
for direction in a:
    # An unknown label used to be skipped, which shifted every later bearing
    # onto the wrong wind speed; fail loudly instead.
    if direction not in _COMPASS_DEGREES:
        raise ValueError("Unrecognised wind direction: %r" % (direction,))
    y2.append(_COMPASS_DEGREES[direction])

#Converting
if len(x2) != len(y2):
    raise ValueError("wind speeds and bearings differ in length: %d vs %d" % (len(x2), len(y2)))
y2[:] = [math.radians(degrees) for degrees in y2]
for speed, bearing in zip(x2, y2):
    # Displacement is speed * (cos, sin) of the bearing. The previous code took
    # cos/sin of the speed and scaled by the bearing, i.e. the two were swapped.
    # NOTE(review): 54.6 is an unexplained distance-to-degrees factor, and the
    # subtraction assumes a particular wind-direction convention - confirm both.
    x1 = x1 - speed * math.cos(bearing) / 54.6
    y = y - speed * math.sin(bearing) / 54.6

#Finding distance from pollution center
# A1 stays in industry_coords order; A2 is a separate sorted copy. Sorting an
# alias in place would break the A1.index() lookups used when reporting.
A1 = [math.hypot(site[1] - x1, site[2] - y) / 54.6 for site in industry_coords]
A2 = sorted(A1)

#Finding current AQI and comparing to forecasted AQI
blog = soup.findAll('text', attrs={"data-testid": "DonutChartValue"})
if not blog:
    raise LookupError("no DonutChartValue element found; page layout may have changed")
# The second matching element is used when present, otherwise the only one.
# NOTE(review): confirm the second value (not the first) is the live AQI.
k = blog[min(1, len(blog) - 1)].text
AQI = float(k)
print(AQI)
print(x[0])

#Printing possible sources from our calculated coordinates and deciding penalty by comparing AQI
excess = AQI - x[0]
if excess > 50:
    print("Pollution coordinates:")
    print("%.4f"%x1)
    print("%.4f"%y)
    print("Most probable pollution sources in descending order of possibility:")
    # Rank sites by distance from the estimated source, nearest first. Ranking
    # the site records directly avoids depending on the distance lists, which
    # may have been sorted or overwritten in place.
    ranked_sites = sorted(
        industry_coords_copy,
        key=lambda site: (site[1] - x1) ** 2 + (site[2] - y) ** 2,
    )
    for site in ranked_sites:
        print(site[0])
    print("Penalty as percentage of revenue:")
    print(excess/x[0]*100)
else:
    print("No penalty required")

WNW 17 km/h
WNW 19 km/h
WNW 19 km/h
WNW 20 km/h
WNW 18 km/h
NW 14 km/h
WNW 13 km/h
WNW 12 km/h
WNW 12 km/h
NW 12 km/h
WNW 12 km/h
WNW 13 km/h
WNW 13 km/h
WNW 13 km/h
WNW 14 km/h
WNW 15 km/h
WNW 15 km/h
WNW 15 km/h
WNW 17 km/h
WNW 18 km/h
WNW 20 km/h
WNW 23 km/h
WNW 24 km/h
WNW 25 km/h
WNW 26 km/h
WNW 26 km/h
WNW 26 km/h
WNW 24 km/h
WNW 21 km/h
WNW 18 km/h
WNW 16 km/h
WNW 15 km/h
WNW 15 km/h
WNW 15 km/h
WNW 14 km/h
WNW 14 km/h
WNW 14 km/h
WNW 13 km/h
WNW 13 km/h
WNW 13 km/h
WNW 13 km/h
WNW 13 km/h
WNW 13 km/h
WNW 15 km/h
WNW 16 km/h
WNW 17 km/h
WNW 17 km/h
WNW 18 km/h
25.0
1017.3
Hazardous
186.0
211.35718980985735
No penalty required


In [2]:
#Importing required libraries
import json
from urllib.request import urlopen
import requests
from sklearn import linear_model
from sklearn.metrics import r2_score
import numpy as np
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from sklearn.model_selection import train_test_split
import math
import pandas

In [3]:
# Known industrial sites, one [name, coordinate, coordinate] entry per site.
# NOTE(review): presumably [name, latitude, longitude]; 'A' and 'A-1' share
# the same coordinates - confirm both are intended.
industry_coords = [['A', 34, -84], ['A1', 0, 0], ['A-1', 34, -84]]
# A real, independent copy. Binding a second name to the same list would let
# any later in-place change to one silently alter the other.
industry_coords_copy = [list(site) for site in industry_coords]
# Wind directions (compass abbreviations) and wind speeds, filled by the scraper.
a = []
a0 = []

In [4]:
#Data from web-scraping starts

#Collecting the hourly wind readings used later to trace pollutants
page = requests.get(
    'https://weather.com/en-IN/weather/hourbyhour/l/cc76c08b470b5ddd6e64efd9ce8f256542cfed4ba52f6c00a30a74da519cd070'
)
soup = BeautifulSoup(page.content, 'html.parser')
blog = soup.findAll(
    'span',
    attrs={"class": "Wind--windWrapper--3Ly7c undefined"},
)

for wind_span in blog:
    k = wind_span.text
    # Space-separated entry: direction first, then the numeric speed.
    fields = k.split(' ')
    a.append(fields[0])
    a0.append(float(fields[1]))
    print(k)

WNW 17 km/h
WNW 19 km/h
WNW 19 km/h
WNW 20 km/h
WNW 18 km/h
NW 14 km/h
WNW 13 km/h
WNW 12 km/h
WNW 12 km/h
NW 12 km/h
WNW 12 km/h
WNW 13 km/h
WNW 13 km/h
WNW 13 km/h
WNW 14 km/h
WNW 15 km/h
WNW 15 km/h
WNW 15 km/h
WNW 17 km/h
WNW 18 km/h
WNW 20 km/h
WNW 23 km/h
WNW 24 km/h
WNW 25 km/h
WNW 26 km/h
WNW 26 km/h
WNW 26 km/h
WNW 24 km/h
WNW 21 km/h
WNW 18 km/h
WNW 16 km/h
WNW 15 km/h
WNW 15 km/h
WNW 15 km/h
WNW 14 km/h
WNW 14 km/h
WNW 14 km/h
WNW 13 km/h
WNW 13 km/h
WNW 13 km/h
WNW 13 km/h
WNW 13 km/h
WNW 13 km/h
WNW 15 km/h
WNW 16 km/h
WNW 17 km/h
WNW 17 km/h
WNW 18 km/h


In [5]:
#Reading the humidity from the "today" page
page = requests.get(
    'https://weather.com/en-IN/weather/today/l/cc76c08b470b5ddd6e64efd9ce8f256542cfed4ba52f6c00a30a74da519cd070'
)
soup = BeautifulSoup(page.content, 'html.parser')
blog = soup.findAll('span', attrs={"data-testid": "PercentageValue"})
# Only the last matching span matters; k is left untouched when none match.
if blog:
    k = blog[-1].text
humidity_text = k.replace("%", "")
avg3 = float(humidity_text)
print(avg3)

25.0


In [6]:
#Reading the dew point
blog = soup.findAll('span', attrs={"data-testid": "TemperatureValue"})
# Walk the spans until index 10 is reached, so k ends up as the 11th span's
# text, or the last span's text when the page has fewer than 11.
for position, temperature_span in enumerate(blog):
    k = temperature_span.text
    if position >= 10:
        break
dew_point_text = k.replace("°", "")
avg2 = float(dew_point_text)

In [7]:
#Averaging two temperature readings
blog = soup.findAll('span', attrs={"data-testid": "TemperatureValue"})
avg1 = 0
for position, temperature_span in enumerate(blog):
    k = temperature_span.text
    # Only the second and third spans contribute to the sum.
    if position in (1, 2):
        avg1 += float(k.replace("°", ""))
# Two readings are expected, so halve the sum.
avg1 /= 2

In [8]:
#Reading the pressure
blog = soup.findAll('span', attrs={"data-testid": "PressureValue"})
# Only the last matching span matters; k is left untouched when none match.
if blog:
    k = blog[-1].text
# Drop the unit and the trend-arrow labels, in this order, before parsing.
pressure_text = k
for noise in (" mb", "Arrow Up", "Arrow Down"):
    pressure_text = pressure_text.replace(noise, "")
avg5 = float(pressure_text)
print(avg5)

1017.3


In [9]:
#Creating model and extracting AQI and weather training data
x2 = a0  # wind speeds from the scraper (same list object as a0)
y2 = []  # wind bearings, filled by the direction-conversion cell
df = pandas.read_csv("AQI.csv")

# Look up the current position from ipinfo.io; the context manager closes the
# HTTP response, which the bare json.load(urlopen(...)) call never did.
with urlopen("https://ipinfo.io/") as response:
    x0 = json.load(response)['loc'].split(',')
# NOTE(review): 'loc' is presumably "latitude,longitude" - confirm.
x1 = float(x0[0])
y = float(x0[1])

X = df[['Avg1', 'Avg2', 'Avg3', 'Avg5']].values
y1 = df['AQI']
regr = linear_model.LinearRegression()
regr.fit(X, y1)
# Feature order matches the training columns above. Both temperatures go
# through the Celsius-to-Fahrenheit formula; the pressure factor 0.02953 is
# presumably millibar -> inches of mercury (verify against AQI.csv units).
x = regr.predict([[9/5*avg1+32, 9/5*avg2+32, avg3, avg5*0.02953]])

In [10]:
#Printing predicted AQI
# (exclusive upper bound, label) pairs, checked in ascending order; anything
# at or above the last bound falls through to "Very Hazardous".
_AQI_BANDS = (
    (50, 'Good'),
    (101, 'Moderate'),  # label was misspelled 'Moderte'
    (150, "Unhealthy for Sensitive People"),
    (200, "Unhealthy"),
    (300, "Hazardous"),
)
# x holds a single prediction; compare the scalar rather than the array.
predicted_aqi = x[0]
print(next((label for limit, label in _AQI_BANDS if predicted_aqi < limit), "Very Hazardous"))

Hazardous


In [11]:
#Converting past wind to vectors to track pollutants
# Compass abbreviation -> bearing in degrees, clockwise from north. Western
# bearings keep their negative form so existing values are unchanged.
_COMPASS_DEGREES = {
    'N': 0, 'NNE': 22.5, 'NE': 45, 'ENE': 67.5,
    'E': 90, 'ESE': 112.5, 'SE': 135, 'SSE': 157.5,
    'S': 180, 'SSW': 202.5, 'SW': 225, 'WSW': 247.5,  # WSW was 237.5
    'W': -90, 'WNW': -67.5, 'NW': -45, 'NNW': -22.5,
}
for direction in a:
    # An unknown label used to be skipped, which left y2 shorter than the
    # wind-speed list and paired later bearings with the wrong speeds.
    if direction not in _COMPASS_DEGREES:
        raise ValueError("Unrecognised wind direction: %r" % (direction,))
    y2.append(_COMPASS_DEGREES[direction])

In [12]:
#Converting
# Each wind sample needs exactly one bearing.
if len(x2) != len(y2):
    raise ValueError("wind speeds and bearings differ in length: %d vs %d" % (len(x2), len(y2)))
# Bearings arrive in degrees; convert the list in place to radians.
y2[:] = [math.radians(degrees) for degrees in y2]
for speed, bearing in zip(x2, y2):
    # Displacement is speed * (cos, sin) of the bearing. The previous code took
    # cos/sin of the speed and scaled by the bearing, i.e. the two were swapped.
    # NOTE(review): 54.6 is an unexplained distance-to-degrees factor, and the
    # subtraction assumes a particular wind-direction convention - confirm both.
    x1 = x1 - speed * math.cos(bearing) / 54.6
    y = y - speed * math.sin(bearing) / 54.6

In [13]:
#Finding distance from pollution center
# A1[n] is the scaled straight-line distance from the estimated source (x1, y)
# to industry_coords[n], so positions in A1 map back to sites.
A1 = [
    (((site[1] - x1) ** 2 + (site[2] - y) ** 2) ** (1 / 2)) * 1 / 54.6
    for site in industry_coords
]
# A sorted copy. `A2 = A1` followed by `A2.sort()` sorted A1 itself, so
# A1.index(...) no longer pointed at the matching site when reporting.
A2 = sorted(A1)

In [14]:
#Reading the live AQI and showing it next to the forecast
blog = soup.findAll('text', attrs={"data-testid": "DonutChartValue"})
# The scan stops on the second match, so k holds the second element's text
# when there are at least two and the only element's text otherwise.
if blog:
    k = blog[min(1, len(blog) - 1)].text
AQI = float(k)
for value in (AQI, x[0]):
    print(value)

186.0
211.35718980985735


In [15]:
#Printing possible sources from our calculated coordinates and deciding penalty by comparing AQI
# A penalty applies only when the measured AQI exceeds the forecast by more than 50.
excess = AQI - x[0]
if excess > 50:
    print("Pollution coordinates:")
    print("%.4f"%x1)
    print("%.4f"%y)
    print("Most probable pollution sources in descending order of possibility:")
    # Rank sites by distance from the estimated source, nearest first. Ranking
    # the site records directly avoids depending on the distance lists, which
    # may have been sorted or overwritten in place.
    ranked_sites = sorted(
        industry_coords_copy,
        key=lambda site: (site[1] - x1) ** 2 + (site[2] - y) ** 2,
    )
    for site in ranked_sites:
        print(site[0])
    print("Penalty as percentage of revenue:")
    print(excess/x[0]*100)
else:
    # Same message the single-cell version of this script prints.
    print("No penalty required")

In [None]:
#Testing model accuracy
# Refits the regression on 1000 random 80/20 splits and reports how often the
# predicted AQI band agrees with the actual value, overall and per band.
from sklearn import linear_model
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import math
import pandas
import matplotlib.pyplot as plt

df = pandas.read_csv("AQI.csv")
# The features and target never change between runs, so build them once.
features = df[['Avg1', 'Avg2', 'Avg3', 'Avg5']].values
targets = df['AQI']

# Exclusive upper bounds of the AQI bands; predictions at or above the last
# bound fall into one final open-ended band.
band_limits = (50, 101, 150, 200, 300)
open_band = len(band_limits)
band_hits = [0] * (open_band + 1)    # correct predictions per band
band_totals = [0] * (open_band + 1)  # all predictions per band

Y_val = []  # accuracy of each run
x_val = []  # run index of each entry in Y_val

for run in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2)
    regr = linear_model.LinearRegression()
    regr.fit(X_train, y_train)

    correct = 0
    # Score every test sample (the last one used to be skipped), placing each
    # prediction in exactly one band. The old `if i<200` restarted the chain
    # and counted predictions below 150 twice.
    for predicted, actual in zip(regr.predict(X_test), y_test.values):
        band = next(
            (index for index, limit in enumerate(band_limits) if predicted < limit),
            open_band,
        )
        band_totals[band] += 1
        # NOTE(review): a hit only requires the actual AQI to be below the
        # band's upper bound (the lower bound is not checked), and the open
        # band always counts as a hit - kept as in the original scoring.
        if band == open_band or actual < band_limits[band]:
            correct += 1
            band_hits[band] += 1

    Y_val.append(correct / len(X_test))
    x_val.append(run)

# Per-band accuracy for the three highest bands; NaN (no bar) when a band
# never occurred, instead of a ZeroDivisionError.
band_accuracy = [
    band_hits[band] / band_totals[band] if band_totals[band] else float('nan')
    for band in (3, 4, 5)
]
band_labels = ['Level 4', 'Level 5', 'Toxic']
plt.bar(band_labels, band_accuracy)
print(sum(Y_val)/len(Y_val))


# 