# Web Scraping COVID-19 Data from the Worldometer Website and Analyzing It

In [6]:
#IMPORTING LIBRARIES
import os
from datetime import datetime

import numpy as np
import pandas as pd
import plotly as py
import plotly.graph_objs as go
import requests
from bs4 import BeautifulSoup
from plotly.plotly import iplot
# Plotly credentials are read from the environment so that no secret is stored
# in the notebook. When either variable is unset the call is skipped.
# NOTE(review): presumably plotly then falls back to a credentials file written
# by an earlier session -- confirm. An API key was previously hard-coded here
# and should be revoked.
plotly_username = os.environ.get('PLOTLY_USERNAME')
plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if plotly_username and plotly_api_key:
    py.tools.set_credentials_file(username=plotly_username, api_key=plotly_api_key)

# Allow long frames to be displayed without truncation.
pd.set_option('display.max_rows', 5000)

In [7]:
#scraping data.
def scrap_table(link, id, timeout=30):
    """Download a page and return one of its HTML tables as a DataFrame.

    Parameters
    ----------
    link : str
        URL of the page to download.
    id : str
        Value of the ``id`` attribute of the ``<table>`` element to extract.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30). Without
        a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` of the table's first ``<tbody>``, holding the text
        of its ``<td>`` cells; column labels are the text of every ``<th>``
        found in the table.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page has no table with the requested id, or that table has no
        ``<tbody>``.
    """
    results = requests.get(link, timeout=timeout)
    # Echo the response object so the HTTP status stays visible in the cell output.
    print(results)
    # Fail here on an error page instead of later with an unrelated parsing error.
    results.raise_for_status()

    soup = BeautifulSoup(results.content, 'lxml')
    table_tag = soup.find("table", {"id": id})
    if table_tag is None:
        raise ValueError("No <table> with id %r found at %s" % (id, link))
    if table_tag.tbody is None:
        raise ValueError("Table %r at %s has no <tbody>" % (id, link))

    columns = [th.text for th in table_tag.find_all('th')]
    rows = [
        [col.text for col in row_tag.find_all('td')]
        for row_tag in table_tag.tbody.find_all('tr')
    ]
    return pd.DataFrame(rows, columns=columns)

In [8]:
#data cleaning
def _clean_cell(value):
    """Normalise one scraped cell; blank or missing cells become the integer 0."""
    if isinstance(value, str):
        # Order matters: surrounding newlines first, then surrounding '+' signs,
        # then every thousands separator.
        text = value.strip('\n').strip('+').replace(',', '')
        # Empty or whitespace-only text counts as "no value".
        return text if text.strip() else 0
    if pd.api.types.is_scalar(value) and pd.isna(value):
        # None/NaN, e.g. padding added by pandas for rows shorter than the header.
        return 0
    # Anything else (for instance an already-cleaned number) passes through.
    return value


def clean_my_data(to_clean_data):
    """Return a cleaned copy of a scraped table.

    For every string cell, leading/trailing newlines and then leading/trailing
    ``+`` signs are stripped and all commas are removed. Cells that end up
    empty or whitespace-only, as well as missing cells, are replaced by ``0``.
    Non-string cells are left as they are, so the function can be applied to
    its own output.

    Parameters
    ----------
    to_clean_data : pandas.DataFrame
        Table to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``to_clean_data``.
    """
    # Work on a copy so the caller's frame is left untouched.
    cleaned = to_clean_data.copy()
    for column in cleaned.columns:
        cleaned[column] = [_clean_cell(value) for value in cleaned[column]]
    return cleaned

# World COVID DATA ANALYSIS AS OF TODAY..

In [9]:
# Scrape the table with id 'main_table_countries_today' from the Worldometer
# coronavirus landing page.
my_data = scrap_table(
    link="https://www.worldometers.info/coronavirus/",
    id='main_table_countries_today',
)

<Response [200]>


In [10]:
# Strip newlines, '+' signs and thousands separators; blank cells become 0.
my_data = my_data.pipe(clean_my_data)

In [11]:
# Every column except the text ones is converted to a numeric dtype. The whole
# column is converted in one vectorised call rather than cell by cell.
text_columns = ['Country,Other', 'Reported1st case']
for i in my_data:
    if i not in text_columns:
        my_data[i] = pd.to_numeric(my_data[i])

In [31]:
# Cruise ships appear in the country table; keep them in their own frame and
# drop them from the per-country data.
# NOTE(review): my_data is reassigned below, so re-running this cell without
# re-running the cells above leaves my_data_ships empty.
ships = ['Diamond Princess','MS Zaandam']
is_ship = my_data['Country,Other'].isin(ships)
my_data_ships = my_data[is_ship]
my_data = my_data[~is_ship]

# nlargest already returns its rows ordered from largest to smallest, so no
# separate sort_values call is needed beforehand.
my_data1 = my_data.nlargest(30,'TotalCases')
my_data2 = my_data.nlargest(30,'NewCases')

In [32]:
def _world_bar(frame, column, color):
    """Return a bar trace of ``column`` per country, filled and outlined in ``color``.

    ``frame`` must contain a 'Country,Other' column (used for both the x axis
    and the hover text) and the numeric ``column``; the trace is named after
    ``column``.
    """
    return go.Bar(
        x=frame['Country,Other'],
        y=frame[column],
        name=column,
        marker=dict(color=color, line=dict(color=color, width=1.5)),
        text=frame['Country,Other'],
    )


RED = 'rgb(227,26,28)'
GREEN = 'rgb(51,160,44)'
# The fill colour used to be 'rgb(166,206,300)'; 300 is not a valid channel
# value, so it now matches the outline colour that was already in use.
LIGHT_BLUE = 'rgb(166,206,227)'

# my_data1 / my_data2 (top 30 by TotalCases / NewCases) come from the previous cell.
# Cumulative figures for the 30 countries with the most total cases.
trace1 = _world_bar(my_data1, 'TotalCases', RED)
trace2 = _world_bar(my_data1, 'TotalDeaths', GREEN)
trace3 = _world_bar(my_data1, 'TotalRecovered', LIGHT_BLUE)

# Today's figures for the 30 countries with the most new cases.
trace4 = _world_bar(my_data2, 'NewCases', RED)
trace5 = _world_bar(my_data2, 'NewDeaths', GREEN)

In [33]:
# Grouped bars: total cases, deaths and recoveries side by side per country.
data = [trace1, trace2, trace3]
layout = go.Layout(
    title='Values for Top 30 Majorly Impacted Countries.',
    barmode="group",
)
fig = go.Figure(layout=layout, data=data)
iplot(fig)


Consider using IPython.display.IFrame instead



# VALUES FOR TOP 30 CASES FOR TODAY..!!

In [34]:
# Grouped bars: today's new cases and new deaths side by side per country.
data = [trace4, trace5]
layout = go.Layout(
    title='Values for Top 30 Country Cases for Today.',
    barmode="group",
)
fig = go.Figure(layout=layout, data=data)
iplot(fig)

# Percentage Wise Analysis of the Total COVID-19 Cases.

In [35]:
#pie plot of Active Recovered and Death Cases.
# Donut chart of how the summed ActiveCases, TotalDeaths and TotalRecovered
# columns of my_data split between the three categories.
Active_cases = my_data.ActiveCases.sum()
total_recover = my_data.TotalRecovered.sum()
total_deaths = my_data.TotalDeaths.sum()

total = ['Active Cases', 'Total Deaths', 'Total Recovered']
values = [Active_cases,total_deaths,total_recover]

# `go` is already imported in the first cell, so it is not imported again here.
data1 = {
   "values": values,
   "labels": total,
   # The pie is drawn in the first column of the two-column grid declared below.
   "domain": {"column": 0},
   "name": "COVID-19 Cases",
   "hole": .4,
   "type": "pie"
}
data = [data1]
layout = go.Layout(
   {
      "title":"Total COVID-19 Cases",
      "grid": {"rows": 1, "columns": 2},
      "annotations": [
         {
            # Caption placed over the hole of the donut.
            "font": {
               "size": 20
            },
            "showarrow": False,
            "text":'Total Cases',
            "x": 0.16,
            "y": 0.5
         }
      ]
   }
)
fig = go.Figure(data = data, layout = layout)
iplot(fig)

# Further Targeting the Country with maximum Cases:  UNITED STATES

In [36]:
# Scrape the table with id 'usa_table_countries_today' from the Worldometer US page.
us_data_today = scrap_table(
    link='https://www.worldometers.info/coronavirus/country/us/',
    id='usa_table_countries_today',
)

<Response [200]>


In [37]:
# Strip newlines, '+' signs and thousands separators; blank cells become 0.
us_data = us_data_today.pipe(clean_my_data)

In [38]:
# The first column of the US table holds the row labels used on the x axis.
country = us_data.columns[0]


def _us_bar(column, color):
    """Return a bar trace of ``column`` per row of ``us_data``, drawn in ``color``.

    ``us_data`` is cleaned but never converted to numbers in this notebook, so
    the values are converted here; anything non-numeric becomes NaN.
    """
    return go.Bar(
        x=us_data[country],
        y=pd.to_numeric(us_data[column], errors='coerce'),
        name=column,
        marker=dict(color=color, line=dict(color=color, width=1.5)),
        text=us_data[country],
    )


trace6 = _us_bar('TotalCases', 'rgb(227,26,28)')
trace7 = _us_bar('TotalDeaths', 'rgb(51,160,44)')
trace8 = _us_bar('NewCases', 'rgb(51,160,44)')
# Was 'rgb(166,206,300)'; 300 is not a valid channel value, so 227 is used instead.
trace9 = _us_bar('NewDeaths', 'rgb(166,206,227)')

# Grouped bars of cumulative cases and deaths per row of the US table.
data = [trace6,trace7]
layout = go.Layout(barmode = "group",title='United States: Total Data Till Date.')
fig = go.Figure(data = data, layout = layout)
iplot(fig)

1. New York is the most widely impacted state in the United States.

In [21]:
# Grouped bars: today's new cases and new deaths per row of the US table.
data = [trace8, trace9]
layout = go.Layout(
    title='United States: New Cases Data for Today.',
    barmode="group",
)
fig = go.Figure(layout=layout, data=data)
iplot(fig)

# Countries Where More Than 50% of the Total Cases Have Recovered

In [39]:
# Keep the rows whose recovered-to-total-cases ratio exceeds one half, with
# only the three columns needed for the table further down.
data_recovered_filtered = my_data.loc[
    my_data['TotalRecovered'].div(my_data['TotalCases']).gt(0.5),
    ['Country,Other', 'TotalCases', 'TotalRecovered'],
]

In [40]:
# Order the selection from the most to the fewest total cases.
data_recovered_filtered = data_recovered_filtered.sort_values(
    by='TotalCases',
    ascending=False,
)

In [41]:
# TotalRemaining = total cases minus recovered cases.
data_recovered_filtered['TotalRemaining'] = data_recovered_filtered['TotalCases'].sub(
    data_recovered_filtered['TotalRecovered']
)

In [48]:
# Re-order ascending by TotalRemaining, smallest value first.
data_recovered_filtered = data_recovered_filtered.sort_values(by='TotalRemaining')

In [43]:
# Render the filtered frame as a plotly table: the header lists the frame's
# column labels, the cells hold the four columns in the order given below.
fig = go.Figure(
    data=[
        go.Table(
            header={
                'values': list(data_recovered_filtered.columns),
                'fill_color': 'paleturquoise',
                'align': 'left',
            },
            cells={
                'values': [
                    data_recovered_filtered[label]
                    for label in ['Country,Other', 'TotalCases', 'TotalRecovered', 'TotalRemaining']
                ],
                'fill_color': 'lavender',
                'align': 'left',
            },
        )
    ]
)

iplot(fig)

# In addition to the country data above, cases were also reported on two cruise ships.

# Diamond Princess :
1. British-registered cruise ship owned and operated by Princess Cruises.<br>
2. Approximately 3,711 passengers and crew members were on board.

# MS Zaandam:
1. Cruise ship owned and operated by Holland America Line.<br>


In [62]:
# Columns shown for the cruise ships. The same list drives the header and the
# cells so the two always line up; the header used to list every column of
# my_data_ships while only these six columns of cells were supplied.
ship_columns = ['Country,Other', 'TotalCases', 'NewCases',
                'TotalDeaths', 'NewDeaths', 'TotalRecovered']

fig = go.Figure(data=[go.Table(
    header=dict(values=ship_columns,
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[my_data_ships[column] for column in ship_columns],
               fill_color='lavender',
               align='left'))
])

iplot(fig)


Consider using IPython.display.IFrame instead



In [58]:
# Display the rows that were split off from my_data for the cruise ships.
my_data_ships

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop
60,Diamond Princess,712,0,11,0,619,82,10,0.0,0.0
171,MS Zaandam,9,0,2,0,0,7,0,0.0,0.0
