## Web Scrapping And Analysis of "list of most popular website" from Wikipedia

In [1]:
# import necessary libraries

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Request for the HTML response using the URL

wikipedia_url = "https://en.wikipedia.org/wiki/List_of_most_visited_websites"
response = requests.get(wikipedia_url)
print(response.status_code)


200


In [3]:
# parse data from the html into a beautifulsoup object

soup = BeautifulSoup(response.text, "html.parser")
website_table = soup.find("table", {"class":"wikitable"})

In [4]:
website_table

<table class="wikitable sortable mw-datatable" style="width:100%; font-size:96%;">
<tbody><tr>
<th>Site
</th>
<th>Domain Name
</th>
<th data-sort-type="number"><a href="/wiki/Similarweb" title="Similarweb">Similarweb</a> top 50<br/>websites ranking<br/><small>(As of April 1, 2022<sup class="plainlinks noexcerpt noprint asof-tag update" style="display:none;"><a class="external text" href="https://en.wikipedia.org/w/index.php?title=List_of_most_visited_websites&amp;action=edit">[update]</a></sup>)</small><sup class="reference" id="cite_ref-Similarweb_1-0"><a href="#cite_note-Similarweb-1">[1]</a></sup>
</th>
<th>Category
</th>
<th>Principal country/territory
</th></tr>
<tr>
<td><a href="/wiki/Google_Search" title="Google Search">Google Search</a>
</td>
<td>google.com
</td>
<td>1 (<img alt="Steady" data-file-height="300" data-file-width="300" decoding="async" height="11" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/96/Steady2.svg/11px-Steady2.svg.png" srcset="//upload.wikimedia.o

In [5]:
# puting the table into a dataframe

popular_website_df = pd.read_html(str(website_table))
df = pd.DataFrame(popular_website_df[0])
df

Unnamed: 0,Site,Domain Name,"Similarweb top 50websites ranking(As of April 1, 2022[1]",Category,Principal country/territory
0,Google Search,google.com,1 (),Computers Electronics and Technology > Search ...,United States
1,YouTube,youtube.com,2 (),Arts & Entertainment > TV Movies and Streaming,United States
2,Facebook,facebook.com,3 (),Computers Electronics and Technology > Social ...,United States
3,Twitter,twitter.com,4 (),Computers Electronics and Technology > Social ...,United States
4,Instagram,instagram.com,5 (),Computers Electronics and Technology > Social ...,United States
5,Baidu,baidu.com,6 (),Computers Electronics and Technology > Search ...,China
6,Wikipedia,wikipedia.org,7 (),Reference Materials > Dictionaries and Encyclo...,United States
7,Yandex,yandex.ru,8 (),Computers Electronics and Technology > Search ...,Russia
8,Yahoo,yahoo.com,9 (),News and Media,United States
9,xVideos,xvideos.com,10 (),Adult content,Czech Republic


## Cleaning the dataframe

In [6]:
# renaming the column for easy access

df = df.rename(columns = {'Similarweb top 50websites ranking(As of April 1, 2022[1]': "SimilarWeb Ranking", "Principal country/territory":"Country"})

df

Unnamed: 0,Site,Domain Name,"Similarweb top 50websites ranking(As of April 1, 2022[1]",Category,Country
0,Google Search,google.com,1 (),Computers Electronics and Technology > Search ...,United States
1,YouTube,youtube.com,2 (),Arts & Entertainment > TV Movies and Streaming,United States
2,Facebook,facebook.com,3 (),Computers Electronics and Technology > Social ...,United States
3,Twitter,twitter.com,4 (),Computers Electronics and Technology > Social ...,United States
4,Instagram,instagram.com,5 (),Computers Electronics and Technology > Social ...,United States
5,Baidu,baidu.com,6 (),Computers Electronics and Technology > Search ...,China
6,Wikipedia,wikipedia.org,7 (),Reference Materials > Dictionaries and Encyclo...,United States
7,Yandex,yandex.ru,8 (),Computers Electronics and Technology > Search ...,Russia
8,Yahoo,yahoo.com,9 (),News and Media,United States
9,xVideos,xvideos.com,10 (),Adult content,Czech Republic


In [7]:
# splitting the category

df[['Category', 'SubCategory']] = df['Category'].str.split(" > ", expand = True)
df

Unnamed: 0,Site,Domain Name,"Similarweb top 50websites ranking(As of April 1, 2022[1]",Category,Country,SubCategory
0,Google Search,google.com,1 (),Computers Electronics and Technology,United States,Search Engines
1,YouTube,youtube.com,2 (),Arts & Entertainment,United States,TV Movies and Streaming
2,Facebook,facebook.com,3 (),Computers Electronics and Technology,United States,Social Networks and Online Communities
3,Twitter,twitter.com,4 (),Computers Electronics and Technology,United States,Social Networks and Online Communities
4,Instagram,instagram.com,5 (),Computers Electronics and Technology,United States,Social Networks and Online Communities
5,Baidu,baidu.com,6 (),Computers Electronics and Technology,China,Search Engines
6,Wikipedia,wikipedia.org,7 (),Reference Materials,United States,Dictionaries and Encyclopedias
7,Yandex,yandex.ru,8 (),Computers Electronics and Technology,Russia,Search Engines
8,Yahoo,yahoo.com,9 (),News and Media,United States,
9,xVideos,xvideos.com,10 (),Adult content,Czech Republic,


In [8]:
df.columns


Index(['Site', 'Domain Name',
       'Similarweb top 50websites ranking(As of April 1, 2022[1]', 'Category',
       'Country', 'SubCategory'],
      dtype='object')

In [9]:
df[["SmilarWeb Rank", "nonsense_to_remove"]] = df.iloc[:,2].str.split(" ", expand = True)
df

Unnamed: 0,Site,Domain Name,"Similarweb top 50websites ranking(As of April 1, 2022[1]",Category,Country,SubCategory,SmilarWeb Rank,nonsense_to_remove
0,Google Search,google.com,1 (),Computers Electronics and Technology,United States,Search Engines,1,()
1,YouTube,youtube.com,2 (),Arts & Entertainment,United States,TV Movies and Streaming,2,()
2,Facebook,facebook.com,3 (),Computers Electronics and Technology,United States,Social Networks and Online Communities,3,()
3,Twitter,twitter.com,4 (),Computers Electronics and Technology,United States,Social Networks and Online Communities,4,()
4,Instagram,instagram.com,5 (),Computers Electronics and Technology,United States,Social Networks and Online Communities,5,()
5,Baidu,baidu.com,6 (),Computers Electronics and Technology,China,Search Engines,6,()
6,Wikipedia,wikipedia.org,7 (),Reference Materials,United States,Dictionaries and Encyclopedias,7,()
7,Yandex,yandex.ru,8 (),Computers Electronics and Technology,Russia,Search Engines,8,()
8,Yahoo,yahoo.com,9 (),News and Media,United States,,9,()
9,xVideos,xvideos.com,10 (),Adult content,Czech Republic,,10,()


In [10]:
df.columns

Index(['Site', 'Domain Name',
       'Similarweb top 50websites ranking(As of April 1, 2022[1]', 'Category',
       'Country', 'SubCategory', 'SmilarWeb Rank', 'nonsense_to_remove'],
      dtype='object')

In [13]:
df.columns = ['Site', 'Domain Name', 'SmilarWeb Rank', 'Category', 'SubCategory', 'Country',  ]

ValueError: Length mismatch: Expected axis has 8 elements, new values have 6 elements