# Scrape Data From Wikipedia Page: Wikipedia:Size of Wikipedia

### Author: <font color='red'> Ben Lenox</font>

In [80]:
import urllib.request
from bs4 import BeautifulSoup
import ssl
# NOTE(review): this disables TLS certificate verification for every HTTPS
# request made through the default context in this process, which leaves the
# download open to man-in-the-middle tampering. It is kept here to preserve
# existing behavior; prefer fixing the local CA certificates and removing it.
ssl._create_default_https_context = ssl._create_unverified_context
url = "https://en.wikipedia.org/wiki/Wikipedia:Size_of_Wikipedia"

# Fetch the page and parse it; the `with` block closes the HTTP response
# once BeautifulSoup has consumed the document.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, 'html.parser')

In [86]:
from datetime import datetime, timedelta
import re

# Select first wikitable on page
wikipedia_growthrate = soup.find_all('table', class_='wikitable')[0]

# Doubling-time cells read "N years, M days", "M days", or "N years".
# Raw string so that "\d" is a regex token rather than an invalid string
# escape; compiled once here instead of on every table row.
DOUBLING_TIME_PATTERN = re.compile(r"((\d+) years?, )?(\d+) days?|(\d+) years?")


def _parse_doubling_time(text):
    """Convert a doubling-time cell such as "1 year, 120 days" to a timedelta.

    Non-breaking spaces are treated as ordinary spaces and a year is counted
    as 365 days.

    Raises:
        ValueError: if *text* contains no recognizable duration.
    """
    match = DOUBLING_TIME_PATTERN.search(text.replace("\xa0", " "))
    if match is None:
        raise ValueError("Unrecognized doubling time: {!r}".format(text))
    _, years, days, years_only = match.groups()
    if years_only:
        # Second alternative of the pattern: a bare "N years" with no days part
        return timedelta(days=365 * int(years_only))
    total_days = int(days)
    if years:
        total_days += 365 * int(years)
    return timedelta(days=total_days)


dates, article_count, article_increase, percent_increase, doubling_time, avg_increase_per_day = [], [], [], [], [], []
for row in wikipedia_growthrate.find_all('tr'):
    data = row.find_all('td')

    # Skip rows that do not have exactly six <td> cells, and rows where any
    # cell is just an em-dash placeholder. Cell text is stripped first so
    # surrounding whitespace/newlines cannot hide the placeholder.
    if len(data) != 6 or any(x.text.strip() == "—" for x in data):
        continue

    # converts date data from table to datetime and adds it to list
    dates.append(datetime.strptime(data[0].text.strip(), "%Y-%m-%d"))

    # converts article count data from table to int and adds it to list
    article_count.append(int(data[1].text.strip().replace(",", "")))

    # converts article increase data from table to int and adds it to list
    # (only the first text node of the cell is used)
    article_increase.append(int(data[2].find(string=True).strip().replace(",", "")))

    # converts percentage data from table to float and adds it to list
    percent_increase.append(float(data[3].find(string=True).replace("%", "")))

    # converts string of double time to timedelta object and adds it to list
    doubling_time.append(_parse_doubling_time(data[4].text))

    # converts average increase/day data to int and adds it to list;
    # thousands separators are removed like in the other integer columns
    avg_increase_per_day.append(int(data[5].find(string=True).strip().replace(",", "")))
        

In [87]:
import pandas as pd

# Assemble the scraped lists into one table, one column per list
df = pd.DataFrame(data={
    'Date': dates,
    'Article Count': article_count,
    'Article Increase': article_increase,
    'Percent Increase': percent_increase,
    'Doubling Time': doubling_time,
    'Avg Increase in Articles per Day': avg_increase_per_day,
})

# The assignment asked for at least the first 10 rows; the whole data set is
# only 20 rows, so all of it is printed.
print(df)

         Date  Article Count  Article Increase  Percent Increase  \
0  2003-01-01          96500             76800            390.00   
1  2004-01-01         188800             92300             96.00   
2  2005-01-01         438500            249700            132.00   
3  2006-01-01         895000            456500            104.00   
4  2007-01-01        1560000            665000             74.00   
5  2008-01-01        2153000            593000             38.00   
6  2009-01-01        2679000            526000             24.00   
7  2010-01-01        3144000            465000             17.00   
8  2011-01-01        3518000            374000             12.00   
9  2012-01-01        3835000            317000              9.00   
10 2013-01-01        4133000            298000              8.00   
11 2014-01-01        4413000            280000              7.00   
12 2015-01-01        4682000            269000              6.00   
13 2016-01-01        5045000            363000  

In [84]:
# Write the dataframe out to a CSV file
df.to_csv(path_or_buf="CSC221-webscrape-data.csv")