In [69]:
from bs4 import BeautifulSoup
from urllib.request import urlopen, HTTPCookieProcessor, build_opener
import pandas as pd
import re

# Need to implement cookies to avoid infinite redirect loop
# https://stackoverflow.com/questions/32569934/urlopen-returning-redirect-error-for-valid-links
from http.cookiejar import CookieJar

## Experiment

In [2]:
url = 'https://answers.microsoft.com/en-us/windows/forum/all/frequently-asked-questions-windows-10/5c0b9368-a9e8-4238-b1e4-45f4b7ed2fb9'
cj = CookieJar()
opener = build_opener(HTTPCookieProcessor(cj))
page = opener.open(url)
html = page.read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser', )

In [3]:
paragraphs = soup.find_all('p')
paragraphs[17]

<p><strong>What is Windows 10?</strong></p>

In [4]:
paragraphs[17:25]

[<p><strong>What is Windows 10?</strong></p>,
 <p><span>Windows 10 is the name for Microsoft's next generation client operating system.
 </span></p>,
 <p>
 </p>,
 <p><strong>What happened to Windows 9?</strong></p>,
 <p><span>There was never a Windows 9, Microsoft skipped this number and went to 10 instead. Company officials decided to choose 10 because they believe this release is a 'Perfect 10' and to also signify this is the last major release of Windows. There will
  still be revisions of Windows 10 called Feature Updates, just don't expect there to be a Windows 11, Windows 12 etc. Initially revealed at BUILD 2014 as a minor update called ‘Threshold’, Windows 10 was originally intended be a minor Update 2 for Windows 8.1.
  Somewhere along the way, plans changed and the operating system evolved into the major revision we have today.</span></p>,
 <p>
 </p>,
 <p><strong>When will Windows 10 officially become available? </strong></p>,
 <p>Microsoft launched Windows 10 world wide on <s

In [26]:
bad_questions = [
    'How satisfied are you with this comment?', 
    'How satisfied are you with this article?'
]

In [27]:
def question_filter(p:str) -> bool:
    '''
    questions must exist, end with a '?', and not be in the list of 
    bad questions.
    '''
    return bool(p.text) and p.text[-1] == '?' and all([s not in p for s in bad_questions])

In [28]:
questions = [
    (i, paragraph.text )
    for i, paragraph in enumerate(paragraphs) 
    if question_filter(paragraph)
]

In [31]:
Qs = pd.DataFrame(questions, columns = ['idx', 'question'])
Qs

Unnamed: 0,idx,question
0,17,What is Windows 10?
1,20,What happened to Windows 9?
2,32,What is the build number of the public release?
3,38,How do I know which build of Windows 10 I have...
4,46,What does RTM mean?
...,...,...
68,466,I did a clean install of Windows 10 but cannot...
69,469,How does Windows 10 handle privacy?
70,481,Where can I get more information about Windows...
71,542,Q: will the DVD-upgrade-version in fact be a ...


In [32]:
Qs.head(30)

Unnamed: 0,idx,question
0,17,What is Windows 10?
1,20,What happened to Windows 9?
2,32,What is the build number of the public release?
3,38,How do I know which build of Windows 10 I have...
4,46,What does RTM mean?
5,49,Does Windows 10 require a Touch Screen?
6,52,Is the Start Screen still available in Windows...
7,55,Have the touch features been removed from Wind...
8,58,Will I need to be connected to the Internet in...
9,80,I am currently dual booting Windows and Linux;...


In [44]:
Qs['answer_idx_range'] = [
    (row.idx+1, Qs.loc[i+1, 'idx']) 
    if i < Qs.shape[0] -1 
    else (row.idx+1, -1)
    for i, row in Qs.iterrows() 
]

In [45]:
Qs

Unnamed: 0,idx,question,answer_idx_range
0,17,What is Windows 10?,"(18, 20)"
1,20,What happened to Windows 9?,"(21, 32)"
2,32,What is the build number of the public release?,"(33, 38)"
3,38,How do I know which build of Windows 10 I have...,"(39, 46)"
4,46,What does RTM mean?,"(47, 49)"
...,...,...,...
68,466,I did a clean install of Windows 10 but cannot...,"(467, 469)"
69,469,How does Windows 10 handle privacy?,"(470, 481)"
70,481,Where can I get more information about Windows...,"(482, 542)"
71,542,Q: will the DVD-upgrade-version in fact be a ...,"(543, 595)"


In [51]:
Qs['answer'] = [
    ' '.join(p.text for p in paragraphs[start:stop])
    for i, (start, stop) in Qs.answer_idx_range.iteritems()
]

In [55]:
Qs.drop(columns = ['idx', 'answer_idx_range']).to_csv('../data/scraped_QAs/microsoft_faq_1.csv')

In [56]:
def scrape_site(url:str):
    cj = CookieJar()
    opener = build_opener(HTTPCookieProcessor(cj))
    page = opener.open(url)
    html = page.read().decode('utf-8')
    return BeautifulSoup(html, 'html.parser', )

In [76]:
def remove_whitespace(s:str) -> str:
    return s.replace('\n', ' ').replace('\r', ' ').strip()
    
def find_questions_answers(soup) -> pd.DataFrame:
    paragraphs = soup.find_all('p')
    questions = [
        (i, remove_whitespace(paragraph.text))
        for i, paragraph in enumerate(paragraphs) 
        if question_filter(paragraph)
    ]

    QA = pd.DataFrame(questions, columns = ['idx', 'question'])
    QA['answer_idx_range'] = [
        (row.idx+1, QA.loc[i+1, 'idx']) 
        if i < QA.shape[0] -1 
        else (row.idx+1, -1)
        for i, row in QA.iterrows() 
    ]
    QA['answer'] = [
        ' '.join(remove_whitespace(p.text) for p in paragraphs[start:stop])
        for i, (start, stop) in QA.answer_idx_range.iteritems()
    ]
    return QA.drop(columns = ['idx', 'answer_idx_range'])

In [80]:
urls = [
    'https://answers.microsoft.com/en-us/windows/forum/all/frequently-asked-questions-windows-10/5c0b9368-a9e8-4238-b1e4-45f4b7ed2fb9',
    'https://answers.microsoft.com/en-us/windows/forum/all/frequently-asked-questions-windows-10-part-2/ef6cdc30-ee04-4a0e-ae13-03fa41350452',
    'https://answers.microsoft.com/en-us/insider/forum/all/frequently-asked-questions-windows-10-anniversary/c3778392-8fcf-4e32-b652-d8e187cc48e0'
]
dfs = []
for url in urls:
    soup = scrape_site(url)
    dfs.append(find_questions_answers(soup))

QA = pd.concat(dfs)


In [81]:
QA

Unnamed: 0,question,answer
0,What is Windows 10?,Windows 10 is the name for Microsoft's next ge...
1,What happened to Windows 9?,"There was never a Windows 9, Microsoft skipped..."
2,What is the build number of the public release?,Build 10240 Build 10586 Build 14393 Build 1506...
3,How do I know which build of Windows 10 I have...,Resource: How to Verify If You Downloaded o...
4,What does RTM mean?,RTM means Release to Manufacturing; which is a...
...,...,...
32,Can I dual boot the Anniversary Update and pr...,Yes: How to: setup a dual boot configuration...
33,Suppose I do not like the Windows 10 Anniversa...,How to: Rollback to a previous version of Wi...
34,How can I submit feedback about Windows 10 160...,Click Start then type Feedback then hit Enter....
35,What are some of the key new features coming i...,This is just the tip of the iceberg. Micro...


In [82]:
1799*1.0475

1884.4525

In [84]:
(1799 + 85 + 45 + 70 +23 + 3) * 1.0475

2121.1875