In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen, HTTPCookieProcessor, build_opener
import pandas as pd
import re

# Need to implement cookies to avoid infinite redirect loop
# https://stackoverflow.com/questions/32569934/urlopen-returning-redirect-error-for-valid-links
from http.cookiejar import CookieJar
pd.options.display.max_colwidth = 200

## Experiment

In [3]:
url = 'https://answers.microsoft.com/en-us/windows/forum/all/frequently-asked-questions-windows-10/5c0b9368-a9e8-4238-b1e4-45f4b7ed2fb9'
cj = CookieJar()
opener = build_opener(HTTPCookieProcessor(cj))
page = opener.open(url)
html = page.read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser', )

In [4]:
paragraphs = soup.find_all('p')
paragraphs[17]

<p><strong>What is Windows 10?</strong></p>

In [5]:
paragraphs[17:25]

[<p><strong>What is Windows 10?</strong></p>,
 <p><span>Windows 10 is the name for Microsoft's next generation client operating system.
 </span></p>,
 <p>
 </p>,
 <p><strong>What happened to Windows 9?</strong></p>,
 <p><span>There was never a Windows 9, Microsoft skipped this number and went to 10 instead. Company officials decided to choose 10 because they believe this release is a 'Perfect 10' and to also signify this is the last major release of Windows. There will
  still be revisions of Windows 10 called Feature Updates, just don't expect there to be a Windows 11, Windows 12 etc. Initially revealed at BUILD 2014 as a minor update called ‘Threshold’, Windows 10 was originally intended be a minor Update 2 for Windows 8.1.
  Somewhere along the way, plans changed and the operating system evolved into the major revision we have today.</span></p>,
 <p>
 </p>,
 <p><strong>When will Windows 10 officially become available? </strong></p>,
 <p>Microsoft launched Windows 10 world wide on <s

In [6]:
bad_questions = [
    'How satisfied are you with this comment?', 
    'How satisfied are you with this article?'
]

In [7]:
def question_filter(p:str) -> bool:
    '''
    questions must exist, end with a '?', and not be in the list of 
    bad questions.
    '''
    return bool(p.text) and p.text[-1] == '?' and all([s not in p for s in bad_questions])

In [8]:
questions = [
    (i, paragraph.text )
    for i, paragraph in enumerate(paragraphs) 
    if question_filter(paragraph)
]

In [9]:
Qs = pd.DataFrame(questions, columns = ['idx', 'question'])
Qs

Unnamed: 0,idx,question
0,17,What is Windows 10?
1,20,What happened to Windows 9?
2,32,What is the build number of the public release?
3,38,How do I know which build of Windows 10 I have...
4,46,What does RTM mean?
...,...,...
68,466,I did a clean install of Windows 10 but cannot...
69,469,How does Windows 10 handle privacy?
70,481,Where can I get more information about Windows...
71,542,Q: will the DVD-upgrade-version in fact be a ...


In [10]:
Qs.head(30)

Unnamed: 0,idx,question
0,17,What is Windows 10?
1,20,What happened to Windows 9?
2,32,What is the build number of the public release?
3,38,How do I know which build of Windows 10 I have...
4,46,What does RTM mean?
5,49,Does Windows 10 require a Touch Screen?
6,52,Is the Start Screen still available in Windows...
7,55,Have the touch features been removed from Wind...
8,58,Will I need to be connected to the Internet in...
9,80,I am currently dual booting Windows and Linux;...


In [11]:
Qs['answer_idx_range'] = [
    (row.idx+1, Qs.loc[i+1, 'idx']) 
    if i < Qs.shape[0] -1 
    else (row.idx+1, -1)
    for i, row in Qs.iterrows() 
]

In [12]:
Qs

Unnamed: 0,idx,question,answer_idx_range
0,17,What is Windows 10?,"(18, 20)"
1,20,What happened to Windows 9?,"(21, 32)"
2,32,What is the build number of the public release?,"(33, 38)"
3,38,How do I know which build of Windows 10 I have...,"(39, 46)"
4,46,What does RTM mean?,"(47, 49)"
...,...,...,...
68,466,I did a clean install of Windows 10 but cannot...,"(467, 469)"
69,469,How does Windows 10 handle privacy?,"(470, 481)"
70,481,Where can I get more information about Windows...,"(482, 542)"
71,542,Q: will the DVD-upgrade-version in fact be a ...,"(543, 595)"


In [13]:
Qs['answer'] = [
    ' '.join(p.text for p in paragraphs[start:stop])
    for i, (start, stop) in Qs.answer_idx_range.iteritems()
]

In [14]:
Qs.drop(columns = ['idx', 'answer_idx_range']).to_csv('../data/scraped_QAs/microsoft_faq_1.csv')

In [15]:
def scrape_site(url:str):
    cj = CookieJar()
    opener = build_opener(HTTPCookieProcessor(cj))
    page = opener.open(url)
    html = page.read().decode('utf-8')
    return BeautifulSoup(html, 'html.parser', )

In [16]:
def remove_whitespace(s:str) -> str:
    return s.replace('\n', ' ').replace('\r', ' ').strip()
    
def find_questions_answers(soup) -> pd.DataFrame:
    paragraphs = soup.find_all('p')
    questions = [
        (i, remove_whitespace(paragraph.text))
        for i, paragraph in enumerate(paragraphs) 
        if question_filter(paragraph)
    ]

    QA = pd.DataFrame(questions, columns = ['idx', 'question'])
    QA['answer_idx_range'] = [
        (row.idx+1, QA.loc[i+1, 'idx']) 
        if i < QA.shape[0] -1 
        else (row.idx+1, -1)
        for i, row in QA.iterrows() 
    ]
    QA['answer'] = [
        ' '.join(remove_whitespace(p.text) for p in paragraphs[start:stop])
        for i, (start, stop) in QA.answer_idx_range.iteritems()
    ]
    return QA.drop(columns = ['idx', 'answer_idx_range'])

In [36]:
urls = [
    'https://answers.microsoft.com/en-us/windows/forum/all/frequently-asked-questions-windows-10/5c0b9368-a9e8-4238-b1e4-45f4b7ed2fb9',
    'https://answers.microsoft.com/en-us/windows/forum/all/frequently-asked-questions-windows-10-part-2/ef6cdc30-ee04-4a0e-ae13-03fa41350452',
    'https://answers.microsoft.com/en-us/insider/forum/all/frequently-asked-questions-windows-10-anniversary/c3778392-8fcf-4e32-b652-d8e187cc48e0'
]
dfs = []
for url in urls:
    soup = scrape_site(url)
    dfs.append(find_questions_answers(soup))

QA = pd.concat(dfs)


In [37]:
QA = QA.reset_index(drop = True)

In [39]:
# Initialize the labeling
# QA['intent'] = ''
# i = 0

In [2]:
# To restart the labeling Session
QA = pd.read_csv('../data/scraped_QAs/windows_faq.csv', index_col = 0)
QA.head()
i = 84

In [104]:
QA.shape

(149, 3)

In [3]:
intents = [
    'general',
    'change_settings',
    'navigation',
    'system_info',
    'terminology',
    'requirements',
    'dual_boot',
    'system_upgrade',
    'software_updates',
    'installation',
    'program_compatability',
    'windows_phone',
    'product_keys'
]

intents_hierarchy = {
    'microsoft_policies': {
        'product_keys':{
            'product_key_cost':{}
        },
        'privacy':{},
        'general': {},
        'windows_editions': {},
        'windows_10_builds': {
            'current_build':{},
            'known_problems': {}
            
        },
        'windows_updates_cost': {},
        'terminology': {}
        
    },
    'getting_windows_10': {
        'hardware_compatability': {
            'hardware_minimum_requirements': {},
        },
        'installation_process': {
            'installation_requirements': {},
            'system_upgrade': {
                'upgrade_status': {},
            },
            'upgrading_incompatable_devices': {},
            'easy_upgrade': {},
            'dvd_upgrade': {},
            'monitor_install_progress':{},
            'clean_install': {}
        },
        'installation_errors': {
            ''
        }
    },
    'using_windows_10': {
        'software_updates': {
            'perform_software_update':{},
            'software_updates_frequency': {},
            'driver_updates': {},
        },
        'changing_settings': {
            'wifi_settings':{},
            'bluetooth_settings':{},
            'cortana_settings':{},
            'theme_settings':{},
            'dock_settings': {},

        },
        'navigation': {
            'accessing_currently_open_apps': {},
            'installing_apps': {},
            'desktop_features': {
                'desktop_shortcuts': {}
            }
        },
        'get_system_info': {
            'get_current_build': {},
            
        },
        'dual_booting': {},
        'program_compatability': {
        },
        
    },
    'windows_phone': {},
    'out_out_scope': {
        'windows_10_launch': {},
        'reserve_windows_10': {},
        'computer_purchases': {}
    }
}

In [103]:
# Print Current
i += 1
QA.loc[i]

question                                                                          Microsoft says Windows 10 Pro installations can be upgraded to Windows 10 Enterprise E3 or E5 within 2 minutes. How is this done?
answer      A license is assigned to a user through Azure Active Directory. When the user signs in, the upgrade is performed on the fly. This can also be done presently for standard Windows 10 Pro and Enterpr...
intent                                                                                                                                                                                                          NaN
Name: 131, dtype: object

In [102]:
# Set current
QA.loc[i, 'intent'] = 'windows_editions' 

In [82]:
# Sanity check of current
print(i)

122


In [80]:
QA.to_csv('../data/scraped_QAs/windows_faq.csv')

In [70]:
QA.loc[i, 'answer']

'Since the introduction of Windows Update as part of Windows, Microsoft has delivered updates as way to keep Windows updated and secure. This has always been the way the company keeps commercial versions of Windows on the market current.  With Windows 10, Microsoft is making significant changes to this model by delivering new methods of servicing Windows. Taking into account we now live in a mobile, constantly connected world, where threats happen all the time. Part of the solution to this   is to keep users current by providing the latest updates\xa0as soon\xa0as they are\xa0available in addition to delivering features as soon as they ready too.  Microsoft will do this using three types of service branches for Windows 10: Service Branch   Options   Edition     Current Branch    Security Updates, Features and Fixes are automatically applied.There is no option to delay or customize these updates.    Windows 10 Home     Current Branch for Business (CBB)    CBB includes the requirements o