<a href="https://colab.research.google.com/github/akamalas5/Capstone/blob/main/navigating_wiki_categories.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so later cells can read and write the
# project CSV files stored there (Colab-only; this import fails elsewhere).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import re


In [None]:
import requests

def getPages(category='Brakes', max_depth=None):
    """Collect the members of a Wikipedia category and of its subcategories.

    The category tree is walked breadth-first through the MediaWiki
    ``list=categorymembers`` API, starting at ``Category:<category>``.
    Every category is requested only once, so cycles in the category graph
    cannot cause endless traversal, and result batches are followed through
    the API's ``continue`` mechanism so large categories are not truncated.

    Parameters
    ----------
    category : str
        Name of the starting category, without the ``Category:`` prefix.
    max_depth : int or None
        Number of subcategory levels to descend below the starting category.
        ``None`` (default) means no limit; ``0`` returns only the direct
        members of the starting category.

    Returns
    -------
    list of dict
        The member records exactly as returned by the API (article pages as
        well as subcategory entries), each ``pageid`` at most once, in the
        order they were discovered.

    Raises
    ------
    RuntimeError
        If the API response carries an ``error`` payload.
    """
    URL = "https://en.wikipedia.org/w/api.php"
    CATEGORY_PREFIX = "Category:"
    CATEGORY_NS = 14  # namespace number the API reports for category pages

    pages = []
    seen_pageids = set()
    visited_categories = {category}
    # Work list of (category name, depth); appended to while being consumed.
    pending = [(category, 0)]
    position = 0

    with requests.Session() as session:
        while position < len(pending):
            current, depth = pending[position]
            position += 1

            params = {
                "action": "query",
                "list": "categorymembers",
                "cmtitle": CATEGORY_PREFIX + current,
                "cmlimit": "200",
                "format": "json",
            }

            while True:
                response = session.get(url=URL, params=params, timeout=30)
                response.raise_for_status()
                data = response.json()
                if 'error' in data:
                    raise RuntimeError(
                        "MediaWiki API error for category %r: %s"
                        % (current, data['error'])
                    )

                for member in data['query']['categorymembers']:
                    pageid = member.get('pageid')
                    if pageid in seen_pageids:
                        # The same page can be listed in several categories.
                        continue
                    seen_pageids.add(pageid)
                    pages.append(member)

                    title = member.get('title', '')
                    is_category = (
                        title.startswith(CATEGORY_PREFIX)
                        and member.get('ns', CATEGORY_NS) == CATEGORY_NS
                    )
                    if not is_category:
                        continue

                    # Strip only the prefix: names may contain further colons.
                    subcategory = title[len(CATEGORY_PREFIX):]
                    within_depth = max_depth is None or depth < max_depth
                    if within_depth and subcategory not in visited_categories:
                        visited_categories.add(subcategory)
                        pending.append((subcategory, depth + 1))

                if 'continue' not in data:
                    break
                # Merge the continuation tokens into the request to fetch the
                # next batch of members for the same category.
                params.update(data['continue'])

    return pages

In [None]:
from collections import defaultdict
from tqdm import tqdm
from bs4 import BeautifulSoup

# traverse pages for given categories
page_list = getPages()
wiki_text = defaultdict(list)

# One shared session reuses the HTTP connection across the per-page requests.
with requests.Session() as session:
    for page in tqdm(page_list):
        wiki_url = 'https://en.wikipedia.org/?curid=' + str(page['pageid'])
        response = session.get(wiki_url, timeout=30)
        if not response.ok:
            # Skip failed downloads instead of parsing an error page as article text.
            tqdm.write('Skipping %s (HTTP %s)' % (wiki_url, response.status_code))
            continue

        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        html = BeautifulSoup(response.text, 'html.parser')

        # Keep every non-trivial <p> element as one row of raw text.
        for para in html.find_all('p'):
            if len(para.text) > 1:
                wiki_text['wiki_text'].append(para.text)

print('Done. Extracting table links..:', len(wiki_text['wiki_text']))
rqt_df = pd.DataFrame(wiki_text)
rqt_df.head(), len(rqt_df)

100%|██████████| 123/123 [00:38<00:00,  3.19it/s]

Done. Extracting table links..: 2877





(                                           wiki_text
 0  A brake is a mechanical device that inhibits m...
 1  Most brakes commonly use friction between two ...
 2  Brakes are generally applied to rotating axles...
 3  Since kinetic energy increases quadratically w...
 4  Almost all wheeled vehicles have a brake of so..., 2877)

In [None]:
# function to preprocess rqt_text
def clean(text):
    """Normalise one scraped paragraph into plain prose.

    The input is converted with ``str()`` first, so non-string values
    (e.g. NaN) are accepted and come back as their string form.

    Steps, in order:
      * drop numeric citation markers such as ``[1]``;
      * drop a paragraph number at the very start (``"12. "``);
      * turn line breaks into single spaces;
      * drop possessive ``'s``;
      * replace hyphens with spaces and drop em dashes followed by a space;
      * drop double quotes, brackets, parentheses, ``*`` and ``?``;
      * turn non-breaking spaces (entity or character) into plain spaces;
      * collapse runs of spaces and trim both ends.

    Numbers inside the running text are kept.

    Parameters
    ----------
    text : object
        Raw paragraph text.

    Returns
    -------
    str
        The cleaned paragraph.
    """
    text = str(text)

    # removing citation markers, e.g. "friction.[1]"
    text = re.sub(r'\[[0-9]+\]', '', text)

    # removing a paragraph number at the start of the text ("3. Foo");
    # the dot is escaped so ordinary numbers in the prose are left alone
    text = re.sub(r'^\s*[0-9]+\.\s+', '', text)

    # replacing line breaks by a space so adjacent words are not glued together
    text = re.sub(r'\s*\n\s*', ' ', text)

    # removing possessive 's
    text = text.replace("'s", '')

    # removing hyphens and dashes
    text = text.replace('-', ' ')
    text = text.replace('— ', '')

    # removing quotation marks
    text = text.replace('"', '')

    # removing brackets, parentheses, asterisks and question marks
    text = re.sub(r'[()\[\]*?]', '', text)

    # replacing non-breaking spaces (HTML entity, its remnant, or the character)
    text = text.replace('&nbsp;', ' ').replace('nbsp;', ' ').replace('\xa0', ' ')

    # collapsing any run of spaces into one
    text = re.sub(r' {2,}', ' ', text)

    return text.strip()



In [None]:

# clean every scraped paragraph, then show the first cleaned row as a spot check
rqt_df['text_clean'] = rqt_df['wiki_text'].map(clean)
rqt_df.at[0, 'text_clean']

'A brake is a mechanical device that inhibits motion by absorbing energy from a moving system. It is used for slowing or stopping a moving vehicle, wheel, axle, or to prevent its motion, most often accomplished by means of friction.'

In [None]:
# Persist the raw and cleaned paragraphs; this CSV is the input handed to the
# coreference-resolution step. The DataFrame index is written too (to_csv
# default), which shows up as an 'Unnamed: 0' column when the file is read back.
# NOTE(review): the path is relative — presumably the working directory is /content; confirm.
rqt_df.to_csv('drive/MyDrive/Capstone/data/wiki_brake_all_pages.csv') # -> to be sent for coref resolution

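### Coreference Resolution

Coreference resolution is run outside this notebook on the CSV saved above; the cells below load its output and split it into sentences.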

In [None]:
# Load the coreference-resolved paragraphs. This is not the file written earlier
# in this notebook (note the `_with_coref_df` suffix); no cell here creates it,
# so it must already exist on the mounted drive before this cell is run.
req_df = pd.read_csv('drive/MyDrive/Capstone/data/wiki_brake_all_pages_with_coref_df.csv')

In [None]:
len(req_df)

2877

In [None]:
# reading coref resolved file (in other environment)
#rqt_coref_df = pd.read_excel('../data/wiki_brake_full_coref_df.xlsx')
req_df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'wiki_text', 'text_clean',
       'tex_coref_resolved'],
      dtype='object')

In [None]:
def sentences(text):
    """Split text into sentence fragments at full stops and question marks.

    A full stop that sits between two digits (a decimal point such as the
    one in ``1.5``) is not treated as a sentence boundary. The delimiters
    are removed; fragments are returned untrimmed, and empty fragments
    (for example after a trailing full stop) are kept, so callers that need
    clean sentences must strip and filter them.

    Parameters
    ----------
    text : object
        Text to split; converted with ``str()`` first.

    Returns
    -------
    list of str
        The fragments in their original order.
    """
    # "." splits unless it has a digit on both sides; "?" always splits.
    return re.split(r'\.(?![0-9])|(?<![0-9])\.|\?', str(text))

# split each coreference-resolved paragraph (produced offline) into a list of sentence fragments
req_df['sent'] = req_df['tex_coref_resolved'].map(sentences)

In [None]:
# Drop rows with missing values and renumber the index from 0.
req_df = req_df.dropna().reset_index(drop=True)
# Discard the leftover index columns from earlier CSV round-trips.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
req_df = req_df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
req_df.head()

Unnamed: 0,wiki_text,text_clean,tex_coref_resolved,sent
0,A brake is a mechanical device that inhibits m...,A brake is a mechanical device that inhibits m...,A brake is a mechanical device that inhibits m...,[A brake is a mechanical device that inhibits ...
1,Most brakes commonly use friction between two ...,Most brakes commonly use friction between two ...,Most brakes commonly use friction between two ...,[Most brakes commonly use friction between two...
2,Brakes are generally applied to rotating axles...,Brakes are generally applied to rotating axles...,Brakes are generally applied to rotating axles...,[Brakes are generally applied to rotating axle...
3,Since kinetic energy increases quadratically w...,Since kinetic energy increases quadratically w...,Since kinetic energy increases quadratically w...,[Since kinetic energy increases quadratically ...
4,Almost all wheeled vehicles have a brake of so...,Almost all wheeled vehicles have a brake of so...,Almost all wheeled vehicles have a brake of so...,[Almost all wheeled vehicles have a brake of s...


In [None]:
# create a dataframe containing sentences: one row per non-empty sentence
# fragment, with its whitespace-delimited word count
row_list = []

for sent_list in req_df['sent']:
    for sent in sent_list:
        wordcount = len(sent.split())
        # fragments that are empty or whitespace-only have no words; skip them
        if wordcount > 0:
            row_list.append({'Sent': sent.strip(), 'Len': wordcount})

# Passing the columns explicitly keeps the 'Sent'/'Len' schema even when no
# sentence was found and row_list is empty.
rqt_df2 = pd.DataFrame(row_list, columns=['Sent', 'Len'])

In [None]:
rqt_df2.head()

Unnamed: 0,Sent,Len
0,A brake is a mechanical device that inhibits m...,16
1,A brake is used for slowing or stopping a movi...,26
2,Most brakes commonly use friction between two ...,30
3,"For example, regenerative braking converts muc...",19
4,Other methods convert kinetic energy into pote...,18


In [None]:
len(rqt_df2)

9569

In [None]:
# Persist the sentence-level table (columns 'Sent' and 'Len'). The DataFrame
# index is written as well (to_csv default) and becomes an unnamed first column.
rqt_df2.to_csv('drive/MyDrive/Capstone/data/wiki_brake_all_sents_with_coref_df.csv')  