In [214]:
import json
import os
import sys
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [148]:
# Entry page for the scrape: its label list links to every topic page.
cci_root_url = "https://spaces.charlotte.edu/display/ACCI"

In [141]:
def page_available(url: str, timeout: float = 10.0) -> int:
    """Request ``url`` with an HTTP GET and return the response status code.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. ``requests`` applies no timeout by default, so
            without this a stalled connection would block the notebook
            indefinitely.

    Returns:
        The integer HTTP status code of the response.
    """
    return requests.get(url, timeout=timeout).status_code

In [142]:
def create_soup(url: str, timeout: float = 10.0) -> "tuple[str, BeautifulSoup]":
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error (``requests`` itself applies no default timeout).

    Returns:
        A ``(data, soup)`` pair: the raw response text and the
        ``BeautifulSoup`` tree built from it with the ``'html.parser'`` backend.
    """
    data = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(data, 'html.parser')
    return data, soup

In [143]:
# Only parse the root page when it answers with HTTP 200; otherwise stop
# with a visible reason instead of exiting silently.
root_status = page_available(cci_root_url)
if root_status == 200:
    data, soup = create_soup(cci_root_url)
else:
    # A bare sys.exit() reports success (exit status 0) and prints nothing;
    # passing a message prints it to stderr and exits with status 1.
    sys.exit(f"Cannot scrape {cci_root_url}: HTTP status {root_status}")

In [180]:
# root url for appending to qa links since they do not use the same link structure as the base-cci webpage
root_url = 'https://spaces.charlotte.edu'
# maps topic label text -> {index: {'question', 'answer', 'question_link'}}
topics = {}
# loop through each topic list on the root page
for label_list in soup.find_all('ul', {'class': 'labelList'}):
    # find respective link for each topic
    for topic_anchor in label_list.find_all('a'):
        topic_href = topic_anchor.get('href')
        if not topic_href:
            # anchor without a target: there is no topic page to follow
            continue
        # dict of question answer pairs
        qa = {}
        # urljoin handles site-relative and absolute hrefs alike, where plain
        # string concatenation would produce a broken URL for the latter
        sub_link = urljoin(root_url, topic_href)
        # create soup object for each list of questions page (raw text is not needed)
        sub_soup = create_soup(sub_link)[1]
        # loop through each question and store the link, question, and answer
        for idx, list_item in enumerate(sub_soup.find_all('div', {'class': 'details'})):
            question_anchor = list_item.find('a')
            if question_anchor is None or not question_anchor.get('href'):
                # a details block without a link has no question page to fetch
                continue
            question_link = urljoin(root_url, question_anchor.get('href'))
            question = question_anchor.get_text()
            # create soup object for individual questions
            question_soup = create_soup(question_link)[1]
            main_content = question_soup.find('div', {'id': 'main-content'})
            if main_content is None:
                # e.g. an error or login page: skip it rather than abort the whole scrape
                print(f'skipping {question_link}: no main-content div found')
                continue
            # returns answer and keeps the proper html formatting
            answer = main_content.decode_contents()
            qa[idx] = {
                'question': question,
                'answer': answer,
                'question_link': question_link
            }
        topics[topic_anchor.get_text()] = qa

In [181]:
# Show the topic labels collected by the scrape (the keys of `topics`).
topics.keys()

dict_keys(['advanced-standing', 'application', 'broaderimpacts', 'cci-research-office', 'change', 'committee', 'concentration', 'course-substitution', 'covid-19', 'degreeworks', 'dropbox', 'early-entry', 'fellowships', 'file-list', 'first-semester', 'gpetition', 'grades', 'graduation', 'gre', 'internship', 'ip', 'jobs', 'jr-mentoring', 'late', 'nsf-career', 'paper', 'qualifying-exam', 'register', 'research', 'transfer-credits', 'ug-research', 'z-coming-soon'])

In [188]:
# Spot-check the question/answer entries stored for a single topic.
topics['advanced-standing']

{0: {'question': "What's different about transfer credits from Advanced Standing?",
  'answer': '\n<p>With advanced standing,</p><ul><li>You must have a master\'s degree in Computer Science, Software Information Systems, or Computer Engineering.</li><li>Advanced standing applies only to the Computer Science or Software Information Systems Track only.</li><li>You cannot transfer any credits</li><li>However, you only have 42 credits left to go on your degree.\xa0 Which is 30 credits less than the normal 72 credits</li><li>Other than declaring the advanced standing during the application process, no other actions are needed.</li><li>You must complete your qualifying exam after your first year in the program.</li></ul><p>With transfer credits,</p><ul><li>You can transfer up to 30 credits hours</li><li>You must file paperwork\xa0→ see\xa0<a href="/pages/viewpage.action?pageId=41911723">I want to transfer coursework from my MS into my PhD program, what do I do?</a></li></ul>\n',
  'question_

In [216]:
# Persist the scraped topics as pretty-printed JSON in the current working directory.
output_path = os.path.join(os.getcwd(), 'cci-scrape.json')
with open(output_path, 'w') as json_file:
    json.dump(topics, json_file, indent=4)

In [200]:
# Empty frame with one column per field; the values are filled in afterwards.
column_names = ['topic', 'question', 'answer', 'link']
df = pd.DataFrame(columns=column_names)

In [195]:
# Flatten the nested topics dict into four parallel lists, one item per question.
topic, question, answer, link = [], [], [], []

for topic_name, entries in topics.items():
    for entry in entries.values():
        question.append(entry['question'])
        answer.append(entry['answer'])
        link.append(entry['question_link'])
        topic.append(topic_name)

In [201]:
# Copy each parallel list into the frame column of the same name.
column_values = {
    'topic': topic,
    'question': question,
    'answer': answer,
    'link': link,
}
for column_name, values in column_values.items():
    df[column_name] = values

In [209]:
# Summary statistics per column of the flattened frame.
df.describe()

Unnamed: 0,topic,question,answer,link
count,75,75,75,75
unique,31,69,69,69
top,early-entry,What's different about transfer credits from A...,"\n<p>With advanced standing,</p><ul><li>You mu...",https://spaces.charlotte.edu/pages/viewpage.ac...
freq,10,2,2,2


In [210]:
# Preview the first rows of the flattened frame.
df.head()

Unnamed: 0,topic,question,answer,link
0,advanced-standing,What's different about transfer credits from A...,"\n<p>With advanced standing,</p><ul><li>You mu...",https://spaces.charlotte.edu/pages/viewpage.ac...
1,application,I am having a lot of problems with my applicat...,"\n<p>For all application questions, please</p>...",https://spaces.charlotte.edu/pages/viewpage.ac...
2,application,I'm interested in applying to CIS PhD program....,\n<p>Admission is competitive. Preference is g...,https://spaces.charlotte.edu/pages/viewpage.ac...
3,broaderimpacts,What are some activities to consider for broad...,"\n<p>Refer to <a class=""external-link"" href=""h...",https://spaces.charlotte.edu/pages/viewpage.ac...
4,cci-research-office,I need help with my external funding. Who sho...,"\n<div class=""table-wrap""><table class=""relati...",https://spaces.charlotte.edu/pages/viewpage.ac...
