# Use functions for everything

In [1]:
from requests_html import HTMLSession
import pandas as pd

In [10]:
def scrape_month(programme_id, semester_id, month_id):
    payload = {
        'programme' : programme_id,
        'semester' : semester_id,
        'month' : month_id
    }
    s = session.post(url, data=payload)
    if s.status_code == 200: # A OK
        try:
            scraped_table = pd.read_html(s.html.html, header=1)[0] # Read the table into Pandas
        except ValueError:
            raise
        # Perform some formatting to get it into order
        scraped_table.drop(columns=scraped_table.columns.values.tolist()[-11:], inplace=True)
        scraped_table['Month'] = months[payload['month']]
        scraped_table['Semester'] = semesters[payload['semester']]
        # Identify papers and name columns accordingly
        papers = [
            paper.text 
            for paper in s.html.find(selector='th') 
            if 'colspan' in paper.attrs and paper.attrs['colspan'] == '6'
        ]
        types = ['Lectures_Held', 'Lectures_Attended',
                 'Tutorials_Held','Tutorials_Attended',
                 'Practicals_Held', 'Practicals_Attended',]
        headers = ['Name']
        for paper in papers:
            for attendance_type in types:
                headers.append(paper+'_'+attendance_type)     
        headers.append('Month') 
        headers.append('Semester')
        col_names = scraped_table.columns.values.tolist()
        col_rename = dict(zip(col_names,headers))
        scraped_table.rename(mapper=col_rename, axis=1,inplace=True)
        return scraped_table
    else:
        return None
    
def tidy_scraped(scraped_table):
    scraped_melted = pd.melt(scraped_table,id_vars=['Name', 'Month', 'Semester'], var_name='Paper_Attendance')
    paper_types = scraped_melted['Paper_Attendance'].str.split('_')
    paper_names = paper_types.str.get(0)
    class_type = paper_types.str.get(1)
    attendance_type = paper_types.str.get(2)
    scraped_melted['Paper Name'] = paper_names
    scraped_melted['Class Type'] = class_type
    scraped_melted['Attendance Type'] = attendance_type
    scraped_melted.drop('Paper_Attendance', axis =1, inplace=True)
    attend_tidy = scraped_melted.pivot_table(
        index=['Name', 'Semester','Paper Name', 'Month', 'Class Type'],
        columns='Attendance Type',
        values='value'
    )
    return attend_tidy


In [9]:
url = 'http://apf.ststephens.edu/attendance_stku2018'
get_semester_url = 'http://apf.ststephens.edu/getsemesterbypro?programme_id={}'
get_month_url = 'http://apf.ststephens.edu/getmonthbysemepro?semester_id={}&programme_id={}'
session = HTMLSession()

In [12]:
months = {
     '07': 'July',
     '08': 'August',
     '09': 'September',
     '10': 'October',
     '11': 'November'}

semesters = {'8': 'Semester I', '9': 'Semester II'}

In [13]:
df = tidy_scraped(scrape_month('6','8','07'))

In [15]:
df.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Attendance Type,Attended,Held
Name,Semester,Paper Name,Month,Class Type,Unnamed: 5_level_1,Unnamed: 6_level_1
Aarzoo Jolly,Semester I,Delhi Thought the Ages,July,Lectures,,
Aarzoo Jolly,Semester I,Delhi Thought the Ages,July,Practicals,,
Aarzoo Jolly,Semester I,Delhi Thought the Ages,July,Tutorials,,
Aarzoo Jolly,Semester I,"English Communications AECC IPhil, I Sans , I Engl",July,Lectures,6.0,6.0
Aarzoo Jolly,Semester I,"English Communications AECC IPhil, I Sans , I Engl",July,Practicals,0.0,0.0
Aarzoo Jolly,Semester I,"English Communications AECC IPhil, I Sans , I Engl",July,Tutorials,0.0,0.0
Aarzoo Jolly,Semester I,Ethics in the Public Domain,July,Lectures,,
Aarzoo Jolly,Semester I,Ethics in the Public Domain,July,Practicals,,
Aarzoo Jolly,Semester I,Ethics in the Public Domain,July,Tutorials,,
Aarzoo Jolly,Semester I,European Classical Literature,July,Lectures,6.0,6.0


In [19]:
a = session.get(url)
pd.read_html(a.html.html)

ValueError: No tables found