<a href="https://colab.research.google.com/github/bhattacharjee/scaling-giggle/blob/main/parse_electoral_roll.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Installing the dependencies first

We rely mainly on two packages: pdf2image and pytesseract.

In [None]:
# Install the dependencies
!pip install pdf2image
!pip install pytesseract
!pip install wget
!pip install -q "tqdm>=4.36.1"

!apt-get install poppler-utils                      > /dev/null 2>&1
!apt-get install libleptonica-dev                   > /dev/null 2>&1
!apt-get install tesseract-ocr tesseract-ocr-dev    > /dev/null 2>&1
!apt-get install libtesseract-dev                   > /dev/null 2>&1
!apt-get install tesseract-ocr                      > /dev/null 2>&1
!apt-get install tesseract-ocr-eng                  > /dev/null 2>&1
!apt-get install tesseract-ocr-eng                  > /dev/null 2>&1

import os
import re
import wget
import json
import tqdm
import shutil
import tempfile
import logging
import pdf2image
import pytesseract

import numpy as np
import pandas as pd

from functools import lru_cache
from google.colab import drive

In [None]:
 # Parser state identifiers, numbered 0..3 in declaration order.
 # Roll.__init__ starts every object in STATE_ZERO.
 (
     STATE_ZERO,
     STATE_READING_NAMES,
     STATE_READING_OTHERS_NAME,
     STATE_READING_AGE_GENDER,
 ) = range(4)

 class Roll:
    """Fetch one electoral-roll PDF, OCR its pages and extract voter rows.

    Intended call order is ``download()``, then ``process()``, then
    ``save()``.
    """

    # Directory under which ``save`` creates ``savedir``. Kept as an
    # attribute so it can be overridden when Drive is mounted elsewhere.
    drive_root = "/content/gdrive"

    # Column lengths are compared after being capped at this value.
    # NOTE(review): presumably the largest number of voter entries one page
    # can hold -- confirm against the roll layout.
    _row_count_cap = 30

    # Label patterns. The plain patterns are used with findall() to locate
    # every label inside a line; the *_for_match variants are used with
    # match() to decide which kind of line is being looked at.
    re_other_name = re.compile(
        r"((other.?s|father.?s|mother.?s|husband.?s)\s?name\s*[=:>-])",
        re.IGNORECASE)
    re_other_name_for_match = re_other_name

    re_name = re.compile(r"(name\s*[=:>-])", re.IGNORECASE)
    re_name_for_match = re.compile(r".*(name\s*[=:>-])", re.IGNORECASE)

    re_house_num = re.compile(
        r"(House\s*number\s*[:=>-]\s*)", re.IGNORECASE)
    re_house_num_for_match = re.compile(
        r".*(House\s*number\s*[:=>-]\s*)", re.IGNORECASE)

    re_age_gender = re.compile(
        r"(age\s*[:=>-]\s*(\d+)\s*gender\s*[:=>-]\s*(male|female))",
        re.IGNORECASE)
    re_age_gender_for_match = re.compile(
        r".*(age\s*[:=>-]\s*(\d+)\s*gender\s*[:=>-]\s*(male|female))",
        re.IGNORECASE)

    re_age = re.compile(r"(age\s*[:=>-]\s*(\d*))", re.IGNORECASE)
    re_age_for_match = re.compile(
        r".*(age\s*[:=>-]\s*(\d*))", re.IGNORECASE)

    # First-page fields as (pattern, attribute) pairs. They are tried in
    # this order and the first pattern that matches a line wins.
    _first_page_fields = (
        (re.compile(r".*town.*village\s*[=:]\s*(.*)", re.IGNORECASE),
         "town"),
        (re.compile(r".*Post.*Office\s*[=:]\s*(.*)", re.IGNORECASE),
         "post_office"),
        (re.compile(r".*pin.*code.*\s*([0-9]{6})\s*", re.IGNORECASE),
         "pin_code"),
        (re.compile(r".*block\s[=:]\s*(.*)", re.IGNORECASE),
         "block"),
        (re.compile(r".*district\s:\s*(.*)", re.IGNORECASE),
         "district"),
        (re.compile(r".*police.*station\s*[=:]\s*(.*)", re.IGNORECASE),
         "police_station"),
    )

    def __init__(self, url:str, save_directory:str=None)->None:
        """Construct the object which will be used for further
        processing

        Parameters:
        url (str): The URL to the PDF (should not be a redirect)
        save_directory (str): Directory, relative to ``drive_root``, in
            which ``save`` writes the CSV. Nothing is saved when it is
            None or empty.
        """
        self.pdf_url = url
        self.state = STATE_ZERO
        self.pages = None
        # Maps page index -> OCR text; filled in by process().
        self.pages_text = dict()
        self.voters = list()

        # Location details, overwritten by parse_first_page() when found.
        self.town = "UNKNOWN"
        self.block = "UNKNOWN"
        self.district = "UNKNOWN"
        self.post_office = "UNKNOWN"
        self.police_station = "UNKNOWN"
        self.pin_code = "000000"

        self.savedir = save_directory
        self.processed_df = None
        self.errors = ""
        self.page_details = {}

        # The CSV is named after the last path component of the URL.
        self.save_filename = f"{url.split('/')[-1]}.csv"

        # Only a unique *name* is wanted here: the placeholder file is
        # removed when the context exits, and the PDF is later downloaded
        # to "<name>.pdf", which does not exist yet.
        self.temp_file_name = None
        with tempfile.NamedTemporaryFile() as temp_file:
            self.temp_file_name = f"{temp_file.name}.pdf"

    def get_text_as_list(self, text):
        """Split text into lines, stripped, with empty lines dropped."""
        stripped = (line.strip() for line in text.split('\n'))
        return [line for line in stripped if line]

    def _require_downloaded_file(self)->None:
        """Raise if the downloaded PDF is missing or empty."""
        if not os.path.isfile(self.temp_file_name) or \
                0 == os.path.getsize(self.temp_file_name):
            raise Exception("Failed to download file")

    def download(self)->None:
        """Download the PDF file for this object

        Returns:
        None

        Raises:
        Exception: if no file, or an empty file, was produced
        """
        wget.download(self.pdf_url, self.temp_file_name)
        self._require_downloaded_file()

    def parse_first_page(self)->None:
        """
        First page contains a lot of details, parse them
        to fill the details of the geolocation of electoral roll

        Reads ``self.pages_text[0]`` and sets town, post_office, pin_code,
        block, district and police_station for every field that is found.
        """
        for line in self.get_text_as_list(self.pages_text[0]):
            for pattern, attribute in self._first_page_fields:
                m = pattern.match(line)
                if m:
                    setattr(self, attribute, m.group(1).strip())
                    break

    def convert_to_text(self, i:int)->str:
        """
        Convert an image to text using pytesseract.
        Pages from the PDF have already been converted to images
        and are available as ``self.pages[i]``.

        A line break is inserted before every "Age:" and "Photo is" so
        that each of them starts a line of its own.
        """
        s = pytesseract.image_to_string(self.pages[i])
        s = s.replace("Age:", "\r\nAge:")
        s = s.replace("Photo is", "\r\nPhoto is")
        return s

    @staticmethod
    def _split_on_labels(s:str, labels)->list:
        """Replace every label in s by "|" and return the non-empty,
        stripped pieces between them."""
        for label in labels:
            s = s.replace(label, "|")
        pieces = (piece.strip() for piece in s.split("|"))
        return [piece for piece in pieces if piece]

    def get_other_name(self, s:str)->list:
        """
        There can be several names in a single line as follows:
        Fathers's Name: LAMJINGKMEN KHONGBUH Fathers' Name = LEM! CHALLAM Father's Name = PRECIOUSLY RYNGKHLEM
        These need to be split and returned as a list
        """
        # findall yields (whole label, relation word); only the whole
        # label is needed for splitting.
        labels = [label for label, _ in self.re_other_name.findall(s)]
        return self._split_on_labels(s, labels)

    def get_name(self, s:str)->list:
        """
        Do the same thing for the voters' own names
        """
        return self._split_on_labels(s, self.re_name.findall(s))

    def get_house_num(self, s:str)->list:
        """
        Do the same thing for house number
        """
        return self._split_on_labels(s, self.re_house_num.findall(s))

    def get_age_gender(self, s:str)->tuple:
        """
        Do the same thing for age and gender.
        Age and gender appear in the same line.

        This function matches lines that contain both age and gender

        Lines that contain only an age are handled by get_age_only.

        Returns:
        tuple: (ages, genders), two lists of strings of equal length
        """
        ages = list()
        genders = list()
        for _, age, gender in self.re_age_gender.findall(s):
            ages.append(age)
            genders.append(gender)
        return ages, genders

    def get_age_only(self, s:str)->list:
        """
        Do the same for age. Match lines that contain only age but not gender

        Returns:
        list: the ages found, as strings. Labels that are not followed by
        any digits are left out.
        """
        return [age for _, age in self.re_age.findall(s) if age]

    def get_temp_file_name(self)->str:
        """Return a fresh temporary path ending in ".json".

        Only the name is produced; no file exists at that path.
        """
        with tempfile.NamedTemporaryFile() as temp_file:
            return f"{temp_file.name}.json"

    def parse_roll_page(self, pagenum:int)->list:
        """Extract the voter rows of one OCR'd page.

        Parameters:
        pagenum (int): key of the page in ``self.pages_text``

        Returns:
        list: one dict per voter with the keys name, other_name,
        house_num, gender, age, town, block, post_office, police_station
        and pin_code

        Raises:
        Exception: if the page is not in ``self.pages_text``
        ValueError: if the columns found on the page differ in length
        """
        if pagenum not in self.pages_text:
            raise Exception("page not found")

        page_other_names = []
        page_names = []
        page_house_numbers = []
        page_genders = []
        page_ages = []

        for s in self.get_text_as_list(self.pages_text[pagenum]):
            # Must match other name first, because
            # the name pattern would also match
            # and we should avoid that
            if self.re_other_name_for_match.match(s):
                page_other_names += self.get_other_name(s)
            # NOTE(review): this uses re_name, not re_name_for_match, so
            # only lines that *start* with the label count as name lines.
            # Kept as found; confirm whether that is intended.
            elif self.re_name.match(s):
                page_names += self.get_name(s)
            elif self.re_house_num_for_match.match(s):
                page_house_numbers += self.get_house_num(s)
            elif self.re_age_gender_for_match.match(s):
                ages, genders = self.get_age_gender(s)
                page_ages += ages
                page_genders += genders
            elif self.re_age_for_match.match(s):
                page_ages += self.get_age_only(s)

        # Ensure that we have the same number of rows in each column.
        # A real exception is raised instead of an assert so the check
        # survives "python -O".
        columns = (page_other_names, page_names, page_house_numbers,
                   page_genders, page_ages)
        capped = [min(len(column), self._row_count_cap) for column in columns]
        if len(set(capped)) > 1:
            raise ValueError(
                f"Column lengths differ on page {pagenum}: {capped}")

        return [
            {
                "name": name,
                "other_name": o_name,
                "house_num": housenum,
                "gender": gender,
                "age": age,
                "town": self.town,
                "block": self.block,
                "post_office": self.post_office,
                "police_station": self.police_station,
                "pin_code": self.pin_code,
            }
            for name, o_name, housenum, gender, age in zip(
                page_names,
                page_other_names,
                page_house_numbers,
                page_genders,
                page_ages)
        ]

    def process(self)->pd.DataFrame:
        """OCR the downloaded PDF and collect all voters in a DataFrame.

        Page 0 is parsed for the location details. Pages from index 3 up
        to, but not including, the last page are parsed as voter pages. A
        page that fails to parse is reported, recorded in ``self.errors``
        and skipped.

        Returns:
        pd.DataFrame: the voters found, also kept in ``self.processed_df``

        Raises:
        Exception: if the PDF has not been downloaded
        """
        self._require_downloaded_file()
        self.pages = pdf2image.convert_from_path(self.temp_file_name)
        print("Converting pages to text...")
        self.pages_text = \
            {i: self.convert_to_text(i) for i in \
                tqdm.tqdm(range(len(self.pages)))}
        self.parse_first_page()
        print(f"Parsing {len(self.pages)} pages")

        for i in tqdm.tqdm(range(3, len(self.pages) - 1)):
            try:
                self.voters += self.parse_roll_page(i)
            except Exception as e:
                print()
                print(f"Error in processing page: {i + 1}")
                print(f"of {self.pdf_url}")
                print(f"Skipped the page, please verify manually")
                self.errors += f"Error in processing page: {i + 1} "
                self.errors += f"of {self.pdf_url} ({e}). "
                self.errors += f"Skipped the page, please verify manually.\n"

        # The records go through JSON so that pandas infers the column
        # types while reading them back. The temporary file is removed
        # afterwards.
        temp_filename = self.get_temp_file_name()
        try:
            with open(temp_filename, "w") as f:
                json.dump(self.voters, f)
            df = pd.read_json(temp_filename, orient="records")
        finally:
            if os.path.exists(temp_filename):
                os.unlink(temp_filename)
        self.processed_df = df
        return df

    def save(self)->None:
        """Write the processed DataFrame as CSV below ``drive_root``.

        Does nothing when no save directory was given or when process()
        has not produced a DataFrame. Collected page errors are written
        next to the CSV in a file with the extra suffix ".err". Failures
        while writing are printed, not raised.
        """
        if not self.savedir or self.processed_df is None:
            return

        target_dir = f"{self.drive_root}/{self.savedir}"
        save_filename = f"{target_dir}/{self.save_filename}"
        save_filename_err = f"{save_filename}.err"
        try:
            os.makedirs(target_dir, exist_ok=True)
            if self.errors != "":
                with open(save_filename_err, "w") as ferr:
                    ferr.write(self.errors)
            self.processed_df.to_csv(save_filename)
        except Exception as e:
            print("Failed to save the state")
            print(e)

    def __del__(self):
        """Remove the downloaded PDF, if there is one."""
        # The attribute may be missing or None when __init__ did not
        # run to completion.
        temp_file_name = getattr(self, "temp_file_name", None)
        if temp_file_name and os.path.exists(temp_file_name):
            os.unlink(temp_file_name)


In [None]:
# URLs of the electoral-roll PDFs. Add one string per PDF to this list.

ONLINE_PDF_FILES_LIST = [
    "http://ceomeghalaya.nic.in/erolls/pdf/english/A001/A0010001.pdf",
    "http://ceomeghalaya.nic.in/erolls/pdf/english/A001/A0010002.pdf",
]

# Start here

In [None]:
# Mount Google Drive so that Roll.save() has somewhere to write.
drive.mount("/content/gdrive")

# Name the inputs for this run, then build the Roll from them.
pdf_url = "http://ceomeghalaya.nic.in/erolls/pdf/english/A001/A0010001.pdf"
output_directory = "0000__DELETEME____"
roll = Roll(url=pdf_url, save_directory=output_directory)

# Download the PDF, OCR and parse it, then write the result.
roll.download()
df = roll.process()
roll.save()

# Transposed summary statistics of the numeric columns.
df.describe().T

Converting pages to text...



  0%|          | 0/39 [00:00<?, ?it/s][A
  3%|▎         | 1/39 [00:03<02:19,  3.67s/it][A
  5%|▌         | 2/39 [00:05<01:30,  2.44s/it][A
  8%|▊         | 3/39 [00:14<03:21,  5.60s/it][A
 10%|█         | 4/39 [00:23<04:06,  7.03s/it][A
 13%|█▎        | 5/39 [00:33<04:36,  8.15s/it][A
 15%|█▌        | 6/39 [00:43<04:39,  8.46s/it][A
 18%|█▊        | 7/39 [00:52<04:37,  8.67s/it][A
 21%|██        | 8/39 [01:02<04:41,  9.07s/it][A
 23%|██▎       | 9/39 [01:11<04:36,  9.23s/it][A
 26%|██▌       | 10/39 [01:21<04:29,  9.30s/it][A
 28%|██▊       | 11/39 [01:30<04:23,  9.41s/it][A
 31%|███       | 12/39 [01:40<04:15,  9.45s/it][A
 33%|███▎      | 13/39 [01:49<04:06,  9.49s/it][A
 36%|███▌      | 14/39 [01:59<03:57,  9.50s/it][A
 38%|███▊      | 15/39 [02:09<03:49,  9.55s/it][A
 41%|████      | 16/39 [02:18<03:39,  9.56s/it][A
 44%|████▎     | 17/39 [02:28<03:30,  9.59s/it][A
 46%|████▌     | 18/39 [02:37<03:16,  9.34s/it][A
 49%|████▊     | 19/39 [02:46<03:07,  9.37s/it]

Parsing 39 pages



100%|██████████| 35/35 [00:00<00:00, 1401.41it/s]


Error in processing page: 13
of http://ceomeghalaya.nic.in/erolls/pdf/english/A001/A0010001.pdf
Skipped the page, please verify manually

Error in processing page: 25
of http://ceomeghalaya.nic.in/erolls/pdf/english/A001/A0010001.pdf
Skipped the page, please verify manually

Error in processing page: 26
of http://ceomeghalaya.nic.in/erolls/pdf/english/A001/A0010001.pdf
Skipped the page, please verify manually

Error in processing page: 29
of http://ceomeghalaya.nic.in/erolls/pdf/english/A001/A0010001.pdf
Skipped the page, please verify manually

Error in processing page: 30
of http://ceomeghalaya.nic.in/erolls/pdf/english/A001/A0010001.pdf
Skipped the page, please verify manually

Error in processing page: 37
of http://ceomeghalaya.nic.in/erolls/pdf/english/A001/A0010001.pdf
Skipped the page, please verify manually
              age  pin_code
count  826.000000     826.0
mean    37.627119  793150.0
std     13.734590       0.0
min     18.000000  793150.0
25%     27.000000  793150.0
50% 




In [None]:
# Display the DataFrame returned by roll.process() in the cell above.
df