<a href="https://colab.research.google.com/github/bhattacharjee/scaling-giggle/blob/main/parse_electoral_roll.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Installing the dependencies first

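We rely mainly on two Python packages: `pdf2image`, which renders each PDF page as an image, and `pytesseract`, which runs OCR on those images. The cell below also installs the system packages they need (poppler and Tesseract), plus `wget` for downloading the PDFs.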

In [1]:
# Install the dependencies
!pip install pdf2image
!pip install pytesseract
!pip install wget

!apt-get install poppler-utils
!apt-get install libleptonica-dev 
!apt-get install tesseract-ocr tesseract-ocr-dev
!apt-get install libtesseract-dev
!apt-get install tesseract-ocr
!apt-get install tesseract-ocr-eng

Collecting pdf2image
  Downloading pdf2image-1.16.0-py3-none-any.whl (10 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.16.0
Collecting pytesseract
  Downloading pytesseract-0.3.9-py2.py3-none-any.whl (14 kB)
Collecting Pillow>=8.0.0
  Downloading Pillow-9.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.3 MB)
[K     |████████████████████████████████| 4.3 MB 5.1 MB/s 
Installing collected packages: Pillow, pytesseract
  Attempting uninstall: Pillow
    Found existing installation: Pillow 7.1.2
    Uninstalling Pillow-7.1.2:
      Successfully uninstalled Pillow-7.1.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m
Successfully installed Pillow-9.0.1 pytesseract-0.3.9


Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=3d51fe766bf2cc8b83ea25f35e1e76c4cdf80bfba502bb063978d55666b246b2
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-470
Use 'apt autoremove' to remove it.
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.
Need to get 154 kB of archives.
After this operation, 613 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-u

In [2]:
!apt-get install tesseract-ocr-eng

Reading package lists... Done
Building dependency tree       
Reading state information... Done
tesseract-ocr-eng is already the newest version (4.00~git24-0e00fe6-1.2).
The following package was automatically installed and is no longer required:
  libnvidia-common-470
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


In [1]:
import os
import re
import wget
import json
import tempfile
import logging
import pdf2image
import pytesseract

import numpy as np
import pandas as pd

from functools import lru_cache

In [2]:
# Add the links to the PDFs here:
# Each entry is the URL of one electoral-roll PDF; a Roll object is built
# from a single URL taken from this list.

ONLINE_PDF_FILES_LIST = [
    "http://ceomeghalaya.nic.in/erolls/pdf/english/A001/A0010001.pdf",
    "http://ceomeghalaya.nic.in/erolls/pdf/english/A001/A0010002.pdf"
]

In [None]:
 # Parser state identifiers. Within this cell only STATE_ZERO is referenced
 # (as the initial value of Roll.state); the others are not used here.
 STATE_ZERO = 0
 STATE_READING_NAMES = 1
 STATE_READING_OTHERS_NAME = 2
 STATE_READING_AGE_GENDER = 3

 class Roll:
    """One electoral-roll PDF: download it, OCR every page, extract voters.

    Typical use is ``Roll(url)``, then ``download()``, then ``process()``,
    which returns a pandas DataFrame with one row per extracted voter.
    """

    # Per-column row counts of a page are compared only after being capped
    # at this value.
    # NOTE(review): presumably the number of voter boxes on a full page -
    # confirm against the roll layout.
    _ROW_COUNT_CAP = 30

    def __init__(self, url:str)->None:
        """Construct the object which will be used for further
        processing

        Parameters:
        url (str): The URL to the PDF (should not be a redirect)
        """

        self.temp_file_name = None
        self.pdf_url = url
        self.state = STATE_ZERO
        self.pages = None
        # Maps page index -> OCR text; filled in by process().
        self.pages_text = dict()
        self.voters = list()

        # Location details; parse_first_page() overwrites whichever of these
        # it finds on the first page, the rest keep these placeholders.
        self.town = "UNKNOWN"
        self.block = "UNKNOWN"
        self.district = "UNKNOWN"
        self.post_office = "UNKNOWN"
        self.police_station = "UNKNOWN"
        self.pin_code = "000000"

        # For every label there are two patterns: the plain one is used with
        # findall()/sub() to locate each occurrence inside a line, the
        # "*_for_match" one is used with match() to classify a whole line.
        other_name_pattern = \
            r"((other.?s|father.?s|mother.?s|husband.?s)\s?name\s*[=:>-])"
        self.re_other_name = re.compile(other_name_pattern, re.IGNORECASE)
        self.re_other_name_for_match = re.compile(
            other_name_pattern, re.IGNORECASE)

        self.re_name = re.compile(r"(name\s*[=:>-])", re.IGNORECASE)
        self.re_name_for_match = re.compile(
            r".*(name\s*[=:>-])", re.IGNORECASE)

        self.re_house_num = re.compile(
            r"(House\s*number\s*[:=>-]\s*)", re.IGNORECASE)
        self.re_house_num_for_match = re.compile(
            r".*(House\s*number\s*[:=>-]\s*)", re.IGNORECASE)

        self.re_age_gender = re.compile(
            r"(age\s*[:=>-]\s*(\d+)\s*gender\s*[:=>-]\s*(male|female))",
            re.IGNORECASE
        )
        self.re_age_gender_for_match = re.compile(
            r".*(age\s*[:=>-]\s*(\d+)\s*gender\s*[:=>-]\s*(male|female))",
            re.IGNORECASE
        )

        self.re_age = re.compile(r"(age\s*[:=>-]\s*(\d*))", re.IGNORECASE)
        self.re_age_for_match = re.compile(
            r".*(age\s*[:=>-]\s*(\d*))", re.IGNORECASE
        )

        self.page_details = {}

        # Reserve a unique path for the download. The placeholder file is
        # removed again when the with-block ends, so nothing is left behind
        # and the ".pdf" path does not exist until download() creates it.
        # NOTE(review): the wget package is understood to pick a different
        # file name when the target already exists, which is presumably why
        # the target must not be pre-created - confirm.
        with tempfile.NamedTemporaryFile() as placeholder:
            self.temp_file_name = f"{placeholder.name}.pdf"

    def _require_downloaded_file(self):
        """Raise if the PDF is missing or empty at temp_file_name."""
        if not os.path.isfile(self.temp_file_name) or \
            0 == os.stat(self.temp_file_name).st_size:
            raise Exception("Failed to download file")

    def _split_on_label(self, pattern, s):
        """Split s at every match of pattern and at every literal "|".

        Returns the stripped, non-empty pieces in order of appearance.
        """
        pieces = [piece.strip() for piece in pattern.sub("|", s).split("|")]
        return [piece for piece in pieces if len(piece) > 0]

    def get_text_as_list(self, text):
        """Return the non-empty lines of text, each stripped of whitespace.

        Not memoised on purpose: a cache on an instance method would keep a
        reference to self and so delay the cleanup done in __del__.
        """
        stripped = [line.strip() for line in text.split('\n')]
        return [line for line in stripped if len(line) > 0]

    def download(self)->None:
        """Download the PDF file for this object

        Returns:
        None

        Raises:
        Exception: if no non-empty file exists at temp_file_name afterwards
        """
        wget.download(self.pdf_url, self.temp_file_name)
        self._require_downloaded_file()

    def parse_first_page(self):
        """
        First page contains a lot of details, parse them
        to fill the details of the geolocation of electoral roll

        Each line of page 0 is tried against the patterns below in order;
        the first pattern that matches decides which attribute is set.
        """
        field_patterns = [
            (re.compile(r".*town.*village\s*[=:]\s*(.*)", re.IGNORECASE),
             "town"),
            (re.compile(r".*Post.*Office\s*[=:]\s*(.*)", re.IGNORECASE),
             "post_office"),
            (re.compile(r".*pin.*code.*\s*([0-9]{6})\s*", re.IGNORECASE),
             "pin_code"),
            (re.compile(r".*block\s[=:]\s*(.*)", re.IGNORECASE),
             "block"),
            (re.compile(r".*district\s:\s*(.*)", re.IGNORECASE),
             "district"),
            (re.compile(r".*police.*station\s*[=:]\s*(.*)", re.IGNORECASE),
             "police_station"),
        ]
        for line in self.get_text_as_list(self.pages_text[0]):
            for pattern, attribute in field_patterns:
                m = pattern.match(line)
                if m:
                    setattr(self, attribute, m.group(1).strip())
                    break

    def convert_to_text(self, i):
        """
        Convert an image to text using pytesseract.
        Pages from the PDF have already been converted to images
        and are looked up in self.pages by page index
        """
        logging.debug(f"converting to text - page {i}")
        print(f"converting to text - page {i}")
        s = pytesseract.image_to_string(self.pages[i])
        # Force a line break in front of these labels so that each
        # occurrence starts a line of its own.
        s = s.replace("Age:", "\r\nAge:")
        s = s.replace("Photo is", "\r\nPhoto is")
        return s

    def get_other_name(self, s):
        """
        There can be several names in a single line as follows:
        Fathers's Name: LAMJINGKMEN KHONGBUH Fathers' Name = LEM! CHALLAM Father's Name = PRECIOUSLY RYNGKHLEM
        These need to be split and returned as a list
        """
        return self._split_on_label(self.re_other_name, s)

    def get_name(self, s):
        """
        Do the same thing for the voters' own names
        """
        return self._split_on_label(self.re_name, s)

    def get_house_num(self, s):
        """
        Do the same thing for house number
        """
        return self._split_on_label(self.re_house_num, s)

    def get_age_gender(self, s):
        """
        Do the same thing for age and gender.
        Age and gender appear in the same line.

        This function matches lines that contain both age and gender
        and returns a pair of lists: (ages, genders), both as strings.

        Lines that contain an age but no gender are handled by
        get_age_only.
        """
        ages = list()
        genders = list()
        for _, age, gender in self.re_age_gender.findall(s):
            ages.append(age)
            genders.append(gender)
        return ages, genders

    def get_age_only(self, s):
        """
        Do the same for age. Match lines that contain only age but not gender

        Returns the digits following each age label as a list of strings;
        an entry is the empty string when no digits follow the label.
        """
        return [age for _, age in self.re_age.findall(s)]

    def get_temp_file_name(self):
        """Return a fresh temporary path ending in ".json".

        No file exists at the returned path; the caller creates it.
        """
        with tempfile.NamedTemporaryFile() as placeholder:
            return f"{placeholder.name}.json"

    def parse_roll_page(self, pagenum:int)->list:
        """Extract the voter records from one OCR'd page.

        Parameters:
        pagenum (int): key of the page in self.pages_text

        Returns:
        list: one dict per voter with the keys name, other_name, house_num,
        gender, age, town, block, post_office, police_station and pin_code

        Raises:
        Exception: if pagenum is not present in self.pages_text
        AssertionError: if the page does not yield the same number of
        entries (after capping) for every column
        """
        page_other_names = []
        page_names = []
        page_house_numbers = []
        page_genders = []
        page_ages = []

        if pagenum not in self.pages_text:
            raise Exception("page not found")
        for s in self.get_text_as_list(self.pages_text[pagenum]):
            # Must match other name first, because
            # the elif condition will also match
            # and we should avoid that.
            # The two name patterns are applied with match() and carry no
            # leading ".*", so they only accept a label at the start of the
            # line.
            # NOTE(review): presumably this keeps lines with "Name" further
            # in from being read as voters - confirm.
            if self.re_other_name_for_match.match(s):
                page_other_names += self.get_other_name(s)
            elif self.re_name.match(s):
                page_names += self.get_name(s)
            elif self.re_house_num_for_match.match(s):
                page_house_numbers += self.get_house_num(s)
            elif self.re_age_gender_for_match.match(s):
                ages, genders = self.get_age_gender(s)
                page_ages += ages
                page_genders += genders
            elif self.re_age_for_match.match(s):
                page_ages += self.get_age_only(s)

        # Ensure that we have the same number of rows in each column.
        # Raised explicitly (not via an assert statement) so the check also
        # runs when Python is started with -O.
        counts = {
            "other_name": len(page_other_names),
            "name": len(page_names),
            "house_num": len(page_house_numbers),
            "gender": len(page_genders),
            "age": len(page_ages),
        }
        capped = {min(count, self._ROW_COUNT_CAP) for count in counts.values()}
        if len(capped) > 1:
            raise AssertionError(
                f"page {pagenum}: column lengths differ: {counts}")

        columns = zip(
            page_names,
            page_other_names,
            page_house_numbers,
            page_genders,
            page_ages)
        return [
            {
                "name": name,
                "other_name": o_name,
                "house_num": housenum,
                "gender": gender,
                "age": age,
                "town": self.town,
                "block": self.block,
                "post_office": self.post_office,
                "police_station": self.police_station,
                "pin_code": self.pin_code,
            }
            for name, o_name, housenum, gender, age in columns
        ]

    def process(self):
        """OCR the downloaded PDF and return all voters as a DataFrame.

        Page 0 supplies the location details. Pages from index 3 up to, but
        not including, the last page are parsed as voter pages.
        NOTE(review): presumably the skipped pages hold no voter boxes -
        confirm against the roll layout.

        Raises:
        Exception: if the PDF has not been downloaded (missing or empty)
        """
        self._require_downloaded_file()
        self.pages = pdf2image.convert_from_path(self.temp_file_name)
        self.pages_text = \
            {i: self.convert_to_text(i) for i in range(len(self.pages))}
        self.parse_first_page()
        # Start from an empty list so that calling process() again does not
        # append the same voters a second time.
        self.voters = list()
        for i in range(3, len(self.pages) - 1):
            print(f"Parsing page: {i}")
            self.voters += self.parse_roll_page(i)
        # The records take a round trip through a JSON file and
        # pandas.read_json; the file is removed again afterwards.
        temp_filename = self.get_temp_file_name()
        try:
            with open(temp_filename, "w") as f:
                json.dump(self.voters, f)
            df = pd.read_json(temp_filename, orient="records")
        finally:
            if os.path.exists(temp_filename):
                os.unlink(temp_filename)
        return df

    def __del__(self):
        """Remove the downloaded PDF, if there is one."""
        # getattr: __init__ may have failed before the attribute was set.
        path = getattr(self, "temp_file_name", None)
        if path and os.path.exists(path):
            os.unlink(path)


# Run the whole pipeline for the first configured PDF: download it, OCR and
# parse it, then show summary statistics followed by the extracted table.
roll = Roll(ONLINE_PDF_FILES_LIST[0])

roll.download()
df = roll.process()
print(df.describe())
print(df)


In [None]:
# Disabled debugging helper, kept inside a string literal so it is not
# executed: it would print every non-empty line of one OCR'd page together
# with its index.
"""
def get_text_as_list(text):
    text = [s.strip() for s in text.split('\n')]
    text = [s for s in text if len(s) > 0]
    names = list()
    gender = list()
    other = list()
    
    for i, s in enumerate(text):
        print(f"{i:>3d} : [{s}]")
    
    return text

get_text_as_list(roll.pages_text[15])
"""