In [1]:
import os
import re
import string


def read_file(dirpath, filename):
    """
    Load a text file and return its lines as a list of strings.

    Trailing whitespace (including the newline) is removed from every line;
    leading whitespace is kept, and blank lines are returned as ''.
    """
    path = os.path.join(dirpath, filename)
    stripped = []
    with open(path, 'r') as handle:
        for raw in handle:
            # Only the right-hand side is stripped so indentation survives.
            stripped.append(raw.rstrip())
    return stripped


def process_lines(lines, date, max_words_per_paragraph=250):
    """
    Filter the raw lines of an RFC document, keeping only the content lines.

    A line is dropped when any of the following holds:
      * it is empty, or made only of space characters;
      * it contains a "[Page ...]" marker;
      * it contains "RFC" followed, later on the same line, by `date`;
      * 30% or more of its non-space characters are neither alphanumeric
        nor whitespace;
      * half or more of its characters are spaces;
      * it starts with whitespace, then a "<...>" tag, then whitespace.

    Kept lines are returned unmodified and in their original order. This
    function does not join lines into paragraphs.

    Args:
        lines: list of strings, one per line of the document.
        date: text matched literally after "RFC" to recognise lines to drop
            (e.g. 'February 2010').
        max_words_per_paragraph: accepted for interface compatibility; it is
            not used by this function.

    Returns:
        A new list containing the lines that passed every filter.
    """
    # Raw strings keep the regex escapes from being read as string escapes.
    page_marker = re.compile(r"\[Page.*\]")
    # `date` is literal text, so escape it before embedding it in the pattern.
    rfc_header = re.compile("RFC.*" + re.escape(date))
    plain_text = re.compile(r'[A-Za-z0-9\s]+')
    # NOTE(review): this pattern requires whitespace after the closing '>', so
    # a right-stripped line that ends with the tag is not matched -- confirm
    # that this is intended.
    xml_tag = re.compile(r'[\s]+<.*?>[\s]+')

    kept = []
    for line in lines:
        # Empty lines.
        if not line:
            continue

        # Lines carrying a page marker or the "RFC ... <date>" text.
        if page_marker.search(line) or rfc_header.search(line):
            continue

        # A line made only of spaces has nothing to measure; dropping it here
        # avoids a division by zero (the space-ratio test would drop it too).
        non_space_count = len(line) - line.count(' ')
        if non_space_count == 0:
            continue

        # Lines with too many special characters.
        if len(plain_text.sub('', line)) / non_space_count >= 0.3:
            continue

        # Lines with too many spaces.
        if line.count(' ') / len(line) >= 0.5:
            continue

        # XML lines.
        if xml_tag.match(line) is not None:
            continue

        kept.append(line)

    return kept


In [2]:
# Where the raw RFC text lives, which file to load, and the date text that
# process_lines() uses to recognise the "RFC ... <date>" lines it drops.
dirpath = '/raid/antoloui/Master-thesis/_data/search/rfc/raw_files'
filename = 'rfc5707.txt'
date = 'February 2010'

# Read the document and filter it in a single step.
lines = process_lines(read_file(dirpath, filename), date)

In [6]:
# Keep only the first 60 filtered lines, then show them one per output line.
lines = lines[:60]
for line in lines:
    print(line)

                  Media Server Markup Language (MSML)
Abstract
   The Media Server Markup Language (MSML) is used to control and invoke
   many different types of services on IP media servers.  The MSML
   control interface was initially driven by RadiSys with subsequent
   significant contributions from Intel, Dialogic, and others in the
   industry.  Clients can use it to define how multimedia sessions
   interact on a media server and to apply services to individuals or
   groups of users.  MSML can be used, for example, to control media
   server conferencing features such as video layout and audio mixing,
   create sidebar conferences or personal mixes, and set the properties
   of media streams.  As well, clients can use MSML to define media
   processing dialogs, which may be used as parts of application
   interactions with users or conferences.  Transformation of media
   streams to and from users or conferences as well as interactive voice
   response (IVR) dialogs are exampl

In [None]:
# Create paragraphs from lines.
#
# A line with no leading spaces is treated as a section name. Consecutive
# indented lines that share the same indentation are joined into one
# paragraph, prefixed with the current section name, until adding the next
# line would reach the word limit.

# This cell runs at notebook level, outside process_lines(), so the word limit
# has to be defined here; 250 is the default used in that function's signature.
max_words_per_paragraph = 250

new_lines = []
section_name = ''

# Walk `lines` with a cursor instead of popping from the front: this leaves
# `lines` intact (the cell can be re-run) and avoids the cost of repeated
# pop(0) calls.
cursor = 0
while cursor < len(lines):

    # Take the current line.
    line = lines[cursor]
    cursor += 1

    # Get the number of spaces at the beginning of the line.
    line_whitespaces = len(line) - len(line.lstrip(' '))

    # If there are no spaces at the beginning of the current line...
    if line_whitespaces == 0:
        # Then, the current line is the name of a section.
        section_name = line + '. '  # store this name.
        continue

    # Otherwise, the current line is the first line of a paragraph in that
    # section: start the paragraph with the section name followed by the line
    # itself, so that the first line is not lost.
    paragraph = section_name + line

    # Indented lines begin with spaces, so plain concatenation keeps the words
    # separated.
    while (
        cursor < len(lines)                                                              # while there are still lines to read
        and len(lines[cursor]) - len(lines[cursor].lstrip(' ')) == line_whitespaces      # and the next line has the same indentation as the current one
        and len(paragraph.split()) + len(lines[cursor].split()) < max_words_per_paragraph  # and adding it stays under the word limit
    ):
        paragraph += lines[cursor]  # add the next line to the paragraph.
        cursor += 1

    new_lines.append(paragraph)