In [3]:
import os
import re
import string


def read_file(dirpath, filename):
    """
    Load a text file and return its content as a list of lines.

    The file located at `dirpath`/`filename` is opened in text mode and
    every line is returned with its trailing whitespace (including the
    newline character) stripped. Leading whitespace is kept untouched.
    """
    path = os.path.join(dirpath, filename)
    stripped = []
    with open(path, 'r') as handle:
        for raw_line in handle:
            stripped.append(raw_line.rstrip())
    return stripped


# Patterns are written as raw strings: sequences such as "\[" or "\s" inside a
# normal string literal are invalid escapes and trigger a warning on recent
# Python versions.
_PAGE_FOOTER_RE = re.compile(r"\[Page.*\]")
_ALNUM_OR_SPACE_RE = re.compile(r"[A-Za-z0-9\s]+")
_XML_LINE_RE = re.compile(r"[\s]+<.*?>[\s]+")
_MULTI_SPACE_RE = re.compile(r"\s{2,}")


def _indent(line):
    """Return the number of space characters at the beginning of `line`."""
    return len(line) - len(line.lstrip(' '))


def _keep_line(line, header_re):
    """Tell whether `line` survives all the filters applied by process_lines."""
    # Page footer ("[Page N]") or page header ("RFC ... <date>").
    if _PAGE_FOOTER_RE.search(line) or header_re.search(line):
        return False

    # Too many special characters (ratio computed over non-space characters).
    special_count = len(_ALNUM_OR_SPACE_RE.sub('', line))
    if special_count / max(len(line) - line.count(' '), 1) >= 0.3:
        return False

    # Too many spaces.
    if line.count(' ') / max(len(line), 1) >= 0.5:
        return False

    # XML line: leading whitespace, a tag, then at least one whitespace.
    # NOTE(review): a tag that ends the line is not matched because the
    # pattern requires whitespace after '>' -- confirm this is intended.
    return _XML_LINE_RE.match(line) is None


def _build_paragraphs(lines, title):
    """Merge consecutive same-indentation lines into title-prefixed paragraphs."""
    paragraphs = []
    section_name = ''
    index, total = 0, len(lines)
    while index < total:
        line = lines[index]
        index += 1

        # Empty lines only separate blocks.
        if not line:
            continue

        indent = _indent(line)
        if indent == 0:
            # A non-indented line is the name of a section.
            section_name = line + '. '
            continue

        # Indented line: start a paragraph prefixed by the section name and
        # absorb the following lines that share the same indentation.
        parts = [section_name, line]
        while index < total and _indent(lines[index]) == indent:
            parts.append(lines[index])
            index += 1
        paragraphs.append(''.join(parts))

    result = []
    for paragraph in paragraphs:
        # Remove all multiple spaces.
        paragraph = _MULTI_SPACE_RE.sub(' ', paragraph)

        # If the paragraph begins with a number, remove that first token.
        tokens = paragraph.split(maxsplit=1)
        if len(tokens) > 1 and tokens[0][0].isdigit():
            paragraph = tokens[1]

        # Drop what comes from the table of contents.
        if paragraph.startswith("Table of Contents."):
            continue

        # Add title of RFC to beginning of each chunk.
        result.append(title + ' - ' + paragraph)
    return result


def process_lines(lines, title, date, build_paragraphs=False):
    """
    Filter the lines of a raw RFC document.

    The following lines are dropped:
      * page footers (anything containing "[Page ...]");
      * page headers (anything containing "RFC" followed later by `date`);
      * lines where special characters make up 30% or more of the
        non-space characters;
      * lines where spaces make up 50% or more of the characters;
      * lines starting with whitespace, an XML-like tag and whitespace.
    Empty lines are kept.

    Parameters:
        lines: iterable of strings (one per line of the document).
        title: title of the RFC; only used when `build_paragraphs` is True.
        date: publication date as printed in the page headers. It is
            matched literally (regex metacharacters are escaped).
        build_paragraphs: when False (default), the filtered lines are
            returned as-is. When True, consecutive lines with the same
            indentation are merged into paragraphs that are prefixed with
            their section name and with `title`.

    Returns:
        A new list of strings.
    """
    # `date` is plain text, not a pattern: escape it before embedding it.
    header_re = re.compile("RFC.*" + re.escape(date))
    kept = [line for line in lines if _keep_line(line, header_re)]

    if not build_paragraphs:
        return kept
    return _build_paragraphs(kept, title)


In [30]:
# Where the raw RFC text files live, and which one to process.
dirpath = '/raid/antoloui/Master-thesis/_data/search/rfc/raw_files'
filename = 'rfc5707.txt'

# Metadata of that RFC: `date` is the one printed in the page headers.
title = "Media Server Markup Language (MSML)"
date = 'February 2010'

# Read the document and filter its lines in one go.
lines = process_lines(read_file(dirpath, filename), title, date)

In [24]:
# Preview the first 100 lines. Only a slice is iterated: `lines` itself is
# left untouched so that the following cells still work on the whole
# document when the notebook is run from top to bottom.
for line in lines[:100]:
    print(line)









                  Media Server Markup Language (MSML)

Abstract

   The Media Server Markup Language (MSML) is used to control and invoke
   many different types of services on IP media servers.  The MSML
   control interface was initially driven by RadiSys with subsequent
   significant contributions from Intel, Dialogic, and others in the
   industry.  Clients can use it to define how multimedia sessions
   interact on a media server and to apply services to individuals or
   groups of users.  MSML can be used, for example, to control media
   server conferencing features such as video layout and audio mixing,
   create sidebar conferences or personal mixes, and set the properties
   of media streams.  As well, clients can use MSML to define media
   processing dialogs, which may be used as parts of application
   interactions with users or conferences.  Transformation of media
   streams to and from users or conferences as well as interactive voice
   response (IVR) dialogs 

In [31]:
# Create paragraphs from lines.
#
# The document is read chunk by chunk (a chunk is a run of consecutive
# non-empty lines). A chunk is classified when the NEXT chunk is reached, by
# comparing the indentation of that next chunk with the indentation of the
# current (sub)section:
#   - deeper indentation  -> the previous chunk was a (sub)section name;
#   - same indentation    -> the previous chunk was a paragraph;
#   - smaller indentation -> the previous chunk was the last paragraph of the
#                            current subsection.
# NOTE: this cell consumes `lines` (it is empty once the loop is over).
new_lines = []
section_name = []
# Must stay an integer: it is compared with `line_whitespaces` below. -1 makes
# the very first non-empty line count as "deeper" than the (non-existent)
# current section.
section_whitespaces = -1
chunk = ''

while lines:

    # Pop the current line.
    line = lines.pop(0)
    print("\nLine: {}".format(line))
    
    # If line is empty, skip it.
    if not line: continue

    # Get the number of whitespaces at the beginning of the line.
    line_whitespaces = len(line) - len(line.lstrip(' '))
    print("Whitespaces: {}".format(line_whitespaces))

    # If the number of whitespaces at the beginning of the current line is bigger than the up-to-date number of whitespaces of the current (sub)section...
    if line_whitespaces > section_whitespaces:
        # It means that the previous chunk was a subsection.
        section_name.append(chunk)  # append previous chunk as a subsection to the current section names list.
        section_whitespaces = line_whitespaces  # update the number of whitespaces for the current subsection.
        print("Current line has more whitespaces.")
        print("Sections: {}".format(str(section_name)))
        print("Updated whitespaces: {}".format(section_whitespaces))
    
    # If the number of whitespaces at the beginning of the current line is exactly equal to the up-to-date number of whitespaces of the current (sub)section...
    elif line_whitespaces == section_whitespaces:
        # It means that the previous chunk was not a new subsection but a paragraph of the current section.
        paragraph = ' - '.join(section_name) + '. ' + chunk  # create new paragraph as the concatenation of the section and subsections names + the previous chunk.
        new_lines.append(paragraph)  # append it to new_lines.
        print("Current line has same number whitespaces.")
        print("Appending previous chunk: {}".format(paragraph))
    
    # If the number of whitespaces at the beginning of the current line is lower than the up-to-date number of whitespaces of the current (sub)section...
    else:
        # It means that we moved out of the current subsection.
        paragraph = ' - '.join(section_name) + '. ' + chunk  # create new paragraph as the concatenation of the section and subsections names + the previous chunk.
        new_lines.append(paragraph)  # append it to new_lines.
        # Remove the last encountered subsection from the section names list.
        # The list can already be empty (indentation decreasing twice in a
        # row), in which case there is nothing to remove.
        if section_name:
            section_name.pop(-1)
        section_whitespaces = line_whitespaces  # update the number of whitespaces for the current subsection.
        print("Current line has less whitespaces.")
        print("Appending previous chunk: {}".format(paragraph))
        print("Updated sections: {}".format(section_name))
        print("Updated whitespaces: {}".format(section_whitespaces))
        
    # Get the next chunk.
    chunk = line
    while lines and lines[0]:
        chunk += lines.pop(0)  # add new line to paragraph.

# The loop only stores a chunk when the following one is reached, so the very
# last chunk of the document has to be stored explicitly.
if chunk:
    new_lines.append(' - '.join(section_name) + '. ' + chunk)
       
# Remove all multiple spaces.
new_lines = [re.sub(r'\s{2,}', ' ', line) for line in new_lines]

# If line begins with a number, remove it.
new_lines = [line.split(maxsplit=1)[1] if (len(line.split(maxsplit=1))>1 and line.split(maxsplit=1)[0][0].isdigit()) else line for line in new_lines]

# Remove lines from Table of contents.
new_lines = [line for line in new_lines if not line.startswith("Table of Contents.")]


Line: 

Line: 

Line: 

Line: 

Line: 

Line: 

Line: 

Line: 

Line:                   Media Server Markup Language (MSML)
Whitespaces: 18
Current line has more whitespaces.
Sections: ['']
Updated whitespaces: 18

Line: 

Line: Abstract
Whitespaces: 0
Current line has less whitespaces.
Appending previous chunk: .                   Media Server Markup Language (MSML)
Updated sections: []
Updated whitespaces: 0

Line: 

Line:    The Media Server Markup Language (MSML) is used to control and invoke
Whitespaces: 3
Current line has more whitespaces.
Sections: ['Abstract']
Updated whitespaces: 3

Line: 

Line: Status of This Memo
Whitespaces: 0
Current line has less whitespaces.
Appending previous chunk: Abstract.    The Media Server Markup Language (MSML) is used to control and invoke   many different types of services on IP media servers.  The MSML   control interface was initially driven by RadiSys with subsequent   significant contributions from Intel, Dialogic, and others in the   ind

In [32]:
new_lines

['. Media Server Markup Language (MSML)',
 'Abstract. The Media Server Markup Language (MSML) is used to control and invoke many different types of services on IP media servers. The MSML control interface was initially driven by RadiSys with subsequent significant contributions from Intel, Dialogic, and others in the industry. Clients can use it to define how multimedia sessions interact on a media server and to apply services to individuals or groups of users. MSML can be used, for example, to control media server conferencing features such as video layout and audio mixing, create sidebar conferences or personal mixes, and set the properties of media streams. As well, clients can use MSML to define media processing dialogs, which may be used as parts of application interactions with users or conferences. Transformation of media streams to and from users or conferences as well as interactive voice response (IVR) dialogs are examples of such interactions, which are specified using MSML.