In [1]:
import pandas as pd
import re
import numpy as np

In [2]:
def read_file(filename):
    """
    Load a text file and return its content split into lines.

    :param filename: path of the file to read; it is decoded as UTF-8
    :return: list of strings, one per line, without line terminators
    """
    with open(filename, encoding='UTF-8') as source:
        content = source.read()
    return content.splitlines()

In [3]:
def init_validation(lines):
    """
    Normalize raw input lines before keyword parsing.

    For every line: tabs become spaces, ``/`` and ``'`` characters are removed,
    everything after the first ``--`` is cut off, surrounding whitespace is
    stripped and runs of spaces are collapsed to one. Lines that end up empty
    are dropped.

    Repeat-count tokens of the form ``N*`` (``1*``, ``2*``, ``3*`` ...) are
    expanded into N ``DEFAULT`` placeholders. Only whole space-separated tokens
    are expanded, so a token such as ``11*`` yields eleven placeholders instead
    of being corrupted into ``1DEFAULT`` by a substring replacement.
    NOTE(review): ``N*`` is presumably the schedule-file shorthand for N
    defaulted items -- confirm against the file format specification.

    :param lines: iterable of raw text lines
    :return: list of cleaned, non-empty lines
    """
    repeat_token = re.compile(r'[1-9][0-9]*\*')
    cleaned = []

    for raw in lines:
        # Data cleaning: same order of operations as before, since the '--'
        # cut must see the line after '/' and quotes were removed.
        text = re.sub(r'\t', ' ', raw)
        text = re.sub(r'/', '', text)
        text = re.sub(r"'", '', text)
        text = text.split("--")[0]
        text = text.strip()
        text = re.sub(" +", " ", text)

        # Expand "N*" tokens into N DEFAULT placeholders.
        tokens = []
        for token in text.split(' '):
            if repeat_token.fullmatch(token):
                tokens.extend(['DEFAULT'] * int(token[:-1]))
            else:
                tokens.append(token)
        text = ' '.join(tokens)

        if text and text.strip():
            cleaned.append(text)
    return cleaned


def validation(lines,
               key_words_for_delete=["WEFAC"],
               key_words=['DATES', 'WEFAC', 'COMPDAT', 'COMPDATL', 'END']):
    """
    Clean the raw lines and split them into per-date segments.

    Steps:
      1. Clean the lines with ``init_validation``.
      2. Insert a ``DATES`` line in front of every date line that directly
         follows another date line, so each date gets its own segment.
      3. Drop every block that starts with a keyword from
         ``key_words_for_delete`` and runs up to the next remaining keyword
         (or to the end of the input).
      4. Cut the list at the first ``END`` line; if there is none, the whole
         list is used.
      5. Starting at the first remaining keyword, split the list at every
         ``DATES`` line.

    :param lines: iterable of raw text lines
    :param key_words_for_delete: keywords whose blocks are removed; not mutated
    :param key_words: all keywords known to the parser; not mutated
    :return: list of segments (lists of cleaned lines). When the first keyword
             in the input is ``DATES`` the first segment is an empty list; this
             is kept for backward compatibility. An input without any keyword
             yields an empty list.
    """
    cleaned = init_validation(lines)

    # Step 2: build a new list instead of inserting into the one being
    # indexed -- in-place inserts shift later positions and misplace the
    # keyword when three or more date lines follow each other.
    date_pattern = re.compile(r'[0-9][0-9]\s[A-Z]+\s[0-9]{4}')
    with_dates = []
    previous_was_date = False
    for line in cleaned:
        is_date = bool(date_pattern.search(line))
        if is_date and previous_was_date:
            with_dates.append('DATES')
        with_dates.append(line)
        previous_was_date = is_date

    # Keywords that survive deletion; each of them terminates a deleted block.
    stop_words = [word for word in key_words if word not in key_words_for_delete]

    # Step 3: remove the deleted keywords together with their data lines
    # (single pass; works as well when no such keyword is present).
    kept = []
    skipping = False
    for line in with_dates:
        if line in key_words_for_delete:
            skipping = True
        elif line in stop_words:
            skipping = False
        if not skipping:
            kept.append(line)

    # Step 4: END check -- slice up to the first END (end of processing).
    if 'END' in kept:
        kept = kept[:kept.index('END')]

    # Step 5a: find the first keyword (start of processing). The search runs
    # on the filtered list, the same list the segment bounds refer to.
    start_words = [word for word in stop_words if word != 'END']
    start_index = next(
        (pos for pos, line in enumerate(kept) if line in start_words),
        None,
    )
    if start_index is None:
        return []

    # Step 5b: split the list into segments at every DATES line.
    bounds = [start_index]
    bounds.extend(pos for pos, line in enumerate(kept) if line == 'DATES')
    bounds.append(len(kept))
    return [kept[begin:end] for begin, end in zip(bounds, bounds[1:])]


In [4]:
def make_list_for_dataframe(list_3,
                            key_words_for_delete=["WEFAC"],
                            key_words=['DATES', 'WEFAC', 'COMPDAT', 'COMPDATL', 'END']):
    """
    Convert validated segments into flat rows for the output table.

    A segment made of only ``DATES`` and one more line produces a row holding
    that line followed by fifteen NaN cells. Every data line listed under
    ``COMPDAT`` or ``COMPDATL`` produces one row: the line is split on
    whitespace and prefixed with the segment's second element when the segment
    contains ``DATES`` (NaN otherwise); ``COMPDAT`` rows additionally get a NaN
    inserted at position 2.

    :param list_3: list of segments, each a list of cleaned lines
    :param key_words_for_delete: keywords excluded from parsing, in addition to
                                 ``DATES`` and ``END``
    :param key_words: all keywords known to the parser
    :return: list of rows (lists of cell values)
    """
    # Keywords whose data lines are parsed: all known keywords except
    # DATES, END and the ones marked for deletion.
    excluded = ['DATES', 'END'] + [word for word in key_words_for_delete]
    data_keywords = key_words[:]
    for skipped in excluded:
        data_keywords.remove(skipped)

    rows = []
    for segment in list_3:
        dated = 'DATES' in segment

        # Date-only segment: keep the date as a row of its own.
        if dated and len(segment) == 2:
            rows.append([segment[1]] + [np.nan] * 15)

        if not any(keyword in segment for keyword in data_keywords):
            continue

        keyword_positions = [pos for pos, line in enumerate(segment) if line in data_keywords]
        for start in keyword_positions:
            keyword = segment[start]
            # Walk the lines below the keyword until the next keyword or the segment end.
            for pos in range(start + 1, len(segment)):
                if pos in keyword_positions:
                    break
                fields = segment[pos].split()
                if keyword == "COMPDAT":
                    fields.insert(0, segment[1] if dated else np.nan)
                    fields.insert(2, np.nan)
                    rows.append(fields)
                elif keyword == "COMPDATL":
                    fields.insert(0, segment[1] if dated else np.nan)
                    rows.append(fields)
    return rows

In [5]:
def make_dataframe(final_list):
    """
    Build the output table from the parsed rows.

    :param final_list: list of rows, each a list of cell values in column order
    :return: pandas DataFrame with the sixteen named columns defined below
    """
    header = (
        # Row identification
        ['Date', 'Well name', 'Local grid name']
        # Grid block location
        + ['I', 'J', 'K upper', 'K lower']
        # Connection properties
        + ['Flag on connection', 'Saturation table', 'Transmissibility factor']
        + ['Well bore diameter', 'Effective Kh', 'Skin factor', 'D-factor']
        + ['Dir_well_penetrates_grid_block', 'Press_eq_radius']
    )
    return pd.DataFrame(final_list, columns=header)

In [6]:
def make_parsing(filename, key_words_for_delete=["WEFAC"]):
    """
    Parse a schedule file into a DataFrame.

    Runs the whole pipeline: read the file, validate and segment its lines,
    turn the segments into rows and build the table from them.

    :param filename: path of the file to parse
    :param key_words_for_delete: keywords passed on to ``validation`` and
                                 ``make_list_for_dataframe``
    :return: DataFrame produced by ``make_dataframe``
    """
    segments = validation(read_file(filename), key_words_for_delete)
    rows = make_list_for_dataframe(segments, key_words_for_delete)
    return make_dataframe(rows)

In [9]:
# Input schedule file.
# NOTE(review): absolute, user-specific path -- the notebook only runs on a
# machine that has this file; consider a path relative to the notebook.
filename = '/Users/alexander/Desktop/parser/test_schedule.inc'

# Parse the file and export the resulting table to an Excel workbook
# in the current working directory.
output_df = make_parsing(filename=filename, key_words_for_delete=["WEFAC"])
output_df.to_excel('output.xlsx')