In [1]:
# IPython magics: enable the autoreload extension in mode 2 so that edits to
# imported modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from glob import glob
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import re
from ast import literal_eval

# Remove the column-width limit so long sentences are displayed in full.
# None is the supported "no limit" value; the legacy -1 sentinel has been
# deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

In [3]:
def get_corpus(path='corpus_xavier/*.txt'):
    """Load every text file matching ``path`` and rebuild one sentence per row.

    A line whose first character is a lowercase letter is treated as the
    continuation of the line before it: each run of such lines is appended,
    space-separated, to the line that precedes the run. Every other line
    starts a new entry. Only the trailing newline is stripped from each line,
    and entries that end up empty are dropped.

    All files are read into one flat list of lines before merging, so a
    lowercase first line of one file continues the last line of the previous
    file.

    Args:
        path: glob pattern of the UTF-8 text files to read.

    Returns:
        pandas.DataFrame with a single ``sentences`` column. The index is not
        reset after the empty rows are removed. The frame is empty when no
        file matches ``path``.
    """
    lines = []
    # Sorted so that row order (and merges across file boundaries) does not
    # depend on the order in which the OS happens to list the files.
    for file in sorted(glob(path)):
        with open(file, 'r', encoding='utf8') as f:
            lines.extend(f.readlines())

    # Each element is the list of fragments making up one rebuilt sentence.
    sentences = []
    for line in tqdm(lines):
        text = line.strip('\n')
        # A lowercase start with nothing before it (very first line) cannot be
        # merged backwards, so it simply opens its own entry.
        if sentences and text[:1].islower():
            sentences[-1].append(text)
        else:
            sentences.append([text])
    print("Preprocessing over")

    processed = [' '.join(fragments) for fragments in sentences]
    processed_df = pd.DataFrame(processed, columns=['sentences'])

    # Blank source lines produce empty entries; discard them.
    return processed_df[processed_df['sentences'] != '']

# processed_df = get_corpus()
# processed_df.to_csv('corpus_xavier/annot_test.csv', index=False, encoding='utf8')

In [4]:
def get_filters(
    folder='context_java_abdaoui',
    neg_only=True,
    ):
    """Collect the cue expressions declared in ``<folder>/regex*.txt``.

    Only the first line of each file is read. It is parsed as a Python
    literal (a list/tuple/set of ``"regex,scope,type"`` strings, or a single
    such string). When it is not a valid literal, it is split manually on
    the ``", "`` separator instead. Quotes are removed from every entry and
    the last two comma-separated fields are taken as scope and type.

    Args:
        folder: directory containing the ``regex*.txt`` files.
        neg_only: when True, keep only the entries whose type is ``'neg'``.

    Returns:
        pandas.Series named ``regex``. Duplicated regexes are removed (first
        occurrence wins, files being visited in sorted name order) before the
        type filter is applied. Empty when nothing is found.
    """
    def strip_field(value):
        # Padding values (None) are passed through untouched.
        return value.strip() if isinstance(value, str) else value

    all_regex = []

    # Sorted so that "first occurrence wins" is reproducible across machines.
    for file in sorted(glob(folder + '/regex*.txt')):
        with open(file, 'r', encoding="utf8") as f:
            # Strip the trailing newline up front: otherwise the manual
            # fallback leaves it glued to the type of the last entry.
            first_line = f.readline().strip()

        if not first_line:
            continue  # empty file: nothing to collect

        try:
            parsed = literal_eval(first_line)
        except (ValueError, TypeError, SyntaxError):
            parsed = None

        if isinstance(parsed, str):
            # A lone string literal is one entry, not a sequence of characters.
            all_regex.append(parsed)
        elif isinstance(parsed, (list, tuple, set)):
            all_regex.extend(parsed)
        else:
            # Not a usable literal: split the raw text on the quote separators.
            all_regex.extend(first_line.strip("\"").split('", "'))

    formatted_regex = []
    for entry in all_regex:
        fields = entry.replace('"', '').replace("'", '').replace(" ,", ',').rsplit(',', 2)
        # Entries with fewer than three fields are right-padded with None so
        # the frame can always be built.
        fields += [None] * (3 - len(fields))
        regex, scope, kind = fields
        # Scope/type are compared as exact labels, so surrounding blanks
        # (e.g. "regex, scope, neg") must not make the comparison fail.
        formatted_regex.append([regex, strip_field(scope), strip_field(kind)])

    regex_df = pd.DataFrame(formatted_regex, columns=["regex", "scope", "type"])
    regex_df = regex_df.drop_duplicates('regex')

    if neg_only:
        regex_df = regex_df[regex_df['type'].isin(['neg'])]

    return regex_df['regex']



In [5]:
# Keep only the sentences of a preprocessed df that contain a negation cue
def keep_neg_only(df):
    """Return the rows of ``df`` whose sentence contains a negation cue.

    The cues come from ``get_filters()``. Each one is matched literally
    (``re.escape``) and must be surrounded by one space on each side, so a
    cue at the very start/end of a sentence or next to punctuation is not
    detected.

    Args:
        df: DataFrame with a ``sentences`` column.

    Returns:
        The matching rows of ``df`` (original index kept). Rows whose
        sentence is missing never match, and the result is empty when there
        is no cue to look for.
    """
    # 'de' is left out on purpose: it would keep far too many sentences.
    excluded = ('de', '(de')
    cues = [re.escape(' %s ' % (f)) for f in get_filters().values if f not in excluded]

    if not cues:
        # An empty alternation would match every sentence; with no cue to
        # search for, nothing qualifies.
        return df.iloc[0:0]

    pattern = '|'.join(cues)

    # na=False: a missing sentence yields False instead of NaN, which boolean
    # indexing would reject.
    has_cue = df['sentences'].str.contains(pattern, regex=True, na=False)

    return df[has_cue]

# pp_df = pd.read_csv('corpus_xavier/annot_test.csv')
# filtered_pp_df = keep_neg_only(pp_df)
# filtered_pp_df.to_csv('corpus_xavier/annot_test_filtered.csv', index=False)

In [6]:
# Load the filtered corpus CSV and expose its 'sentences' column as 'text'.
filtered_pp_df = (
    pd.read_csv('corpus_xavier/annot_test_filtered.csv')
    .rename(columns={'sentences': 'text'})
)

In [8]:
#filtered_pp_df.head(100).to_csv('corpus_xavier/annot_test_filtered100.csv', index=False)

In [16]:
# Fold the 'text' column to plain ASCII (accents dropped) and collapse
# whitespace, then export the result. Punctuation and digits are kept: the
# \W replacement below is disabled.
#filtered_pp_df['text'] = filtered_pp_df['text'].str.replace('\W', ' ')

# NFKD does not decompose these ligatures, so the ASCII encode step would
# silently delete them ("cœur" -> "cur"); spell them out first. Other
# characters without an ASCII decomposition are still dropped.
_LIGATURES = str.maketrans({'œ': 'oe', 'Œ': 'OE', 'æ': 'ae', 'Æ': 'AE'})

filtered_pp_df = filtered_pp_df.assign(
    text=(
        filtered_pp_df['text']
        .str.translate(_LIGATURES)
        # remove accents: decompose, then drop the non-ASCII combining marks
        .str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('ascii')
        # remove double spaces: split on any whitespace run and re-join with
        # single spaces (this also trims both ends); missing values stay missing
        .str.split()
        .str.join(' ')
    )
)

filtered_pp_df.to_csv('corpus_xavier/annot_test_filtered_noaccent.csv', index=False)