# Process text file scripts

### Import Required Libraries

In [None]:
import os
from google.colab import drive
# Mount Google Drive at /content/drive so the project folders can be reached through the filesystem
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Change directory to new data folder
# Use an absolute path anchored at the mount point ('/content/drive') so the
# cell works regardless of the current working directory and can be re-run.
os.chdir("/content/drive/My Drive/w266-project/new_data/final/final")

In [None]:
import re

#assign input and output folder path
input_folder_path = '.'
output_folder_path = 'final'

# Matches a line that ends with a bracketed number such as "[12]".
BRACKETED_NUMBER_AT_END = re.compile(r'\[\d+\]$')


def extract_qa_lines(lines):
    """Return the cleaned lines of the "Questions and Answers" section.

    Args:
        lines: The raw lines of one transcript, as returned by ``readlines()``.

    Returns:
        A list of stripped lines taken from after the last
        "Questions and Answers" heading seen before "Definitions", up to
        (not including) the first "Definitions" heading. If a heading is
        missing, the corresponding end of the file is used instead. Blank
        lines, lines starting with '-' or '(', and lines ending in a
        bracketed number are dropped; the markers '[1]', '[2]' and '[3]'
        are removed from the remaining lines.
    """
    start_index = None
    definitions_index = None
    for i, line in enumerate(lines):
        if line.strip() == 'Questions and Answers':
            start_index = i + 1
        if line.strip() == 'Definitions':
            definitions_index = i
            break

    # Both indexes refer to positions in the ORIGINAL list, so slice once.
    # Slicing off the head first and then applying definitions_index to the
    # shortened list would cut too late and keep text past "Definitions".
    first = 0 if start_index is None else start_index
    section = lines[first:definitions_index]

    processed_lines = []
    for line in section:
        if (not line.strip()
                or line.startswith(('-', '('))
                or BRACKETED_NUMBER_AT_END.search(line)):
            continue
        line = line.replace('[1]', '').replace('[2]', '').replace('[3]', '')
        processed_lines.append(line.strip())
    return processed_lines


# The output folder lives inside the input tree; remember its absolute path so
# the walk can skip it and never re-process files written by an earlier run.
output_root = os.path.abspath(output_folder_path)

for root, dirs, files in os.walk(input_folder_path):
    # Editing dirs in place tells os.walk not to descend into the output folder
    dirs[:] = [d for d in dirs
               if os.path.abspath(os.path.join(root, d)) != output_root]

    for file_name in files:
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(root, file_name)

        # Read the raw transcript
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()

        output_text = '\n'.join(extract_qa_lines(lines))

        # Mirror the input's relative location under the output folder
        relative_path = os.path.relpath(file_path, input_folder_path)
        output_file_path = os.path.join(output_folder_path, relative_path)

        # Create the directory structure for the output file
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

        # Write the processed content to a new file, overwriting if it exists
        with open(output_file_path, 'w', encoding='utf-8') as file:
            file.write(output_text)


# Do Similar Analysis for Prepared Remarks & Count Tokens

In [None]:
#Change directory to new data folder
# The path must be absolute: resolving "drive/My Drive/..." relative to a
# working directory that is already inside the Drive tree raises
# FileNotFoundError.
os.chdir("/content/drive/My Drive/w266-project/new_data")

FileNotFoundError: ignored

In [None]:
import re

#assign input and output folder path
input_folder_path = '.'
output_folder_path = 'final-presentation'

# Headings that open the prepared-remarks section (compared after stripping).
PRESENTATION_HEADINGS = ('Presentation', 'PRESENTATION', 'PRESENTATION SUMMARY')
# Any line containing "Questions and Answers" (any case) ends the section.
QA_HEADING = re.compile(r'Questions\s+and\s+Answers', re.IGNORECASE)
# Matches a line that ends with a bracketed number such as "[12]".
BRACKETED_NUMBER_LINE_END = re.compile(r'\[\d+\]$')


def extract_presentation_lines(lines):
    """Return the cleaned lines of the presentation (prepared remarks) section.

    Args:
        lines: The raw lines of one transcript, as returned by ``readlines()``.

    Returns:
        A list of stripped lines taken from after the last presentation
        heading seen before the Q&A marker, up to (not including) the first
        line that mentions "Questions and Answers". If a marker is missing,
        the corresponding end of the file is used instead. Blank lines, lines
        starting with '=', '-' or '(', and lines ending in a bracketed number
        are dropped; the markers '[1]', '[2]' and '[3]' are removed from the
        remaining lines.
    """
    start_index = None
    end_index = None
    for i, line in enumerate(lines):
        if line.strip() in PRESENTATION_HEADINGS:
            start_index = i + 1
        if QA_HEADING.search(line):
            end_index = i
            break

    # Both indexes refer to positions in the ORIGINAL list, so slice once.
    # Slicing off the head first and then applying end_index to the shortened
    # list would cut too late and keep part of the Q&A section.
    first = 0 if start_index is None else start_index
    section = lines[first:end_index]

    processed_lines = []
    for line in section:
        if (not line.strip()
                or line.startswith(('=', '-', '('))
                or BRACKETED_NUMBER_LINE_END.search(line)):
            continue
        line = line.replace('[1]', '').replace('[2]', '').replace('[3]', '')
        processed_lines.append(line.strip())
    return processed_lines


# The output folder lives inside the input tree; remember its absolute path so
# the walk can skip it and never re-process files written by an earlier run.
output_root = os.path.abspath(output_folder_path)

for root, dirs, files in os.walk(input_folder_path):
    # Editing dirs in place tells os.walk not to descend into the output folder
    dirs[:] = [d for d in dirs
               if os.path.abspath(os.path.join(root, d)) != output_root]

    for file_name in files:
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(root, file_name)

        # Read the raw transcript
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()

        output_text = '\n'.join(extract_presentation_lines(lines))

        # Mirror the input's relative location under the output folder
        relative_path = os.path.relpath(file_path, input_folder_path)
        output_file_path = os.path.join(output_folder_path, relative_path)

        # Create the directory structure for the output file
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

        # Write the processed content to a new file, overwriting if it exists
        with open(output_file_path, 'w', encoding='utf-8') as file:
            file.write(output_text)


# Compute Token Length and Unique Tokens By Input File

In [None]:
# Load the documents of a flat folder (no subfolders) as normalized text
def getPredData(dataPath):
    """Read every file directly inside ``dataPath`` as lower-cased text.

    Files are read in sorted name order. Each line is stripped of
    surrounding whitespace and lower-cased; the lines of one file are joined
    with newlines and terminated with a single trailing newline.

    Args:
        dataPath: Path of the folder whose entries are read. Every entry is
            opened as a UTF-8 text file, so the folder is expected to hold
            files only.

    Returns:
        A list with one normalized string per file.
    """
    PredData = []
    for file in sorted(os.listdir(dataPath)):
        # The context manager closes each file as soon as it has been read
        with open(os.path.join(dataPath, file), 'r', encoding='utf8') as doc_in:
            dataset = [line.strip().lower() for line in doc_in]
        PredData.append('\n'.join(dataset) + '\n')

    return PredData

In [None]:
# Load the documents of a folder tree (subfolders included) as normalized text

def getPredDataSub(dataPath):
    """Read every file found under ``dataPath`` as lower-cased text.

    The folder is traversed with ``os.walk``, so files in nested subfolders
    are included. Each line is stripped of surrounding whitespace and
    lower-cased; the lines of one file are joined with newlines and
    terminated with a single trailing newline.

    Args:
        dataPath: Root folder to traverse.

    Returns:
        A list with one normalized string per file, in traversal order.
    """
    documents = []
    for folder, _subfolders, names in os.walk(dataPath):
        for name in names:
            with open(os.path.join(folder, name), 'r', encoding='utf8') as handle:
                cleaned = [raw.strip().lower() for raw in handle.readlines()]
            documents.append('\n'.join(cleaned) + '\n')
    return documents

In [None]:
# Load every file under the Q&A folder, including files in its subfolders
# NOTE(review): this looks like the same folder the Q&A cleaning cell reads
# from while writing its output to a 'final' subfolder; if so, the recursive
# load returns both the raw and the cleaned files - confirm this is intended.
predpath = '/content/drive/MyDrive/w266-project/new_data/final/final'
QADataset = getPredDataSub(predpath)

In [None]:
# For each document, add up the character lengths of its whitespace-separated tokens
# NOTE(review): this is a per-document character total, not a token count - confirm which is intended
average_lengths = [sum(map(len, item.split())) for item in QADataset]

# Mean of the per-document totals over all documents
document_count = len(average_lengths)
overall_average_length = sum(average_lengths) / document_count

print(overall_average_length)

25607.197802197803


In [None]:
# Compute the same for paraphrased summaries

182

In [None]:
# Load the reference summaries; getPredData reads only the files directly inside the folder
predpath = '/content/drive/MyDrive/w266-project/supert/ref_summaries'
PredData = getPredData(predpath)

In [None]:
# For each summary, add up the character lengths of its whitespace-separated tokens
# NOTE(review): this is a per-document character total, not a token count - confirm which is intended
average_lengths = [sum(map(len, item.split())) for item in PredData]

# Mean of the per-summary totals over all summaries
summary_count = len(average_lengths)
overall_average_length = sum(average_lengths) / summary_count

print(overall_average_length)

3163.598901098901
