# Task 1: Extract the text from multiple CSV files

Name : Sayeed Anwar
ID   : s384116
GitHub Link : https://github.com/cas119/HIT137-software-now-cas119

In [1]:
import pandas as pd
import os
import zipfile
import warnings
import re
# Suppress all warnings: with no category or message given, this "ignore"
# rule matches every warning raised after this point in the session.
warnings.filterwarnings("ignore")

class TextProcessor:
    """Combine the ``TEXT`` columns of zipped CSV files into one cleaned text file.

    Every ``.csv`` member of the archive is loaded with pandas; each non-null
    value of its ``TEXT`` column is reduced to ASCII letters separated by
    single spaces and written as one line of the output file.

    Attributes:
        zip_path: Path of the zip archive holding the CSV files.
        output_directory: Directory that receives the output file.
        output_file: File name of the combined output.
        chunk_size: Stored on the instance for callers; no method of this
            class reads it.
        output_path: ``output_directory`` joined with ``output_file``.
    """

    # Compiled once for the class instead of being re-specified on every row.
    _NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self, zip_path, output_directory, output_file, chunk_size=1024*1024):
        """Record the input archive and output location; performs no I/O."""
        self.zip_path = zip_path
        self.output_directory = output_directory
        self.output_file = output_file
        self.chunk_size = chunk_size
        self.output_path = os.path.join(output_directory, output_file)

    @staticmethod
    def create_output_directory(directory):
        """Create ``directory`` (and missing parents) if it does not exist."""
        os.makedirs(directory, exist_ok=True)

    def extract_csv_from_zip(self):
        """Read every CSV member of the archive into a DataFrame.

        Returns:
            A list of DataFrames, one per ``.csv`` member, in archive order.
        """
        dfs = []
        with zipfile.ZipFile(self.zip_path, 'r') as zip_ref:
            for file_info in zip_ref.infolist():
                # Match the extension case-insensitively so members named
                # e.g. ``DATA.CSV`` are not silently skipped.
                if not file_info.filename.lower().endswith('.csv'):
                    continue
                with zip_ref.open(file_info) as file:
                    dfs.append(pd.read_csv(file))
        return dfs

    @staticmethod
    def extract_and_clean_texts(dfs):
        """Yield the cleaned form of each non-null ``TEXT`` value.

        DataFrames without a ``TEXT`` column are skipped. Values that clean
        down to an empty string are still yielded.
        """
        for df in dfs:
            if 'TEXT' not in df.columns:
                continue
            for text in df['TEXT'].dropna():
                yield TextProcessor.clean_text(text)

    @staticmethod
    def clean_text(text):
        """Return ``text`` reduced to ASCII letters separated by single spaces.

        Non-string values (for example numbers pandas parsed from a numeric
        or mixed ``TEXT`` column) are converted with ``str`` first, so they
        are cleaned like any other text instead of raising ``TypeError``.
        """
        if not isinstance(text, str):
            text = str(text)
        letters_only = TextProcessor._NON_LETTER_RE.sub('', text)
        collapsed = TextProcessor._WHITESPACE_RE.sub(' ', letters_only)
        return collapsed.strip()

    def process_texts(self):
        """Write every cleaned text, one per line, to ``self.output_path``.

        Creates the output directory if needed and overwrites any existing
        output file.
        """
        self.create_output_directory(self.output_directory)
        dfs = self.extract_csv_from_zip()

        with open(self.output_path, 'w', encoding='utf-8') as outfile:
            for cleaned_text in self.extract_and_clean_texts(dfs):
                outfile.write(cleaned_text + '\n')


# Task 1 inputs: the archive to read and where the combined text is written.
zip_path, output_directory, output_file = (
    './CSV.zip',
    './output',
    'combined_texts.txt',
)

# Build the processor and run the whole extract/clean/write pipeline.
processor = TextProcessor(
    zip_path=zip_path,
    output_directory=output_directory,
    output_file=output_file,
)
processor.process_texts()


# Task 2: Install the necessary libraries

In [2]:
# Announce the start of the (optional) dependency installation step.
print("Installing...")
# NOTE: the install commands below are commented out, so running this cell
# as written installs nothing. Uncomment them (notebook `!` shell syntax) on
# first use: spaCy, scispaCy, two scispaCy v0.5.0 model archives, and
# transformers.
# !pip install spacy
# !pip install scispacy
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz
# !pip install transformers
print("Done.")

Installing...
Done.
