## Pattern Extraction using Python

In [10]:
import os
import re
import pandas as pd

def extract_numbers_from_files(directory):
    """Collect every ``ddd-dd-dddd`` number found in the ``.txt`` files of a folder.

    Parameters
    ----------
    directory : str
        Folder to scan. Only its direct entries are examined (no recursion),
        and only regular files whose name ends with ``.txt`` are read.

    Returns
    -------
    list[tuple[str, str]]
        One ``(filename, matched_number)`` pair per occurrence. Files are
        visited in sorted name order and matches inside a file keep their
        order of appearance, so the result is reproducible across runs.

    Raises
    ------
    OSError
        If ``directory`` cannot be listed or a file cannot be opened.
    UnicodeDecodeError
        If a ``.txt`` file is not valid UTF-8.
    """
    # Three digits, two digits, four digits, separated by hyphens. The \b
    # anchors keep the match from starting or ending inside a longer run of
    # word characters (e.g. "1234-56-7890" is rejected).
    pattern = re.compile(r'\b\d{3}-\d{2}-\d{4}\b')

    # Accumulates (filename, matched_number) tuples.
    results = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # output independent of the file system.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(directory, filename)
        # A sub-directory may also be named "*.txt"; open() would fail on it.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            file_contents = file.read()
        results.extend(
            (filename, match) for match in pattern.findall(file_contents)
        )

    return results

def write_to_excel(data, output_file):
    """Save ``(filename, value)`` rows to an Excel workbook.

    ``data`` is turned into a two-column table ('Filename', 'Data') and
    written to ``output_file`` without the DataFrame index.
    """
    column_names = ['Filename', 'Data']
    table = pd.DataFrame(data, columns=column_names)
    table.to_excel(output_file, index=False)

# Directory containing the .txt files to scan.
# NOTE(review): absolute, machine-specific Windows path; adjust before running elsewhere.
directory = r'C:\Users\Ananya\Downloads\TEXT 2'
# Output Excel file path for the pattern-extraction results.
# NOTE(review): presumably the target folder must already exist; confirm before running.
output_excel =r'D:\Desired Output Assignment\PatternExtraction_output.xlsx'


# Extract the (filename, number) pairs from every .txt file in `directory`
# and write them to the workbook at `output_excel`.
extracted_data_pattern = extract_numbers_from_files(directory)
write_to_excel(extracted_data_pattern, output_excel)

In [5]:
# Install the Excel writer engine into the running kernel's environment.
%pip install xlsxwriter

Collecting xlsxwriter
  Obtaining dependency information for xlsxwriter from https://files.pythonhosted.org/packages/a7/ea/53d1fe468e63e092cf16e2c18d16f50c29851242f9dd12d6a66e0d7f0d02/XlsxWriter-3.2.0-py3-none-any.whl.metadata
  Downloading XlsxWriter-3.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
   ---------------------------------------- 0.0/159.9 kB ? eta -:--:--
   -- ------------------------------------- 10.2/159.9 kB ? eta -:--:--
   ------- ------------------------------- 30.7/159.9 kB 325.1 kB/s eta 0:00:01
   ------------------- ------------------- 81.9/159.9 kB 657.6 kB/s eta 0:00:01
   ---------------------------------------- 159.9/159.9 kB 1.1 MB/s eta 0:00:00
Installing collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.0
Note: you may need to restart the kernel to use updated packages.


## Keyword Matching using Python

In [11]:
import os
import re
import pandas as pd

def extract_terms_from_files(directory, terms):
    """Find whole-word, case-insensitive occurrences of literal terms in ``.txt`` files.

    Parameters
    ----------
    directory : str
        Folder to scan. Only its direct entries are examined (no recursion),
        and only regular files whose name ends with ``.txt`` are read.
    terms : iterable of str
        Literal keywords or phrases to look for. Regex metacharacters in a
        term are matched literally. Empty strings are ignored.

    Returns
    -------
    list[tuple[str, str]]
        One ``(filename, matched_text)`` pair per occurrence, where
        ``matched_text`` is the text exactly as it appears in the file.
        Files are visited in sorted name order; matches inside a file keep
        their order of appearance. Returns ``[]`` when no usable term is given.

    Raises
    ------
    OSError
        If ``directory`` cannot be listed or a file cannot be opened.
    UnicodeDecodeError
        If a ``.txt`` file is not valid UTF-8.
    """
    # Regex alternation takes the first alternative that matches, so a short
    # term listed before a longer one sharing its prefix ("SIS" vs "SIS ID")
    # would hide the longer one. Trying longer terms first avoids that; the
    # sort is stable, so equal-length terms keep the caller's order.
    ordered_terms = sorted((term for term in terms if term), key=len, reverse=True)

    # With no terms the pattern would degenerate to r'\b(?:)\b', which
    # matches the empty string at every word boundary.
    if not ordered_terms:
        return []

    # re.escape() makes each term a literal, so characters such as '.' or
    # '+' inside a keyword are not interpreted as regex syntax.
    pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(term) for term in ordered_terms) + r')\b',
        flags=re.IGNORECASE,
    )

    # Accumulates (filename, matched_text) tuples.
    results = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # output independent of the file system.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(directory, filename)
        # A sub-directory may also be named "*.txt"; open() would fail on it.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            file_contents = file.read()
        results.extend(
            (filename, match) for match in pattern.findall(file_contents)
        )

    return results

def write_to_excel(data, output_file):
    """Save ``(filename, term)`` rows to an Excel workbook.

    ``data`` is turned into a two-column table ('Filename', 'Matched Term')
    and written to ``output_file`` without the DataFrame index.
    """
    column_names = ['Filename', 'Matched Term']
    table = pd.DataFrame(data, columns=column_names)
    table.to_excel(output_file, index=False)

# Directory containing the .txt files to scan.
# NOTE(review): absolute, machine-specific Windows path; adjust before running elsewhere.
directory = r'C:\Users\Ananya\Downloads\TEXT 2'
# Output Excel file path for the keyword-match results.
# NOTE(review): presumably the target folder must already exist; confirm before running.
output_excel = r'D:\Desired Output Assignment\KeywordExtraction_output.xlsx'

# Keywords and phrases to search for; extract_terms_from_files matches them
# case-insensitively and only at word boundaries.
search_terms = ['Student Id','Student Name','Student Grade','Grade','Roll Number','Roll No','SSN','Social Security Number','Social Security','Social Security Numbers','DOB','Date of Birth','DLN','DL','Driving License','Passport','Credit Card','CC','SIS','SIS ID','SID']

# Collect the (filename, matched term) pairs from every .txt file in
# `directory` and write them to the workbook at `output_excel`.
extracted_data_keyword = extract_terms_from_files(directory, search_terms)
write_to_excel(extracted_data_keyword, output_excel)