In [3]:
import requests
from bs4 import BeautifulSoup
import csv
import re

def remove_subscripts(text):
    # Adjusted regular expression to handle different subscript patterns
    subscript_pattern = r'_{([^}]*)}'
    return re.sub(subscript_pattern, '', text)

url = 'https://www.trussel2.com/PNP/pnp-a.htm'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# List to store dictionary entries
entries = []

# Find all dictionary entries
for entry in soup.find_all('p', class_='hw'):
    # Find the word, part of speech, and translation within each entry
    word = remove_subscripts(entry.find('span', class_='headword').text)
    part_of_speech = entry.find('span', class_='g').text
    translation = entry.find('span', class_='m').text

    # Append the entry to the list
    entries.append({'word': word, 'part_of_speech': part_of_speech, 'translation': translation})

    # Print the information (optional)
    print(f'Word: {word}, Part of Speech: {part_of_speech}, Translation: {translation}')

# Write to CSV
csv_file_path = 'dictionary_data.csv'
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['word', 'part_of_speech', 'translation']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Write header
    writer.writeheader()

    # Write entries
    writer.writerows(entries)

print(f'Data has been successfully written to {csv_file_path}')




Word: ah₁, Part of Speech: n., Translation: name of the letter a, the first letter of the Pohnpeian alphabet, used to represent the phoneme /a/, a  low central vowel
Word: ah₂, Part of Speech: poss. cl., Translation: his, her, hers, its; third person singular form of the general possessive classifier
Word: ah₃, Part of Speech: conj., Translation: however, and (signalling a contrastive relationship between conjoined clauses), then
Word: ah₄, Part of Speech: n., Translation: fish sp., bluespot mullet, Valamugil seheli, at a growth stage of approximately 12 inches
Word: ah₅, Part of Speech: interj., Translation: Oh!; commonly used as an expression of approval
Word: ai, Part of Speech: interj., Translation: No way!
Word: ahi₁, Part of Speech: poss. cl., Translation: my, mine
Word: ahi₂, Part of Speech: n., Translation: fire
Word: ahia, Part of Speech: n., Translation: rainbow
Word: aiau, Part of Speech: n., Translation:  banyan tree, Ficus prolixa and Ficus virens; leaves, fruit, bark, and

In [8]:
import requests
from bs4 import BeautifulSoup
import csv
import re

def filter_ascii(text):
    # Filter characters to ASCII (1-128) range
    filtered_text = ''.join(char for char in text if 1 <= ord(char) <= 128)
    return filtered_text

url = 'https://www.trussel2.com/PNP/pnp-a.htm'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# List to store dictionary entries
entries = []

# Find all dictionary entries
for entry in soup.find_all('p', class_='hw'):
    # Find the word, part of speech, and translation within each entry
    word_element = entry.find('span', class_='headword')
    word = filter_ascii(word_element.get_text(separator=' ')) if word_element else ''

    part_of_speech = filter_ascii(entry.find('span', class_='g').text)
    translation = filter_ascii(entry.find('span', class_='m').text)

    # Append the entry to the list
    entries.append({'word': word, 'part_of_speech': part_of_speech, 'translation': translation})

    # Print the information (optional)
    print(f'Word: {word}, Part of Speech: {part_of_speech}, Translation: {translation}')

# Write to CSV
csv_file_path = 'dictionary_data.csv'
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['word', 'part_of_speech', 'translation']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Write header
    writer.writeheader()

    # Write entries
    writer.writerows(entries)

print(f'Data has been successfully written to {csv_file_path}')



Word: ah, Part of Speech: n., Translation: name of the letter a, the first letter of the Pohnpeian alphabet, used to represent the phoneme /a/, a  low central vowel
Word: ah, Part of Speech: poss. cl., Translation: his, her, hers, its; third person singular form of the general possessive classifier
Word: ah, Part of Speech: conj., Translation: however, and (signalling a contrastive relationship between conjoined clauses), then
Word: ah, Part of Speech: n., Translation: fish sp., bluespot mullet, Valamugil seheli, at a growth stage of approximately 12 inches
Word: ah, Part of Speech: interj., Translation: Oh!; commonly used as an expression of approval
Word: ai, Part of Speech: interj., Translation: No way!
Word: ahi, Part of Speech: poss. cl., Translation: my, mine
Word: ahi, Part of Speech: n., Translation: fire
Word: ahia, Part of Speech: n., Translation: rainbow
Word: aiau, Part of Speech: n., Translation:  banyan tree, Ficus prolixa and Ficus virens; leaves, fruit, bark, and roots 

In [12]:
import requests
from bs4 import BeautifulSoup
import csv
import re

def filter_ascii(text):
    # Filter characters to ASCII (1-128) range
    filtered_text = ''.join(char for char in text if (1 <= ord(char) <= 128) and char != '-')
    return filtered_text

def scrape_and_save_to_csv(url, entries):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all dictionary entries
    for entry in soup.find_all('p', class_='hw'):
        # Find the word, part of speech, and translation within each entry
        word_element = entry.find('span', class_='headword')
        word = filter_ascii(word_element.get_text(separator=' ')) if word_element else ''

        part_of_speech = filter_ascii(entry.find('span', class_='g').text) if entry.find('span', class_='g') else ''

        translation = filter_ascii(entry.find('span', class_='m').text) if entry.find('span', class_='m') else ''

        # Append the entry to the list
        entries.append({'word': word, 'part_of_speech': part_of_speech, 'translation': translation})

        # Print the information (optional)
        # print(f'Word: {word}, Part of Speech: {part_of

# List to store dictionary entries
all_entries = []

# List of URLs to scrape
urls_to_scrape = [
    'https://www.trussel2.com/PNP/pnp-a.htm',
    'https://www.trussel2.com/PNP/pnp-e.htm',
    'https://www.trussel2.com/PNP/pnp-i.htm',
    'https://www.trussel2.com/PNP/pnp-o.htm',
    'https://www.trussel2.com/PNP/pnp-oa.htm',
    'https://www.trussel2.com/PNP/pnp-u.htm',
    'https://www.trussel2.com/PNP/pnp-k.htm',
    'https://www.trussel2.com/PNP/pnp-l.htm',
    'https://www.trussel2.com/PNP/pnp-m.htm',
    'https://www.trussel2.com/PNP/pnp-mw.htm',
    'https://www.trussel2.com/PNP/pnp-n.htm',
    'https://www.trussel2.com/PNP/pnp-ng.htm',
    'https://www.trussel2.com/PNP/pnp-p.htm',
    'https://www.trussel2.com/PNP/pnp-pw.htm',
    'https://www.trussel2.com/PNP/pnp-a.htm',
    'https://www.trussel2.com/PNP/pnp-r.htm',
    'https://www.trussel2.com/PNP/pnp-s.htm',
    'https://www.trussel2.com/PNP/pnp-d.htm',
    'https://www.trussel2.com/PNP/pnp-t.htm',
    'https://www.trussel2.com/PNP/pnp-w.htm',
    # Add more URLs as needed
]

# Scrape data from each URL
for url in urls_to_scrape:
    scrape_and_save_to_csv(url, all_entries)

# Write to CSV
csv_file_path = 'pohnpeian_dictionary2.csv'
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['word', 'part_of_speech', 'translation']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Write header
    writer.writeheader()

    # Write entries
    writer.writerows(all_entries)

print(f'Data from all URLs has been successfully written to {csv_file_path}')


Data from all URLs has been successfully written to pohnpeian_dictionary2.csv


In [14]:
import re

def process_text(input_file, output_file):
    with open(input_file, 'r') as file:
        text = file.read()

    # Remove punctuation using regular expression
    text = re.sub(r'[^\w\s]', '', text)

    # Split the text into tokens
    tokens = text.split()

    with open(output_file, 'w') as file:
        # Write each token on a separate line
        file.write('\n'.join(tokens))

# Replace 'input.txt' and 'output.txt' with your file names
process_text('pon2006_002_GEN_01_read.txt', 'pniGen01.txt')
