In [3]:
import pandas as pd
import os
import numpy as np
import re
import warnings
import zipfile
import io




In [10]:
# Load the tree-species table straight from the zip archive, without
# extracting it to disk first.

# The name of the zip file
zip_file_name = 'archive.zip'

# The name of the CSV file inside the zip
csv_file_name = 'CommonTreeSpecies-USForests.csv'

try:
    # Open the zip archive in read mode
    with zipfile.ZipFile(zip_file_name, 'r') as zf:
        # Open the specific CSV file from within the zip and parse it
        # directly into a pandas DataFrame
        with zf.open(csv_file_name) as f:
            df = pd.read_csv(f)

except FileNotFoundError:
    print(f"Error: '{zip_file_name}' not found. Make sure it's in the correct directory.")
    # Re-raise: continuing would either hit a confusing NameError on `df`
    # below or silently display a stale `df` left over from an earlier run.
    raise
except KeyError:
    # ZipFile.open() raises KeyError when the archive has no such member.
    print(f"Error: File '{csv_file_name}' was not found inside the zip.")
    raise

# Preview a few random rows to sanity-check the load.
df.sample(5)

Unnamed: 0,TreeGroup,SpeciesName
392,Other Yellow Pines,Virginia Pine (Pinus Virginiana)
358,Other Western Softwoods,Gray Or California Foothill Pine (Pinus Sabini...
198,Other Eastern Hard Hardwoods,Sweet Birch (Betula Lenta)
238,Other Eastern Soft Hardwoods,River Birch (Betula Nigra)
190,Other Eastern Hard Hardwoods,Honeylocust Gleditsia (Gleditsia Triacanthos)


In [20]:
# Clean the species column and build the word list of tree names used by the game.

# Work on an explicit copy: assigning into a bare column selection of `df`
# triggers pandas' SettingWithCopyWarning and blurs whether `df` is modified.
trees = df[['SpeciesName']].copy()

# Rows with a missing name cannot become words. Without this they would pass
# the 'unknown' filter below (na=False is negated to True) and be exported
# as blank lines.
trees = trees.dropna(subset=['SpeciesName'])

# - remove scientific name from tree name (keep the text before the first '(')
trees['SpeciesName'] = trees['SpeciesName'].str.split('(').str[0].str.strip()

# remove trees with 'unknown' in their name
mask = ~trees['SpeciesName'].str.contains('unknown', case=False, na=False)
trees = trees.loc[mask].copy()

# - replace dashes with spaces between names
# - strip stray double quotes from the ends
# - some tree names have two comma-separated names, use the first
# - remove 'Spp.' from tree names
# - capitalize all letters
# regex=False makes both replacements literal regardless of pandas version
# (the '.' in ' Spp.' must not act as a regex wildcard).
# NOTE(review): names joined with ' Or ' (e.g. 'BAKER OR MODOC CYPRESS') are
# left intact; confirm whether the game should split those as well.
trees['SpeciesName'] = (
    trees['SpeciesName']
    .str.replace('-', ' ', regex=False)
    .str.strip('"')
    .str.split(',').str[0]
    .str.replace(' Spp.', '', regex=False)
    .str.strip()
    .str.upper()
)

# Distinct source rows can collapse to the same (or an empty) string once the
# scientific name is removed and the text is upper-cased; a word list should
# contain each non-empty word exactly once.
trees = (
    trees
    .loc[trees['SpeciesName'].ne('')]
    .drop_duplicates(subset='SpeciesName')
)

trees.sample(5)

Unnamed: 0,SpeciesName
281,NORTHERN RED
80,GUMBO LIMBO
354,COULTER PINE
350,BAKER OR MODOC CYPRESS
205,ALLEGHENY CHINKAPIN


In [21]:
# Export the word list: write only the SpeciesName column to 'wordlist.txt',
# one row per line, with no header row and no index column.
trees.to_csv('wordlist.txt', columns=['SpeciesName'], index=False, header=False)