In [16]:
import sys
import pandas as pd
import logging

In [17]:
# Logging configuration: grab the root logger, let INFO records through,
# and install the default handler via basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig()

# Create an anagram test list

The following steps generate a file `anagrams.csv` with two word columns (plus the row index), each row containing a pair of strings to be checked in AWS Lambda. The file structure is:

|  | word_1 | word_2 |
| --- | --- | --- |
| 0 | acer | acre |


Steps:
1. parse HTML from a [website](http://itools.subhashbose.com/wordfind/common-anagrams/) showing a list of words; each word has at least one anagram
2. create a data frame with two columns of random words from the data parsed above
3. store the data frame as a CSV file



In [18]:
# Step 1: parse HTML data
# read_html yields a list of dataframes (one per parsed table); which of them
# holds the word table still has to be checked
WORD_LIST_URL = "http://itools.subhashbose.com/wordfind/common-anagrams/"
anagram_list = pd.read_html(WORD_LIST_URL)

logger.info('Fetched word list')

INFO:root:Fetched word list


In [19]:
# Inspect the raw result to see which parsed table holds the word grid
anagram_list

[                                                   0
 0  create_bm(); WordFind (adsbygoogle = window.ad...,
            0       1        2        3       4         5         6
 0       acer   acers     aces    aches    acme     acned      acre
 1      acres     act     acts     acyl     add      adds    adverb
 2       aesc  aether  aethers   afield     aft     agree    agrees
 3        ags      ah      ahs     aide   aides   airings    airmen
 4      alloy   almes      alp     alps  altern  altrices      alts
 ..       ...     ...      ...      ...     ...       ...       ...
 215     wets    what     when  whereat     who    whores      wigs
 216    wings   wipes     wolf   wolves     won      wons   wordier
 217  worried     wot     wrap    wraps   wrast   wreathe  wreathes
 218      yah   yaird      yap      yaw     yea     zendo     zoned
 219      NaN     NaN      NaN      NaN     NaN       NaN       NaN
 
 [220 rows x 7 columns]]

In [20]:
# the output is a list of two DataFrames; the word grid is the second one (index 1)
len(anagram_list)

2

In [21]:
# Show the second parsed table: the 7-column grid of words
anagram_list[1]

Unnamed: 0,0,1,2,3,4,5,6
0,acer,acers,aces,aches,acme,acned,acre
1,acres,act,acts,acyl,add,adds,adverb
2,aesc,aether,aethers,afield,aft,agree,agrees
3,ags,ah,ahs,aide,aides,airings,airmen
4,alloy,almes,alp,alps,altern,altrices,alts
...,...,...,...,...,...,...,...
215,wets,what,when,whereat,who,whores,wigs
216,wings,wipes,wolf,wolves,won,wons,wordier
217,worried,wot,wrap,wraps,wrast,wreathe,wreathes
218,yah,yaird,yap,yaw,yea,zendo,zoned


In [22]:
# Keep the word grid (second parsed table) for the restructuring that follows
anagram_df = anagram_list[1]

In [23]:
# Step 2: Restructure table into a two-column format which will give us pairs of strings
# a) stack every column of the word grid into one series, dropping missing
#    values (e.g. the all-NaN last row of the grid)
column_series = [anagram_df[label] for label in anagram_df.columns]
words = pd.concat(column_series).dropna()

words

0          acer
1         acres
2          aesc
3           ags
4         alloy
         ...   
214       wetas
215        wigs
216     wordier
217    wreathes
218       zoned
Length: 1533, dtype: object

In [24]:
# b) create df containing string pairs
# Both columns are drawn independently of each other, so most pairs are NOT
# anagrams. Fixed seeds make the generated test file reproducible across
# re-runs; the two seeds must differ, otherwise both draws would be identical
# and every word would be paired with itself.
N_PAIRS = 500
SAMPLE_SEED = 42
string_pair_df = pd.DataFrame({"word_1": words.sample(N_PAIRS, random_state=SAMPLE_SEED).values,
                               "word_2": words.sample(N_PAIRS, random_state=SAMPLE_SEED + 1).values})

In [25]:
# Preview the generated word pairs
string_pair_df

Unnamed: 0,word_1,word_2
0,slivers,kin
1,skied,net
2,adds,posts
3,aft,chin
4,tampons,reest
...,...,...
495,pets,tosh
496,filler,staple
497,dearths,tough
498,thae,ugh


In [26]:
# Step 3: serialize data
# The row index is written as well (pandas default), which produces the
# unnamed first column of the CSV
string_pair_df.to_csv('./anagrams.csv')

# Check anagram function

In [27]:
def is_anagram(word_1:str, word_2:str) -> bool:
    """Return True when the two words are case-insensitive anagrams.

    Both words are lower-cased and their sorted characters are compared, so
    "Lats" and "LAST" count as anagrams of each other.

    Args:
        word_1: first word of the pair.
        word_2: second word of the pair.

    Returns:
        True if both words are made of exactly the same characters
        (ignoring case), False otherwise.

    Raises:
        TypeError: if either argument has no ``lower()`` method
            (e.g. ``None`` or a float NaN).
    """
    try:
        normalized_1 = word_1.lower()
        normalized_2 = word_2.lower()
    except AttributeError as e:
        # Raise rather than exit the interpreter: a bad row should surface as
        # an error the caller can handle, not terminate the whole kernel or
        # process (and a bare sys.exit() would even report success, status 0).
        raise TypeError(
            f"is_anagram expects two strings, got {word_1!r} and {word_2!r}"
        ) from e

    return sorted(normalized_1) == sorted(normalized_2)

In [28]:
# Sanity check: tally how many of the generated pairs are anagrams
anagram_flags = string_pair_df.apply(lambda row: is_anagram(row.word_1, row.word_2), axis=1)
anagram_flags.value_counts()

False    499
True       1
dtype: int64

In [29]:
# Show only the rows whose two words are anagrams of each other
is_anagram_pair = string_pair_df.apply(lambda row: is_anagram(row.word_1, row.word_2), axis=1)
string_pair_df.loc[is_anagram_pair]

Unnamed: 0,word_1,word_2
494,lats,last
