### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import itertools as iter
import json
from json import dump

### Read In Master Word Dataframe

In [2]:
# Load the tab-separated master word list.
words_df = pd.read_csv('letterase-my-word-list.txt', sep="\t")

# Keep only the word plus its Start2/End2 columns, under the shorter names
# (words / start / end) that the merge cells join on.
double_columns = {"Word": "words", "Start2": "start", "End2": "end"}
double_df = words_df[list(double_columns)].rename(columns=double_columns)

### Global Variables

In [15]:
# Sampling configuration used by the merge cells.
# Author's note: runtime is 2 minutes (or less) under the current parameters.

# Passed as random_state to every DataFrame.sample call so runs are repeatable.
seed = 12

# Author's note: double letter results in 240,000 solutions.
# p1 is the fraction of rows kept by the first merge's sample;
# p2 is the fraction kept by the sample in each later merge.
p1, p2 = 0.4, 0.3

### First Merge

In [16]:
# CONDUCT FIRST MERGE AND SAMPLE BASED ON PROBABILITIES

renamed_columns_1 = {"words_x":"w1", "start_x":"start1", "end_x":"end1", "words_y":"w2", "start_y":"start2", "end_y":"end2"}

# Pair each word with every word whose `start` equals its `end`.
merge_df_1 = pd.merge(double_df, double_df, left_on = 'end', right_on = 'start').rename(columns = renamed_columns_1)

# Count the pairs per joining key. Both output columns are named explicitly
# because the default names produced by value_counts().reset_index() differ
# between pandas 1.x ('index' / <series name>) and pandas 2.x (<series name> /
# 'count'); relying on the defaults breaks on one line or the other.
counts_df_1 = merge_df_1['end1'].value_counts().rename_axis('value').reset_index(name='count')

# Row weight = (1 / number of distinct keys) / (rows sharing the row's key),
# so every key carries the same total weight regardless of how common it is.
counts_df_1['p'] = (1/len(counts_df_1)) / counts_df_1['count']

# Attach the weight to every pair and drop the helper columns.
counts_merge_df_1 = pd.merge(merge_df_1, counts_df_1, left_on = 'end1', right_on = 'value', how='left')
counts_merge_df_1 = counts_merge_df_1[['w1','w2','start2','end2','p']]

# Keep a p1 fraction of the pairs, drawn with the weights above.
first_df = counts_merge_df_1.sample(n=round(len(counts_merge_df_1)*p1), random_state=seed, weights='p')
first_df = first_df[['w1','w2','end2']]

### Second Merge

In [17]:
# CONDUCT SECOND MERGE AND SAMPLE BASED ON PROBABILITIES

renamed_columns_2 = {"words":"w3", "start":"start3", "end":"end3"}

# Extend each sampled pair with every word whose `start` equals the pair's `end2`.
merge_df_2 = pd.merge(first_df, double_df, left_on = 'end2', right_on = 'start').rename(columns = renamed_columns_2)

# Count the rows per joining key. Both output columns are named explicitly
# because the default names produced by value_counts().reset_index() differ
# between pandas 1.x ('index' / <series name>) and pandas 2.x (<series name> /
# 'count'); relying on the defaults breaks on one line or the other.
counts_df_2 = merge_df_2['end2'].value_counts().rename_axis('value').reset_index(name='count')

# Row weight = (1 / number of distinct keys) / (rows sharing the row's key),
# so every key carries the same total weight regardless of how common it is.
counts_df_2['p'] = (1/len(counts_df_2)) / counts_df_2['count']

# Attach the weight to every row and drop the helper columns.
counts_merge_df_2 = pd.merge(merge_df_2, counts_df_2, left_on = 'end2', right_on = 'value', how='left')
counts_merge_df_2 = counts_merge_df_2[['w1','w2','w3','start3','end3','p']]

# Keep a p2 fraction of the triples, drawn with the weights above.
second_df = counts_merge_df_2.sample(n=round(len(counts_merge_df_2)*p2), random_state=seed, weights='p')
second_df = second_df[['w1','w2','w3','end3']]

### Third Merge

In [18]:
# CONDUCT THIRD MERGE AND SAMPLE BASED ON PROBABILITIES

renamed_columns_3 = {"words":"w4", "start":"start4", "end":"end4"}

# Extend each sampled triple with every word whose `start` equals the triple's `end3`.
merge_df_3 = pd.merge(second_df, double_df, left_on = 'end3', right_on = 'start').rename(columns = renamed_columns_3)

# Count the rows per joining key. Both output columns are named explicitly
# because the default names produced by value_counts().reset_index() differ
# between pandas 1.x ('index' / <series name>) and pandas 2.x (<series name> /
# 'count'); relying on the defaults breaks on one line or the other.
counts_df_3 = merge_df_3['end3'].value_counts().rename_axis('value').reset_index(name='count')

# Row weight = (1 / number of distinct keys) / (rows sharing the row's key),
# so every key carries the same total weight regardless of how common it is.
counts_df_3['p'] = (1/len(counts_df_3)) / counts_df_3['count']

# Attach the weight to every row and drop the helper columns.
counts_merge_df_3 = pd.merge(merge_df_3, counts_df_3, left_on = 'end3', right_on = 'value', how='left')
counts_merge_df_3 = counts_merge_df_3[['w1','w2','w3','w4','start4','end4','p']]

# Keep a p2 fraction of the four-word chains, drawn with the weights above.
third_df = counts_merge_df_3.sample(n=round(len(counts_merge_df_3)*p2), random_state=seed, weights='p')
third_df = third_df[['w1','w2','w3','w4','end4']]

### Final Merge

In [19]:
# CONDUCT FINAL MERGE AND SAMPLE BASED ON PROBABILITIES

renamed_columns_4 = {"words":"w5", "start":"start5", "end":"end5"}

# Extend each sampled four-word chain with every word whose `start` equals the chain's `end4`.
merge_df_4 = pd.merge(third_df, double_df, left_on = 'end4', right_on = 'start').rename(columns = renamed_columns_4)

# Count the rows per joining key. Both output columns are named explicitly
# because the default names produced by value_counts().reset_index() differ
# between pandas 1.x ('index' / <series name>) and pandas 2.x (<series name> /
# 'count'); relying on the defaults breaks on one line or the other.
counts_df_4 = merge_df_4['end4'].value_counts().rename_axis('value').reset_index(name='count')

# Row weight = (1 / number of distinct keys) / (rows sharing the row's key),
# so every key carries the same total weight regardless of how common it is.
counts_df_4['p'] = (1/len(counts_df_4)) / counts_df_4['count']

# Attach the weight to every row and drop the helper columns.
counts_merge_df_4 = pd.merge(merge_df_4, counts_df_4, left_on = 'end4', right_on = 'value', how='left')
counts_merge_df_4 = counts_merge_df_4[['w1','w2','w3','w4','w5','start5','end5','p']]

# Keep a p2 fraction of the five-word chains, drawn with the weights above.
final_df = counts_merge_df_4.sample(n=round(len(counts_merge_df_4)*p2), random_state=seed, weights='p')
word_columns = ['w1','w2','w3','w4','w5']
final_df = final_df[word_columns]

# REMOVE ROWS WITH DUPLICATES

# Flag a row when any two of its word columns hold the same word. Comparing
# whole columns pairwise is vectorised (10 comparisons for 5 columns) instead
# of running Python code once per row, and it also yields a proper boolean
# mask when the frame is empty.
repeated_word = pd.Series(False, index=final_df.index)
for position, left_column in enumerate(word_columns):
    for right_column in word_columns[position + 1:]:
        repeated_word |= final_df[left_column] == final_df[right_column]
final_df = final_df[~repeated_word]

# FORMAT FINAL DATAFRAME

renamed_columns_final = {"w1":"words_1", "w2":"words_2", "w3":"words_3", "w4":"words_4", "w5":"words_5"}
final_df = final_df.rename(columns = renamed_columns_final).reset_index(drop=True)


### View Solution List

In [20]:
# Show the finished solution table as the cell's output.
final_df

Unnamed: 0,words_1,words_2,words_3,words_4,words_5
0,aroma,manga,gamut,utile,legal
1,crepe,perch,cheap,apnea,earth
2,delve,vegan,annex,extra,ratty
3,dogma,mange,genre,rerun,undid
4,outdo,dough,ghost,steep,epoch
...,...,...,...,...,...
109198,grasp,spasm,smash,shoal,along
109199,leach,chafe,fetch,chore,reuse
109200,holly,lymph,photo,touch,check
109201,globe,bench,chasm,smith,third


### Export Solutions

In [21]:
# EXPORT SOLUTIONS

# Serialise the solutions keyed by row label; each value maps the column
# names (words_1 .. words_5) to that row's words.
result = final_df.to_json(orient="index")
output_data = 'double_data.json'

# `result` is already JSON text, so it is written verbatim. Passing it through
# json.dump would encode it a second time and the file would contain a single
# escaped JSON string literal instead of a JSON object.
# NOTE(review): any reader that decoded this file twice to compensate must now
# decode it once.
with open(output_data, 'w') as f:
    f.write(result)

### Export Word List

In [22]:
# Read the 'Word' column of the master list, upper-case every entry and save
# the resulting list as a JSON array.
word_column = pd.read_csv("letterase-my-word-list.txt", sep="\t")['Word']
words = word_column.tolist()
words_upper = [word.upper() for word in words]

# Serialise once and write that text out.
json_string = json.dumps(words_upper)

with open("word_list.json", "w") as f:
    f.write(json_string)