In [1]:
import re

import numpy as np
import pandas as pd

# Load Data

In [2]:
# tab-delimited file without a header row; the column is labelled 'text'
text_as = pd.read_csv(
    '../data/AngOrdtext',
    sep='\t',
    names=['text'],
)

In [3]:
# Read the space-delimited date file and flatten it, row by row, into a
# single 'date' column so it has the same one-column layout as the id and
# text frames.
date_as = pd.read_csv('../data/AngOrdDate', delimiter=' ', header=None).values.ravel()
# dropna() removes the padding cells but keeps their positions as gaps in the
# index. Renumber the rows so the later column-wise concat (which aligns on
# the index) pairs each date with the id/text at the same position.
date_as = pd.DataFrame(date_as, columns=['date']).dropna().reset_index(drop=True)

In [4]:
# tab-delimited file without a header row; the column is labelled 'id'
id_as = pd.read_csv(
    '../data/AngOrdID',
    sep='\t',
    names=['id'],
)

In [5]:
# put the id, date and text frames side by side (rows are matched on the index)
df_as = pd.concat([id_as, date_as, text_as], axis='columns')

# Extract the First 10 Words

In [6]:
def extract_10(text):
    """Return the first 10 whitespace-separated words of ``text``.

    The words are re-joined with single spaces, so any run of whitespace in
    the input is collapsed. A text with fewer than 10 words is returned whole
    (with the same whitespace normalization).
    """
    # maxsplit=10 leaves everything after the tenth word in one trailing
    # piece, which the slice then drops
    leading_words = text.split(None, 10)[:10]
    return ' '.join(leading_words)

In [7]:
# new 'phrase' column: the opening 10 words of every text
df_as['phrase'] = df_as['text'].apply(extract_10)

# Standardize

In [8]:
# Anglo-Saxon spelling table: canonical form -> list of variant spellings that
# should be rewritten to it. The order of entries and of variants (including
# the repeated 'kingc') is preserved exactly.
# Note: standerdize() additionally folds 'æ'/'ae' -> 'e' and 'ð'/'þ' -> 'th'
# after the table lookup, so canonical forms containing those letters
# (e.g. 'aelfger', 'ðara') appear folded in the final text.
word_dictionary = {
    'aelfger': ['alfger'],
    'altithroni': ['altitroni'],
    'biscop': ['bisscop', 'biscopes'],
    'christi': ['cristi'],
    'king': [
        'cyng', 'cynge', 'cing', 'kingc', 'kyng', 'kyngc',
        'cyngc', 'cyngcc', 'kingc', 'kinge', 'cyningc',
    ],
    'ðara': ['þara'],
    'divino': ['diuino'],
    'dominice': ['dominicae', 'dominicæ'],
    'eadgari': ['eadgar', 'edgari', 'eadgarii'],
    'edredi': ['eddredi', 'eadredi'],
    'edward': ['eadweard', 'eadward', 'eadwardus'],
    'iesu': ['jhesu', 'jesu'],
    'incarnationis': ['incarnacionis'],
    'incertis': ['in certis'],
    'indiuidue': ['indiuiduae'],
    'inperpetuum': ['imperpetuum', 'in perpetuum', 'im perpetuum'],
    'privilegium': ['priuilegium'],
    'quamvis': ['quamuis'],
    'salvatoris': ['saluatoris'],
    'sancte': ['sanctae', 'sanctæ'],
    'seculi': ['sæculi'],
    'swutelath': ['swutelaþ', 'swutelað'],
    'thare': ['þære', 'þare'],
    'thisum': ['þissum', 'þisum'],
    'verba': ['uerba'],
    'universis': ['uniuersis'],
}

In [9]:
def standerdize(sentence, word_map, whole_words=False):
    """Rewrite spelling variants in ``sentence`` to their canonical forms.

    Parameters
    ----------
    sentence : str
        Text to normalize.
    word_map : dict[str, list[str]]
        Maps each canonical form to the variant spellings that should be
        replaced by it.
    whole_words : bool, default False
        When False, variants are matched anywhere in the text (substring
        match), so inflected forms such as 'graciam' are normalized too.
        When True, a variant only matches if it is not directly preceded or
        followed by a word character.

    Returns
    -------
    str
        The sentence with variants replaced, after which 'æ' and 'ae' are
        folded to 'e' and 'ð' / 'þ' to 'th'.

    Notes
    -----
    All table replacements happen in one left-to-right pass, trying longer
    forms first. That keeps a short variant from pre-empting a longer one
    ('eadward' vs. 'eadwardus') and, because canonical forms are matched as
    themselves, keeps already-canonical text from being rewritten again
    ('herefordensis' must not become 'herefordensisensis' through the
    variant 'hereford').
    """
    # variant -> canonical form; the first mapping seen for a variant wins
    replacement_for = {}
    for canonical, variants in word_map.items():
        for variant in variants:
            if variant:  # an empty variant would match at every position
                replacement_for.setdefault(variant, canonical)
    # Canonical forms map to themselves so they are consumed as a unit and
    # never partially rewritten by a shorter variant they happen to contain.
    for canonical in word_map:
        if canonical:
            replacement_for.setdefault(canonical, canonical)

    if replacement_for:
        longest_first = sorted(replacement_for, key=len, reverse=True)
        alternation = '|'.join(re.escape(form) for form in longest_first)
        if whole_words:
            alternation = r'(?<!\w)(?:' + alternation + r')(?!\w)'
        # A function replacement avoids backslash/group-reference processing
        # of the canonical text.
        sentence = re.sub(
            alternation,
            lambda match: replacement_for[match.group(0)],
            sentence,
        )

    # fold the remaining special letters / digraphs
    letter_map = {'æ': 'e', 'ae': 'e', 'ð': 'th', 'þ': 'th'}
    for old, new in letter_map.items():
        sentence = sentence.replace(old, new)
    return sentence

In [10]:
# normalize spelling variants in the 10-word phrases
df_as['phrase'] = df_as['phrase'].apply(standerdize, args=(word_dictionary,))

In [13]:
# normalize spelling variants in the full texts
df_as['text'] = df_as['text'].apply(standerdize, args=(word_dictionary,))

In [14]:
# write the Anglo-Saxon frame to CSV, leaving out the row index
df_as.to_csv(path_or_buf='../data/anglo_saxon.csv', index=False)

# Norman

In [15]:
# tab-delimited text file without a header row; the column is labelled 'text'
text_nas = pd.read_csv('../data/EngOrdtext', delimiter='\t', names=['text'])

# Read the space-delimited date file and flatten it, row by row, into a
# single 'date' column so it has the same one-column layout as the id and
# text frames.
date_nas = pd.read_csv('../data/EngOrdDate', delimiter=' ', header=None).values.ravel()
# dropna() removes the padding cells but keeps their positions as gaps in the
# index. Renumber the rows so the column-wise concat below (which aligns on
# the index) pairs each date with the id/text at the same position.
date_nas = pd.DataFrame(date_nas, columns=['date']).dropna().reset_index(drop=True)

# tab-delimited id file without a header row; the column is labelled 'id'
id_nas = pd.read_csv('../data/EngOrdID', delimiter='\t', names=['id'])

# put the id, date and text frames side by side (rows are matched on the index)
df_nas = pd.concat([id_nas, date_nas, text_nas], axis=1)

In [16]:
# new 'phrase' column: the opening 10 words of every text
df_nas['phrase'] = df_nas['text'].apply(extract_10)

# Standardize

In [17]:
# Norman spelling table: canonical form -> list of variant spellings that
# should be rewritten to it. Entry order is preserved exactly.
# NOTE(review): 'Radulfo' / 'Ranulfo' are capitalised while their variants and
# every other key are lower-case -- confirm this is intended.
# NOTE(review): the variant 'hereford' is a prefix of its own canonical form
# 'herefordensis'; the replacement routine must not re-expand text that is
# already canonical.
word_dict = {
    'Radulfo': ['radulpo', 'radulpho'],
    'Ranulfo': ['ranulpo', 'ranulpho'],
    'hamoni': ['haimoni'],
    'sampsoni': ['samsoni'],
    'omnibus que': ['omnibusque'],
    'londoniensi': ['lundoniensi'],
    'toti que': ['totique'],
    'gerardo': ['girardo'],
    'norwic': ['nortwic'],
    'rodbertus': ['rotbertus'],
    'ranulfus': ['ranulpus'],
    'gratia': ['gracia'],
    'iesu': ['ihesu'],
    'herefordensis': ['hereford'],
    'teobaldus': ['teodbaldus'],
}

In [18]:
# normalize spelling variants in the 10-word phrases
df_nas['phrase'] = df_nas['phrase'].apply(standerdize, args=(word_dict,))

In [21]:
# normalize spelling variants in the full texts
df_nas['text'] = df_nas['text'].apply(standerdize, args=(word_dict,))

In [22]:
# write the Norman frame to CSV, leaving out the row index
df_nas.to_csv(path_or_buf='../data/norman.csv', index=False)

In [33]:
# stack the Anglo-Saxon rows on top of the Norman rows and renumber 0..n-1
df_all = pd.concat([df_as, df_nas], ignore_index=True)

In [35]:
# write the combined frame to CSV, leaving out the row index
df_all.to_csv(path_or_buf='../data/all.csv', index=False)