In [1]:
import pandas as pd
import numpy as np
import spacy
nlp = spacy.load('en_core_web_sm')
from scipy.sparse import coo_matrix, csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
# Load the Mishnah table; this notebook enriches it and writes it back to the
# same path in its final cell.
df = pd.read_csv('mishnah.csv')
# `to_csv` stores the index as an unnamed leading column, which `read_csv`
# surfaces as 'Unnamed: 0'. Drop it when present; errors='ignore' keeps this
# cell working on a file that was saved without an index column.
df = df.drop(columns='Unnamed: 0', errors='ignore')
# Names of the sages to look for in the Mishnah text.
#
# Spellings must match the text *after* `clean_names` has normalised it
# (that function rewrites e.g. 'Bava' -> 'Baba', 'Ishmael' -> 'Yishmael',
# 'Yehudah' -> 'Yehuda', 'Tzadok' -> 'Zadok'); an entry written in a
# pre-normalisation spelling can never be found. Each name is listed once,
# in the order in which matches should be reported.
tannaim = [
    'Shimon of Teman', 'Shimon the Righteous', 'Antigonus a man of Socho',
    'Yose ben Yoezer', 'Yose ben Yohanan', 'Yehoshua ben Perahiah',
    'Nittai the Arbelite', 'Yehuda ben Tabbai', 'Shimon ben Shetah',
    'Shemaiah and Avtalion', 'Hillel', 'Shammai', 'Gamliel the Elder',
    'Shimon ben Gamliel', 'Yohanan ben Zakkai', 'Eliezer ben Hyrcanus',
    'Akiva', 'Meir', 'Yehuda', 'Akavia ben Mahalalel', 'Baba ben Buti',
    'Ben He He', 'Dosa ben Harkinas',
    'Rabbi Hanina the vice-chief of the priests', 'Hanina ben Dosa', 'Zadok',
    'Elazar ben Arach', 'Eliezer', 'Eliezer ben Yaakov',
    'Nehunia ben Hakkanah', 'Shmuel Hakatan', 'Tarfon', 'Yehoshua',
    'Yose the priest', 'Abba Shaul', 'Elazar ben Azaria', 'Elazar of Modiin',
    'Elisha ben Abuyah', 'Halafta', 'Hanania ben Hakinai',
    'Hanania ben Teradion', 'Ilai', 'Shimon ben Assai', 'Shimon ben Nannas',
    'Shimon ben Zoma', 'Yehuda ben Baba', 'Yehuda ben Batera', 'Yishmael',
    'Yohanan ben Beroka', 'Yohanan ben Nuri', 'Yose ben Kisma',
    'Yose the Galilean', 'Elazar', 'Elazar ben Zadok', 'Hanina ben Gamliel',
    'Natan', 'Rabbi Nehemia', 'Shimon', 'Yehoshua ben Korha',
    'Yohanan Hasandlar', 'Yonatan', 'Yose', 'Pinchas ben Yair',
    'Shimon ben Elazar', 'Shimon ben Halafta', 'Shimon ben Menasya',
    'Shimon ben Yehuda', 'Yose ben Meshullam', 'Yose ben Yehuda',
]

In [12]:
def clean_names(row):
    """Normalise variant spellings of names in a single text string.

    Each ``(variant, canonical)`` pair below is applied with ``str.replace``
    in the order listed, so a later rule operates on the output of the
    earlier ones (e.g. 'Nanas' -> 'Nannas' runs before the
    'Shimon b. Nannas' expansion). Matching is plain, case-sensitive
    substring replacement.

    Parameters
    ----------
    row : str
        The text to normalise.

    Returns
    -------
    str
        The text with every listed variant replaced.
    """
    spelling_rules = (
        ('Gamaliel', 'Gamliel'),
        ('Akiba', 'Akiva'),
        ('Yosei', 'Yose'),
        ('Jacob', 'Yaakov'),
        ('Azaryah', 'Azaria'),
        ('Azarya', 'Azaria'),
        ('Azariah', 'Azaria'),
        ('Joshua', 'Yehoshua'),
        ('Yehudah', 'Yehuda'),
        ('Judah', 'Yehuda'),
        ('Neḥemya', 'Nehemia'),
        ('Nehemiah', 'Nehemia'),
        ('Tzadok', 'Zadok'),
        ('Ḥanina', 'Hanina'),
        ('Haninah', 'Hanina'),
        ('Akavyah', 'Akavia'),
        ('Akaviah', 'Akavia'),
        ('Shetach', 'Shetah'),
        ('Abtalion', 'Avtalion'),
        ('Yochanan', 'Yohanan'),
        ('Zakai', 'Zakkai'),
        ('Jonathan', 'Yonatan'),
        ('Samuel', 'Shmuel'),
        ('Nanas', 'Nannas'),
        ('Bava', 'Baba'),
        ('Ishmael', 'Yishmael'),
        ('Berokah', 'Beroka'),
        ('Rabbi Hanina vice-chief of the priests',
         'Rabbi Hanina the vice-chief of the priests'),
        ('HaGelili', 'the Galilean'),
        ('Shimon b. Nannas', 'Shimon ben Nannas'),
        ('Dosa b. Harkinas', 'Dosa ben Harkinas'),
        ('Korhah', 'Korha'),
        ('Korḥa', 'Korha'),
        ('Ha-Sandelar', 'Hasandlar'),
    )
    cleaned = row
    for variant, canonical in spelling_rules:
        cleaned = cleaned.replace(variant, canonical)
    return cleaned

# Normalise name spellings in every passage before any name matching is done.
df['text'] = df['text'].apply(clean_names)

In [33]:
def _name_spans(text, name):
    """Return the ``(start, end)`` span of every occurrence of `name` in `text`."""
    spans = []
    start = text.find(name)
    while start != -1:
        spans.append((start, start + len(name)))
        start = text.find(name, start + 1)
    return spans


def get_tannaim(row, names=None):
    """Find which of the known names are mentioned in a passage.

    A name is reported when it occurs in `row` at least once *outside* any
    occurrence of a longer known name. So 'Elazar' is not reported for a text
    that only says 'Elazar ben Azaria', and 'Yehuda' is not reported for a
    text that only says 'Yose ben Yehuda', but a short name is still reported
    when it also appears on its own anywhere in the passage.

    Matching is plain, case-sensitive substring search.

    Parameters
    ----------
    row : str
        The passage text.
    names : iterable of str, optional
        Candidate names. Defaults to the module-level ``tannaim`` list.

    Returns
    -------
    tuple
        ``(authors, count)`` where ``authors`` is the list of reported names
        in the order they appear in `names` (duplicates collapsed) and
        ``count`` is ``len(authors)``.
    """
    if names is None:
        names = tannaim

    # dict.fromkeys collapses duplicate names while keeping the first position.
    spans = {name: _name_spans(row, name) for name in dict.fromkeys(names)}
    spans = {name: found for name, found in spans.items() if found}

    authors = []
    for name, own_spans in spans.items():
        # Only a strictly longer name can contain an occurrence of this one.
        enclosing = [
            span
            for other, found in spans.items()
            if len(other) > len(name)
            for span in found
        ]
        # Keep the name if any of its occurrences is not swallowed by a
        # longer match. Deciding per occurrence (rather than by comparing
        # first positions) and building a new list (rather than popping from
        # the list being iterated) avoids dropping or keeping names by
        # accident.
        stands_alone = any(
            not any(lo <= start and end <= hi for lo, hi in enclosing)
            for start, end in own_spans
        )
        if stands_alone:
            authors.append(name)
    return authors, len(authors)
    
# get_tannaim returns (names, count); wrapping the pair in a Series makes
# `apply` expand it into two columns, which are stored side by side.
df[['tannaim', 'tanna_count']] = df['text'].apply(
    lambda passage: pd.Series(get_tannaim(passage))
)

In [36]:
# Run every passage through the spaCy pipeline and keep one parsed document
# per row.
df['spacy_doc'] = [doc for doc in nlp.pipe(df['text'])]

In [39]:
def tokenize(row):
    """Render a parsed document as a string of space-terminated lemmas.

    Every token contributes its ``lemma_`` followed by a single space. Tokens
    whose lemma is the placeholder ``'-PRON-'`` (emitted for pronouns by
    spaCy 2.x) contribute their surface ``text`` instead. No part-of-speech
    filtering is applied.

    Parameters
    ----------
    row : iterable
        Tokens exposing ``lemma_`` and ``text`` (e.g. a spaCy ``Doc``).

    Returns
    -------
    str
        The concatenated pieces; empty for an empty document, otherwise
        ending in a trailing space.
    """
    pieces = (
        token.text if token.lemma_ == '-PRON-' else token.lemma_
        for token in row
    )
    return ''.join(piece + ' ' for piece in pieces)
    
# Store the lemma string of each parsed passage.
df['tokens'] = df['spacy_doc'].apply(tokenize)

In [40]:
# Preview the first five rows, including the derived columns.
df.head(n=5)

Unnamed: 0,tractate,chapter,mishnah,text,seder,tannaim,tanna_count,spacy_doc,tokens
0,Berakhot,1,1,From what time may one recite the Shema in the...,Zeraim,[Eliezer],1,"(From, what, time, may, one, recite, the, Shem...",from what time may one recite the Shema in the...
1,Berakhot,1,2,From what time may one recite the Shema in the...,Zeraim,"[Eliezer, Yehoshua]",2,"(From, what, time, may, one, recite, the, Shem...",from what time may one recite the Shema in the...
2,Berakhot,1,3,Bet Shammai say: in the evening every man shou...,Zeraim,"[Hillel, Shammai, Tarfon]",3,"(Bet, Shammai, say, :, in, the, evening, every...",Bet Shammai say : in the evening every man sho...
3,Berakhot,1,4,In the morning he recites two blessings before...,Zeraim,[],0,"(In, the, morning, he, recites, two, blessings...",in the morning he recite two blessing before i...
4,Berakhot,1,5,They mention the Exodus from Egypt at night. R...,Zeraim,[Elazar ben Azaria],1,"(They, mention, the, Exodus, from, Egypt, at, ...",They mention the Exodus from Egypt at night . ...


In [43]:
# Write the enriched table back over the input file. The index is written as
# an unnamed first column, which `read_csv` reports as 'Unnamed: 0' the next
# time the file is loaded.
df.to_csv('mishnah.csv', index=True)