In [20]:
import pandas as pd
import numpy as np
import re
import openpyxl

In [2]:
df_hans = pd.read_csv('data/NFS8-Hans.tsv', sep = '\t')
df_asitane = pd.read_csv('data/NFS8-Asitane.tsv', sep = '\t')
df_galata = pd.read_csv('data/NFS8-Galata.tsv', sep = '\t')
df_yenikoy = pd.read_csv('data/NFS8-Yeniköy_İstinye_Emirgan.tsv', sep = '\t')

In [3]:
#Run this only once
df_galata.columns = df_galata.iloc[0]
df_galata = df_galata.drop(0, axis = 0)
df_galata = df_galata.loc[:, df_galata.columns.notnull()]

In [4]:
df_asitane.head(5)

Unnamed: 0,Register,Date,Unique person ID#,Entry code (page#/line/entry),Guarantor (Kefil) name(s),Guarantor (Kefil) religion (Muslim or Christian),Han Nazırı name(s),Han Name,Descriptor/location of Han,Neighborhood,...,Unnamed: 37,church position,church name,name of weapon,number of weapons,name of weapons,name of weapon.1,other info,other info.1,comments/intuitive info
0,NFS.d_008,H 29-12-1236,,4A1,,,,Kürkçü Han,Mahmud Paşa kurbunda,,...,,,,,,,,,,
1,8,H 29-12-1236,,,,,,Kürkçü Han,Mahmud Paşa kurbunda,,...,,,,,,,,,,
2,8,H 29-12-1236,,,,,,Kürkçü Han,Mahmud Paşa kurbunda,,...,,,,,,,,,,
3,8,H 29-12-1236,,,,,,Kürkçü Han,Mahmud Paşa kurbunda,,...,,,,,,,,,,
4,8,,,,,,,Kürkçü Han,Mahmud Paşa kurbunda,,...,,,,,,,,,,


In [5]:
#Asitane Profession Normalization
df_asitane['Profession'] = df_asitane['Profession'].str.lower()
df_asitane['Profession With ?'] = df_asitane['Profession'].str.contains(r'\?')
df_asitane['Profession With *'] = df_asitane['Profession'].str.contains(r'\*')
df_asitane['Profession'] = df_asitane['Profession'].str.replace('[\*\(\)\?)]', '')
df_asitane['Profession'] = df_asitane['Profession'].str.replace('[\s]*', '')

  df_asitane['Profession'] = df_asitane['Profession'].str.replace('[\*\(\)\?)]', '')
  df_asitane['Profession'] = df_asitane['Profession'].str.replace('[\s]*', '')


In [6]:
#Yenikoy Profession Normalization
df_yenikoy['Profession'] = df_yenikoy['Profession'].str.lower()
df_yenikoy['Profession With ?'] = df_yenikoy['Profession'].str.contains(r'\?')
df_yenikoy['Profession With *'] = df_yenikoy['Profession'].str.contains(r'\*')
df_yenikoy['Profession'] = df_yenikoy['Profession'].str.replace('[\*\(\)\?)]', '')
df_yenikoy['Profession'] = df_yenikoy['Profession'].str.replace('[\s]*', '')

  df_yenikoy['Profession'] = df_yenikoy['Profession'].str.replace('[\*\(\)\?)]', '')
  df_yenikoy['Profession'] = df_yenikoy['Profession'].str.replace('[\s]*', '')


In [7]:
places = ['asitane', 'galata', 'yenikoy']
dfs = [df_asitane, df_galata, df_yenikoy]

In [8]:
df_asitane['Han Name']

0     Kürkçü Han
1     Kürkçü Han
2     Kürkçü Han
3     Kürkçü Han
4     Kürkçü Han
5     Kürkçü Han
6     Kürkçü Han
7     Kürkçü Han
8     Kürkçü Han
9     Kürkçü Han
10    Kürkçü Han
11    Kürkçü Han
12    Kürkçü Han
13    Kürkçü Han
14    Kürkçü Han
15    Kürkçü Han
16    Kürkçü Han
17    Kürkçü Han
18    Kürkçü Han
19    Kürkçü Han
20    Kürkçü Han
21    Kürkçü Han
Name: Han Name, dtype: object

In [9]:
#Getting rid of repeated han names
unique_hans = df_hans['Name of han'].unique()
df_hans = df_hans.drop_duplicates(subset = ['Name of han'], keep = 'first') #Before this, the hands dataframe had 176 rows but 172 unique han names
df_hans.shape

(172, 9)

In [10]:
#Split of muharrir 
split_on = r"\([\d*]\)"
muharrir_split = df_yenikoy.loc[df_yenikoy['muharrir'].str.contains(r"\([\d*]\)", na= False)]['muharrir'].str.split(split_on) #Doing split in only parenthesis formatted 
muharrir_split = muharrir_split.str[1:]
split_on_ve = r" ve "
muharrir_split_ve = df_yenikoy.loc[df_yenikoy['muharrir'].str.contains(r" ve ", na= False)]['muharrir'].str.split(split_on_ve)
muharrir_split_general = pd.concat([muharrir_split, muharrir_split_ve])
muharrir_split_general

0      [ Kürkçü Sinan Çorbacı ,  Simsar Balaş Çorbacı...
1      [ Kürkçü Sinan Çorbacı ,  Simsar Balaş Çorbacı...
2      [ Kürkçü Sinan Çorbacı ,  Simsar Balaş Çorbacı...
3      [ Kürkçü Sinan Çorbacı ,  Simsar Balaş Çorbacı...
4      [ Kürkçü Sinan Çorbacı ,  Simsar Balaş Çorbacı...
                             ...                        
705    [Panayot, Papa Iskuli, Zaferi, Yorgi, Dimitri,...
706    [Panayot, Papa Iskuli, Zaferi, Yorgi, Dimitri,...
707    [Panayot, Papa Iskuli, Zaferi, Yorgi, Dimitri,...
708    [Panayot, Papa Iskuli, Zaferi, Yorgi, Dimitri,...
709    [Panayot, Papa Iskuli, Zaferi, Yorgi, Dimitri,...
Name: muharrir, Length: 631, dtype: object

In [11]:
df_muharrir = pd.DataFrame(muharrir_split_general)
df_all_muh = pd.DataFrame(df_muharrir['muharrir'].to_list(), index = df_muharrir.index,  columns = ['Muharrir 1', 'Muharrir 2', 'Muharrir 3', 'Muharrir 4', 'Muharrir 5', 'Muharrir 6', 'Muharrir 7' , 'Muharrir 8', 'Muharrir 9', 'Muharrir 10', 'Muharrir 11'])
df_yenikoy = df_yenikoy.merge(df_all_muh, left_index = True, right_index = True)
df_yenikoy = df_yenikoy.drop('muharrir', axis = 1)

In [12]:
df_yenikoy = df_yenikoy.fillna('uknown')

In [13]:
df_galata = df_galata.fillna('uknown')

In [14]:
#Function for making sure turkish characters are lowered in the right way.
def lower_tr(string):
    string_copy = list(string)
    string_copy2 = string_copy[:]
    for i in range(len(string_copy)):
        if string_copy[i] =='I':
            element = 'ı'
            string_copy2[i] = element
        elif string_copy[i] == "İ":
            element = 'i'
            string_copy2[i] == element
        else:
            element = string_copy[i].lower()
            string_copy2[i] = element
    return ''.join(string_copy2)

In [15]:
'A'.lower()

'a'

In [16]:
lower_tr('aAaa')

'aaaa'

In [17]:
professions_yenikoy = df_yenikoy['Profession']
professions_galata = df_galata['Profession']
professions_all = professions_yenikoy.append(professions_galata)
professions_all = professions_all.reset_index()
professions_all = professions_all.drop('index', axis = 1)
professions_all = professions_all.drop_duplicates(subset = ['Profession']).reset_index().drop('index', axis =1)
professions_all['Profession'] = professions_all['Profession'].str.replace(r'\([\w\d\siİıIçÇşŞüÜğĞöÖ]*\)', '')
professions_all_edited = professions_all['Profession'].str.replace(r'[^\sa-zA-ZğĞüÜşŞİöÖçÇı]*', '')
professions_all_edited = professions_all_edited.apply(lower_tr)
professions_all_edited = professions_all_edited.drop_duplicates()
professions_all_edited = professions_all_edited.str.strip()
professions_all_edited = professions_all_edited.loc[professions_all_edited != ''].reset_index().drop('index', axis =1).squeeze()

  professions_all['Profession'] = professions_all['Profession'].str.replace(r'\([\w\d\siİıIçÇşŞüÜğĞöÖ]*\)', '')
  professions_all_edited = professions_all['Profession'].str.replace(r'[^\sa-zA-ZğĞüÜşŞİöÖçÇı]*', '')


In [18]:
professions_all_edited.head(100)

0                suyolcu
1               yemenici
2                kayıkçı
3              nasraniye
4            kasapustası
             ...        
95     yeminecikalfaları
96    kayıkçıturbitorunu
97         kürkçükürekçi
98                  papa
99                 abaci
Name: Profession, Length: 100, dtype: object

In [21]:
professions_all_edited.to_excel('professions.xlsx')