In [None]:
%matplotlib inline
import os
from bs4 import BeautifulSoup
import pandas as pd
import random
import matplotlib.pyplot as plt

In [None]:
# Directory whose entries are listed and loaded one by one with pd.read_csv
# in the csv-reading cell.
fp_in = '/media/sf_VBox_Shared/Arabic/Fiqh/Fiqh-Alkhalil-csv/csv'

In [None]:
# Arabic roots that are searched for, one per line.
# str.split() without an argument splits on any run of whitespace, so a stray
# space at the end of a line cannot become part of a root and silently stop
# that root from matching in an isin() filter.
senses_roots = set('''سمع
بصر
لمس
شمم
ذوق'''.split())

In [None]:
## Read the XML files
# metadata = []
# matched_words = []
# for dirname in os.listdir(fp_in):
#     sub_dir = os.path.join(fp_in, dirname)
#     for filename in os.listdir(sub_dir)[5:7]:
#         with open(os.path.join(sub_dir, filename)) as fn:
#             xml_data = BeautifulSoup(fn, 'xml')
#             meta_dict = {meta['name']: meta.text.strip() for meta in xml_data.metadata.find_all('meta')}
#             meta_dict['Bookname'] = dirname
#             meta_dict['Filename'] = filename
#             metadata.append(meta_dict)

#             # loop over words and match with the searched words
#             for word in xml_data.morphology_analysis.find_all('word'):
#                 roots = set([a.get('root', '') for a in word.find_all('analysis')])
#                 if not senses_roots.isdisjoint(roots):
#                     matched_words.append((filename, word.attrs, [a.attrs for a in word.find_all('analysis')]))

In [None]:
# matched_words is only created by the XML-reading cell, which is currently
# commented out; evaluating it on a fresh kernel raises NameError. This check
# is therefore disabled together with that cell - re-enable both at once.
# len(matched_words)

In [None]:
# # Put all results in a dataframe
# df_total = pd.DataFrame()
# for filename, word_dict, analyses in matched_words:
#     df_analyses = pd.DataFrame(analyses)
#     df_analyses['Filename'] = filename
#     for att in word_dict:
#         df_analyses[att] = word_dict[att]
#     df_total = df_total.append(df_analyses)

In [None]:
# metadata_df = pd.DataFrame(metadata).set_index('Filename')
# metadata_df.head()

In [None]:
## Read the csv files
# Load every file in fp_in, keep only the rows whose root is one of the
# searched roots, and combine the pieces once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copies the accumulated
# frame on every iteration, so the pieces are collected in a list instead.
frames = []
for filename in sorted(os.listdir(fp_in)):  # sorted: os.listdir order is arbitrary
    df_sub = pd.read_csv(os.path.join(fp_in, filename), index_col=0)
    df_sub = df_sub[df_sub.root.isin(senses_roots)]
    frames.append(df_sub)
# pd.concat raises on an empty list; fall back to an empty frame, which is
# what an append loop over an empty directory would have produced.
df_total = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# (rows, columns) of the filtered result.
df_total.shape

In [None]:
# Preview the first rows of the filtered result.
df_total.head()

In [None]:
# Number of rows per root.
df_total.root.value_counts()

In [None]:
# The 20 most frequent values of the word column.
df_total.word.value_counts().head(20)

In [None]:
# For each of the words, what is the number of roots?
# The root column is selected before aggregating, so only that column is
# counted instead of computing nunique for every column and discarding the rest.
df_total.groupby('word')['root'].nunique().sort_values(ascending=False).head()

## Merge with newer meta data

In [None]:
# Metadata columns that are carried over next to the book name.
metadata_fields = [
    'BookURI',
    'Century',
    'AuthorNAME',
    'AuthorGeographicalArea',
    'AuthorBORNH',
    'AuthorBORNC',
    'AuthorDIEDH',
    'AuthorDIEDC',
    'BookSUBJ',
    'NumberOfTokens',
]

In [None]:
# Load the merged metadata table from csv.
metadata_new = pd.read_csv('/media/sf_VBox_Shared/Arabic/Fiqh/merged_metadata.csv')

In [None]:
# Derive the book name by capturing everything before '.txt' in filename_old.
# Raw string: '\.' inside a normal literal is an invalid escape sequence that
# newer Python versions warn about; the pattern itself is unchanged.
metadata_new['Bookname'] = metadata_new.filename_old.str.extract(r'(.*)\.txt', expand=False)

In [None]:
# Earlier variant, kept for reference (it needs metadata_df from the disabled XML cells):
#metadata_merged = metadata_df['Bookname'].reset_index().merge(metadata_new, left_on='Bookname', right_on='Bookname', how='left')
# Independent copy holding only the book name and the selected metadata columns.
metadata_merged = metadata_new.loc[:, ['Bookname', *metadata_fields]].copy()

In [None]:
# Shapes before and after selecting the metadata columns. metadata_df is only
# built by the commented-out XML cells, so it is left out here to avoid a
# NameError on a fresh kernel.
metadata_new.shape, metadata_merged.shape

In [None]:
# Column names of the selected metadata table.
metadata_merged.columns

## Prepare csv

In [None]:
# Note: the analyses are already restricted to the roots of interest when the
# csv files are read (the isin(senses_roots) step), so nothing is filtered here.


In [None]:
# Map every root to its transliteration, using the distinct (root, tr_root)
# pairs that occur in df_total.
root_pairs = df_total[['root', 'tr_root']].drop_duplicates()
tr_dict = dict(zip(root_pairs['root'], root_pairs['tr_root']))
tr_dict

In [None]:
# Attach the book metadata to every analysis row (a left join keeps all rows
# of df_total), then drop both join-key columns.
df_merged = (
    df_total
    .merge(metadata_merged, how='left', left_on='title', right_on='Bookname')
    .drop(columns=['Bookname', 'title'])
)

In [None]:
# Export the merged word-level table; index=False leaves the row index out of the file.
df_merged.to_csv('/media/sf_VBox_Shared/Arabic/Analyses/senses_test.csv', index=False)

In [None]:
# Also prepare aggregated csv
# Count the rows per (title, root) pair and pivot the roots into columns;
# pairs that never occur are filled with 0.
pair_counts = df_total.groupby(['title', 'root']).size()
df_agg = pair_counts.unstack(fill_value=0)
# Relabel every column as "<root> (<transliteration>)".
df_agg.columns = [u'{} ({})'.format(root, tr_dict[root]) for root in df_agg.columns]

In [None]:
# Turn the title index back into a column, left-join the book metadata on it
# and drop the duplicate key column.
df_agg_merged = (
    df_agg
    .reset_index()
    .merge(metadata_merged, how='left', left_on='title', right_on='Bookname')
    .drop(columns=['Bookname'])
)

In [None]:
# Turn the raw counts into relative frequencies (count / NumberOfTokens).
# A vectorised division aligned on the row index replaces the row-wise apply:
# no Python-level call per row, and the columns stay numeric.
# NOTE: the count columns are overwritten in place, so running this cell twice
# divides twice; re-run the merge cell that builds df_agg_merged first.
senses_cols = df_agg.columns
df_agg_merged[senses_cols] = df_agg_merged[senses_cols].div(df_agg_merged['NumberOfTokens'], axis=0)

In [None]:
# Export the aggregated per-title table.
# NOTE(review): unlike the word-level export, no index=False is passed, so the
# row index is written as an unnamed first column - confirm this is intended.
df_agg_merged.to_csv('/media/sf_VBox_Shared/Arabic/Analyses/senses_agg_test.csv')

In [None]:
# Preview the first rows of the aggregated table.
df_agg_merged.head()

In [None]:
import numpy as np
def cast_year(val):
    """Return the leading year of ``val`` as an int, or NaN if there is none.

    ``val`` is converted to text and only the part before the first ``-`` is
    used, so a range such as ``'1234-1300'`` yields ``1234``. Whole-number
    floats are accepted as well (``1234.0`` -> ``1234``); this is what an
    integer column turns into once pandas has to store missing values in it.

    Returns ``np.nan`` for anything that cannot be read as a whole number
    (missing values, free text, an empty leading part).
    """
    try:
        head = str(val).split('-')[0]
        try:
            return int(head)
        except ValueError:
            # int('1234.0') fails; go through float so float-typed years
            # are kept instead of being turned into NaN.
            as_float = float(head)
            return int(as_float) if as_float.is_integer() else np.nan
    except (ValueError, TypeError):
        return np.nan


# Numeric version of AuthorDIEDC (one cast_year call per value), used as the
# x axis of the scatter plot.
df_agg_merged['AuthorDIEDC_int'] = df_agg_merged['AuthorDIEDC'].apply(cast_year)

In [None]:
# Column-wise maximum of the sense columns.
df_agg_merged[senses_cols].max()

In [None]:
fig, axes = plt.subplots(1)
# One colour per root column. Five roots are searched for, and the index wraps
# around so that extra columns can never raise IndexError.
colors = ['lightgreen', 'yellow', 'black', 'red', 'blue']
for i, col in enumerate(senses_cols):
    df_agg_merged.plot('AuthorDIEDC_int', col, kind='scatter', ax=axes,
                       c=colors[i % len(colors)], label=col)

# Each scatter call relabels the y axis with its own column name; use one
# label that fits all series (the values are counts divided by NumberOfTokens).
axes.set_ylabel('occurrences per token')
axes.set_ylim(0, 0.0022)  # fixed upper limit; points above it are not shown
plt.legend()
plt.show()

In [None]:
# One box plot per sense column, grouped by AuthorGeographicalArea.
# The axes object stays bound to `ax` because a later cell reads it.
for sense_col in senses_cols:
    ax = df_agg_merged.boxplot(column=sense_col, by='AuthorGeographicalArea')
    ax.set_title(sense_col)
    plt.show()

In [None]:
# UTF encoding stuff
# A sample Arabic root with its Latin transliteration, plus two invisible
# directional marks.
arab_text, latin_text = 'سمع', 'smE'
ltr_char = '\u200E'  # U+200E LEFT-TO-RIGHT MARK
rtl_char = '\u061C'  # U+061C ARABIC LETTER MARK
# text1 is the plain label; text2 wraps the Arabic part in the two marks.
plain_parts = (arab_text, latin_text)
marked_parts = (rtl_char, arab_text, ltr_char, latin_text)
text1 = '{}: ({})'.format(*plain_parts)
text2 = '{}{}{}: ({})'.format(*marked_parts)

In [None]:
# Print both label variants, one per line, for visual comparison.
for label in (text1, text2):
    print(label)

In [None]:
# 2x2 grid of empty panels, each titled with one of the label variants, to
# compare how the titles are rendered.
fig, axes = plt.subplots(2, 2)
panel_titles = {
    (0, 0): latin_text,
    (1, 0): arab_text,
    (0, 1): text1,
    (1, 1): text2,
}
for position, title in panel_titles.items():
    axes[position].set_title(title)

# Remove the x ticks on every panel.
for panel in axes.flat:
    panel.set_xticks([])
plt.show()

In [None]:
# NOTE(review): `ax` is whatever the box-plot loop assigned last; this looks
# like a leftover inspection cell - confirm it is still needed.
ax