In [None]:
%matplotlib inline
import os
from bs4 import BeautifulSoup
import pandas as pd
import random
import matplotlib.pyplot as plt

In [None]:
# Arabic roots for the five senses, kept as a set for membership checks.
senses_roots = {
    'سمع',
    'بصر',
    'لمس',
    'شمم',
    'ذوق',
}

## Read in the CSV files

In [None]:
# Directory holding the analysis CSVs. The default is the original VirtualBox
# shared-folder location; set the SENSES_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get('SENSES_DATA_DIR', '/media/sf_VBox_Shared/Arabic/Analyses')

# df_total: table whose `sense`, `word`, `BookSUBJ` and `BookURI` columns are
# used in the cells below (presumably one row per matched word — TODO confirm).
df_total = pd.read_csv(os.path.join(DATA_DIR, 'senses_fiqh.csv'))

# df_agg_merged: aggregated table with per-sense columns, their '_p' variants
# and book/author metadata (presumably one row per book — TODO confirm).
df_agg_merged = pd.read_csv(os.path.join(DATA_DIR, 'senses_fiqh_agg.csv'))

In [None]:
# Maps each Arabic root to the English sense label used in the data columns.
# Built from ordered pairs so the column order derived from it stays explicit.
senses_dict = dict([
    (u'بصر', 'see'),
    (u'سمع', 'hear'),
    (u'لمس', 'touch'),
    (u'شمم', 'smell'),
    (u'ذوق', 'taste'),
])

In [None]:
# English sense labels, in dictionary order; used as column names below.
senses_cols = [label for label in senses_dict.values()]
# Matching '<label>_p' column names.
senses_cols_rel = ['{}_p'.format(label) for label in senses_cols]

In [None]:
# Size of df_total as (number of rows, number of columns).
df_total.shape

In [None]:
# Number of rows per sense label.
df_total['sense'].value_counts()

In [None]:
# For every sense, print the 20 most frequent values of the `word` column.
for sense, label in senses_dict.items():
    print('Top words for {} / {}'.format(sense, label))
    words_for_sense = df_total.loc[df_total['sense'] == label, 'word']
    print(words_for_sense.value_counts().head(20))

## Per subject (school)

In [None]:
# Number of rows of the aggregated table per subject
# (i.e. books per subject, assuming one row per book — TODO confirm).
df_agg_merged['BookSUBJ'].value_counts()

In [None]:
# Distribution of senses per subject.
# Rows of `per_subj` are subjects, columns are senses, cells are row counts.
per_subj = df_total.groupby(['BookSUBJ', 'sense']).size().unstack()

# Absolute counts, stacked per subject.
per_subj.plot.bar(stacked=True)
plt.show()

# Same data with every subject's row divided by its own total, so the bars
# show the share of each sense within a subject.
per_subj.div(per_subj.sum(axis='columns'), axis='index').plot.bar(stacked=True)
plt.show()

In [None]:
# One figure per '<sense>_p' column: box plot of its values grouped by subject.
for col in senses_cols_rel:
    ax = df_agg_merged.boxplot(column=col, by='BookSUBJ')
    ax.set(title=col)
    plt.show()

## Analysis per book

In [None]:
# Number of hits (rows of df_total) per book URI.
df_total['BookURI'].value_counts()

In [None]:
# Stacked horizontal bars: one bar per book, split into the per-sense columns.
(
    df_agg_merged
    .set_index('BookURI')[list(senses_dict.values())]
    .plot.barh(stacked=True, figsize=(15, 10))
)

## Per year/century

In [None]:
import numpy as np
def cast_year(val):
    """Return the leading year of a date-like value as an ``int``.

    The value is converted to a string and everything from the first ``'-'``
    onwards is dropped, so ``'450-500'`` yields ``450``. Float-formatted
    years such as ``450.0`` or ``'450.0'`` are accepted as well and truncated
    to an integer; pandas produces such floats when a numeric column
    contains missing values, and ``int('450.0')`` alone would reject them.

    Parameters
    ----------
    val : object
        A year, a ``'start-end'`` range, or a missing/unparseable value.

    Returns
    -------
    int or float
        The parsed year, or ``np.nan`` when no year can be parsed (missing
        values, free text, infinities, or a value that starts with ``'-'``
        and therefore has an empty leading part).
    """
    try:
        head = str(val).split('-')[0]
    except (ValueError, TypeError):
        return np.nan

    try:
        return int(head)
    except (ValueError, TypeError):
        # Not a plain integer string; try the float route below.
        pass

    try:
        # int(float('nan')) raises ValueError and int(float('inf')) raises
        # OverflowError; both are reported as missing.
        return int(float(head))
    except (ValueError, TypeError, OverflowError):
        return np.nan


# Numeric version of AuthorDIEDC (NaN where no year could be parsed).
df_agg_merged['AuthorDIEDC_int'] = df_agg_merged['AuthorDIEDC'].apply(cast_year)

In [None]:
# Column-wise maximum of the '<sense>_p' columns
# (presumably per-book relative frequencies — TODO confirm against the CSV).
df_agg_merged.loc[:, senses_cols_rel].max()

In [None]:
# Overlay one scatter series per sense on a single axes:
# x = AuthorDIEDC_int (the parsed year), y = the sense's '_p' column.
fig, axes = plt.subplots(1)
colors = ['lightgreen', 'orange', 'black', 'red', 'lightblue']
for i, col in enumerate(senses_cols_rel):
    df_agg_merged.plot.scatter(
        x='AuthorDIEDC_int',
        y=col,
        ax=axes,
        c=colors[i],
        label=col,
    )

# Fixed y-range: points above 0.0022 fall outside the visible area.
axes.set_ylim(0, 0.0022)
axes.legend()
plt.show()

## Per geographical area

In [None]:
# Number of rows of the aggregated table per geographical area
# (i.e. books per area, assuming one row per book — TODO confirm).
df_agg_merged['AuthorGeographicalArea'].value_counts()

In [None]:
# One figure per '<sense>_p' column: box plot of its values grouped by the
# author's geographical area. The title is set on the axes that boxplot returns.
for col in senses_cols_rel:
    ax = df_agg_merged.boxplot(column=col, by='AuthorGeographicalArea')
    ax.set_title(col)
    plt.show()