In [1]:
from collections import Counter
import json
from pathlib import Path
from typing import List, Dict, Any

import pandas as pd

# Language Analysis results of book from the 1890's

In this notebook we hope to answer the following:

1. What is the distribution of languages that the book's were written in?
2. Are there any books whose volume have different languages? 
3. What is the distribution and average page length of these books?
4. Does the number of pages change the distribution of languages?

## Distribution of languages in the 1890's books

In [2]:
def create_language_dataframe(language_count: Dict[str, str]) -> pd.DataFrame:
    '''
    :param language_count: A dictionary where the keys are language names and 
                           the values are the number of times the language 
                           has occured.
    :returns: A dataframe with three columns: `Language`, `Percentage (%)`, and 
              3. `Number of books`. Whereby this describes the number of 
              books that are in that language and the percentage of all books 
              that are in that language.
    '''
    total_language_count = sum(language_count.values())
    normalised_language_count = {key: (value / total_language_count) * 100 
                             for key, value in language_count.items()}
    language_count_data = {"Language": [], "Percentage (%)": [], 
                        "Number of books": []}
    for language, percentage in normalised_language_count.items():
        language_count_data['Language'].append(language)
        language_count_data['Percentage (%)'].append(round(percentage, 2))
        language_count_data['Number of books'].append(language_count[language])
    language_count_data_df = pd.DataFrame(language_count_data)
    return language_count_data_df.sort_values('Percentage (%)', ascending=False, 
                                              ignore_index=True)
    


# import the language id results
language_id_results_path = Path('.', 'language_results_1890.json').resolve()
results: List[Dict[str, Any]] = []
language_count = Counter()
with language_id_results_path.open('r') as results_fp:
    for line in results_fp:
        line = line.strip()
        if line:
            result = json.loads(line)
            results.append(result)
            language_count.update([result['language']])
number_books_processed = len(results)
print(f'Number of books that have been processed: {number_books_processed}')
create_language_dataframe(language_count)

Number of books that have been processed: 14280


Unnamed: 0,Language,Percentage (%),Number of books
0,English,79.96,11419
1,French,7.19,1027
2,German,5.58,797
3,Spanish,1.4,200
4,Italian,1.28,183
5,Dutch,0.83,118
6,Hungarian,0.68,97
7,Russian,0.67,96
8,Danish,0.57,82
9,Swedish,0.53,76


As we can see from the results above, the majority of the books are English, but ~20% of the books are in another language, of which French is the second highest with 7.19% of the books.

## Do any of the books have volumes that are in different identified languages?

In [3]:
book_identifiers: List[str] = []
book_volumes: List[str] = []
languages: List[str] = []

for result in results:
    languages.append(result['language'])
    file_name = result['filename']
    identifier, volumne = file_name.split('_')[:2]
    book_identifiers.append(identifier)
    book_volumes.append(volumne)
volume_language_book_df = pd.DataFrame({'Language': languages, 
                                        'Identifier': book_identifiers, 
                                        'Volume': book_volumes})
volume_language_book_df = volume_language_book_df.groupby('Identifier').nunique()
volume_language_book_df[volume_language_book_df['Language'] > 1]

Unnamed: 0_level_0,Language,Volume
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1
228355,2,2
459957,2,4
624024,2,2
1671636,2,2
1691448,2,5
2654658,2,2
2722435,2,2
3376382,2,2


As we can see, there are 8 books that have volumes have been identified in two different languages.

## Distribution and average page length of the 1890's books

In [4]:
rounded_page_counter = Counter()
list_of_page_numbers = []
for result in results:
    page_count = 0
    for language_details in result['language_extras'].values():
        page_count += int(language_details['count'])
    list_of_page_numbers.append(page_count)
    page_count = (page_count // 10) * 10
    rounded_page_counter.update([page_count])

total_page_count = sum(rounded_page_counter.values())
df_number_pages = []
page_counts = []
page_count_percentage = []
for number_pages, page_count in rounded_page_counter.items():
    df_number_pages.append(number_pages)
    page_counts.append(page_count)
    page_count_percentage.append(round((page_count / total_page_count) * 100, 2))

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    page_count_df = pd.DataFrame({'Number pages': df_number_pages, 'Count': page_counts, 'Percentage (%)': page_count_percentage})
    page_count_df = page_count_df.sort_values('Number pages')
    page_count_df['Cumulative Percentage (%)'] = page_count_df['Percentage (%)'].cumsum()
    page_count_df = page_count_df.set_index('Number pages')
    display(page_count_df.T)

Number pages,0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,430,440,450,460,470,480,490,500,510,520,530,540,550,560,570,580,590,600,610,620,630,640,650,660,670,680,690,700,710,720,730,740,750,760,770,780,790,800,810,820,830,840,850,860,870,880,890,900,910,920,930,940,950,960,980,990,1000,1010,1020,1030,1040,1050,1060,1070,1080,1090,1100,1110,1120,1130,1140,1150,1160,1170,1180,1190,1200,1210,1220,1240,1270,1300,1370,1380,1420,1530,1600,1790
Count,109.0,195.0,182.0,188.0,141.0,176.0,163.0,152.0,168.0,186.0,197.0,175.0,226.0,213.0,185.0,188.0,195.0,196.0,211.0,234.0,251.0,248.0,294.0,329.0,379.0,465.0,395.0,393.0,461.0,477.0,543.0,629.0,425.0,435.0,352.0,389.0,312.0,257.0,288.0,252.0,211.0,210.0,151.0,158.0,183.0,143.0,136.0,131.0,123.0,108.0,99.0,90.0,82.0,74.0,72.0,50.0,59.0,62.0,48.0,58.0,41.0,43.0,36.0,32.0,42.0,26.0,27.0,44.0,50.0,33.0,25.0,26.0,22.0,24.0,13.0,15.0,25.0,18.0,20.0,19.0,17.0,13.0,9.0,10.0,14.0,10.0,6.0,5.0,2.0,10.0,8.0,4.0,4.0,7.0,5.0,6.0,4.0,1.0,6.0,5.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,1.0,4.0,1.0,3.0,1.0,3.0,2.0,1.0,1.0,3.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Percentage (%),0.76,1.37,1.27,1.32,0.99,1.23,1.14,1.06,1.18,1.3,1.38,1.23,1.58,1.49,1.3,1.32,1.37,1.37,1.48,1.64,1.76,1.74,2.06,2.3,2.65,3.26,2.77,2.75,3.23,3.34,3.8,4.4,2.98,3.05,2.46,2.72,2.18,1.8,2.02,1.76,1.48,1.47,1.06,1.11,1.28,1.0,0.95,0.92,0.86,0.76,0.69,0.63,0.57,0.52,0.5,0.35,0.41,0.43,0.34,0.41,0.29,0.3,0.25,0.22,0.29,0.18,0.19,0.31,0.35,0.23,0.18,0.18,0.15,0.17,0.09,0.11,0.18,0.13,0.14,0.13,0.12,0.09,0.06,0.07,0.1,0.07,0.04,0.04,0.01,0.07,0.06,0.03,0.03,0.05,0.04,0.04,0.03,0.01,0.04,0.04,0.01,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.03,0.01,0.02,0.01,0.02,0.01,0.01,0.01,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
Cumulative Percentage (%),0.76,2.13,3.4,4.72,5.71,6.94,8.08,9.14,10.32,11.62,13.0,14.23,15.81,17.3,18.6,19.92,21.29,22.66,24.14,25.78,27.54,29.28,31.34,33.64,36.29,39.55,42.32,45.07,48.3,51.64,55.44,59.84,62.82,65.87,68.33,71.05,73.23,75.03,77.05,78.81,80.29,81.76,82.82,83.93,85.21,86.21,87.16,88.08,88.94,89.7,90.39,91.02,91.59,92.11,92.61,92.96,93.37,93.8,94.14,94.55,94.84,95.14,95.39,95.61,95.9,96.08,96.27,96.58,96.93,97.16,97.34,97.52,97.67,97.84,97.93,98.04,98.22,98.35,98.49,98.62,98.74,98.83,98.89,98.96,99.06,99.13,99.17,99.21,99.22,99.29,99.35,99.38,99.41,99.46,99.5,99.54,99.57,99.58,99.62,99.66,99.67,99.69,99.7,99.71,99.72,99.73,99.74,99.75,99.78,99.79,99.81,99.82,99.84,99.85,99.86,99.87,99.89,99.9,99.91,99.92,99.93,99.94,99.95,99.96,99.97,99.98,99.99,100.0,100.01,100.02,100.03


In [5]:
pd.Series(list_of_page_numbers).describe()

count    14280.000000
mean       301.701891
std        172.133592
min          1.000000
25%        194.000000
50%        295.000000
75%        379.000000
max       1798.000000
dtype: float64

As we can see the mean number of pages in a book is around 300 and 50% of books have between 194 and 379 pages.

However we can see that around 5.7% of all books have less than 50 pages, which I found unusal, but more unusual is that 0.76% of books have less than 10 pages. Below we plot the number of books by page count for books with less than 10 pages:



In [6]:
page_counts_less_than_10 = [page_number for page_number in list_of_page_numbers if page_number < 10]
for page_number, count in sorted(Counter(page_counts_less_than_10).items()):
    print(f'Number pages: {page_number}, Count: {count}')

Number pages: 1, Count: 3
Number pages: 2, Count: 5
Number pages: 3, Count: 15
Number pages: 4, Count: 18
Number pages: 5, Count: 8
Number pages: 6, Count: 12
Number pages: 7, Count: 17
Number pages: 8, Count: 9
Number pages: 9, Count: 22


From this we can see that 3 books have 1 page, which is unusual I think for a book, the identifiers for the books and their volumne number can be seen below:

In [7]:
identifier_for_1_page_books = []
volumne_number_for_1_page_books = []
for result in results:
    page_count = 0
    for language_details in result['language_extras'].values():
        page_count += int(language_details['count'])
    if page_count == 1:
        file_name = result['filename']
        identifier, volumne, _ = file_name.split('_')
        identifier_for_1_page_books.append(identifier)
        volumne_number_for_1_page_books.append(volumne)

for identifier, volumne_number in zip(identifier_for_1_page_books, volumne_number_for_1_page_books):
    print(f'Identifier: {identifier}, Volumne number: {volumne_number}')

Identifier: 000624024, Volumne number: 02
Identifier: 000396864, Volumne number: 01
Identifier: 000522164, Volumne number: 01


Exploring the British Library collection online:

1. 000522164 -- is "In Memory of Prince Albert Victor Edward of England, Duke of Clarence, died 14 Jan. 1892. [Verses.]", which is indeed one page long and can be found [here](http://explore.bl.uk/primo_library/libweb/action/display.do?frbrVersion=2&tabs=moreTab&ct=display&fn=search&doc=BLL01014608183&indx=1&recIds=BLL01014608183&recIdxs=0&elementId=0&renderMode=poppedOut&displayMode=full&frbrVersion=2&frbg=&&dscnt=0&scp.scps=scope%3A%28BLCONTENT%29&vl(2084770704UI0)=any&tb=t&vid=BLVU1&mode=Basic&srt=rank&tab=local_tab&dum=true&vl(freeText0)=In%20Memory%20of%20Prince%20Albert%20Victor%20Edward%20of%20England%2C%20Duke%20of%20Clarence%2C%20died%2014%20Jan.%201892.&dstmp=1627839394107)
2. 000396864 -- is "Song. The Life of Love is but a Day", which again is one page long and can be found [here](http://explore.bl.uk/primo_library/libweb/action/display.do?frbrVersion=2&tabs=moreTab&ct=display&fn=search&doc=BLL01014635279&indx=1&recIds=BLL01014635279&recIdxs=0&elementId=0&renderMode=poppedOut&displayMode=full&frbrVersion=2&frbg=&&dscnt=0&scp.scps=scope%3A%28BLCONTENT%29&vl(2084770704UI0)=any&tb=t&vid=BLVU1&mode=Basic&srt=rank&tab=local_tab&dum=true&vl(freeText0)=Song.%20The%20Life%20of%20Love%20is%20but%20a%20Day&dstmp=1627839546003)
3. 000624024 -- is "Dieci anni in Equatoria e ritorno con Emin Pascia ... Con ... illustrazioni e ... carte, etc. [With a portrait.]']", but as it is in volume 2 I cannot verify if it is one page as I can only find volume one [online](http://explore.bl.uk/primo_library/libweb/action/display.do?tabs=moreTab&ct=display&fn=search&doc=BLL01014839572&indx=1&recIds=BLL01014839572&recIdxs=0&elementId=0&renderMode=poppedOut&displayMode=full&frbrVersion=&dscnt=0&vl(2084770704UI0)=any&mode=Basic&vid=BLVU1&tab=available_online&dstmp=1627842397007&frbg=&frbrVersion=&viewAllItemsClicked=false&scp.scps=scope%3A%28BLWEBSITE%29%2Cscope%3A%28BLO_WA%29%2Cscope%3A%28BLO_Aleph%29%2Cscope%3A%28BLO_SFX%29%2Cscope%3A%28BLO_SAMI%29&tb=t&srt=rank&dum=true&selectedLocation=&vl(freeText0)=Dieci%20anni%20in%20Equatoria%20e%20ritorno%20con%20Emin%20Pascia%20...%20Con%20...%20illustrazioni%20e%20...%20carte%2C%20etc.%20%5BWith%20a%20portrait.).

## Page length and the affect it may cause on language identification

Having fewer pages in a book might be affecting the outcome of the language identification model. To test this we are going to group the books by page counts, where page counts will be round down to the neareast 10 pages e.g. less than 9 pages will be grouped to 0 and less than 19 grouped to 10, this grouping will be done for all book with less than 50 pages. Once grouped we will see that the distribution of languages are for those page count groups:

In [8]:
def filter_books_by_page_count(books: List[Dict[str, Any]], 
                               min_number_pages: int,
                               max_number_pages: int
                               ) -> List[Dict[str, Any]]:
    filtered_results: List[Dict[str, Any]] = []
    for book in books:
        page_count = 0
        for language_details in book['language_extras'].values():
            page_count += int(language_details['count'])
        if (page_count >= min_number_pages) and (page_count <= max_number_pages):
            filtered_results.append(book)
    return filtered_results

def books_by_page_count_group(books: List[Dict[str, Any]], min_page_count: int, 
                              max_page_count: int) -> List[str]:
    language_of_books: List[str] = []
    for book in books:
        page_count = 0
        for language_details in book['language_extras'].values():
            page_count += int(language_details['count'])
        if (page_count >= min_page_count) and (page_count <= max_page_count):
            language_of_books.append(book['language'])
    return language_of_books

filtered_results = filter_books_by_page_count(results, 0, 49)

### 0-9 pages

In [9]:

create_language_dataframe(Counter(books_by_page_count_group(filtered_results, 0, 9)))

Unnamed: 0,Language,Percentage (%),Number of books
0,English,91.74,100
1,French,5.5,6
2,Russian,0.92,1
3,Spanish,0.92,1
4,Danish,0.92,1


### 10-19 pages

In [10]:
create_language_dataframe(Counter(books_by_page_count_group(filtered_results, 10, 19)))

Unnamed: 0,Language,Percentage (%),Number of books
0,English,86.15,168
1,French,7.18,14
2,German,2.05,4
3,Dutch,2.05,4
4,Italian,1.54,3
5,Russian,1.03,2


### 20-29 pages

In [11]:
create_language_dataframe(Counter(books_by_page_count_group(filtered_results, 20, 29)))

Unnamed: 0,Language,Percentage (%),Number of books
0,English,81.32,148
1,French,7.69,14
2,German,3.85,7
3,Italian,3.3,6
4,Spanish,1.1,2
5,Dutch,1.1,2
6,Greek,0.55,1
7,Portuguese,0.55,1
8,Swedish,0.55,1


### 30-39 pages

In [12]:
create_language_dataframe(Counter(books_by_page_count_group(filtered_results, 30, 39)))

Unnamed: 0,Language,Percentage (%),Number of books
0,English,82.45,155
1,French,5.85,11
2,German,4.79,9
3,Spanish,2.13,4
4,Italian,1.6,3
5,Dutch,1.06,2
6,Latin,1.06,2
7,Portuguese,0.53,1
8,Russian,0.53,1


### 40-49 pages

In [13]:
create_language_dataframe(Counter(books_by_page_count_group(filtered_results, 40, 49)))

Unnamed: 0,Language,Percentage (%),Number of books
0,English,84.4,119
1,French,6.38,9
2,German,4.26,6
3,Italian,2.84,4
4,Spanish,0.71,1
5,Swedish,0.71,1
6,Greek,0.71,1


### All other pages

In [14]:
create_language_dataframe(Counter(books_by_page_count_group(results, 50, 2000)))

Unnamed: 0,Language,Percentage (%),Number of books
0,English,79.68,10729
1,French,7.23,973
2,German,5.73,771
3,Spanish,1.43,192
4,Italian,1.24,167
5,Dutch,0.82,110
6,Hungarian,0.72,97
7,Russian,0.68,92
8,Danish,0.6,81
9,Swedish,0.55,74


### Conclusion of page length affecting language identification

We can see that the language idenitifcation distribution does not change much when we compare books that have 50 pages or more to those with less. Even in the extreme case of less than 10 pages the most common language is English and the second most common is French, which is what we see in the results of those books with more than 50 pages. However with fewer pages there does seem to be more books that are English. Lastly this type of analysis does not determine that the language identification model is more accurate on fewer page it just shows that there is nothing abnormal occuring in books with fewer pages such as OCR errors on the title and index pages (which might make up a larger percentage of the book) affecting the language idenitification result. 