In [1]:
from collections import Counter
import json
from pathlib import Path
from typing import List, Dict, Any

import pandas as pd

# Language Analysis results for all British Library books

In this notebook we hope to answer the following:

1. What is the distribution of languages that the book's were written in?
2. Are there any books whose volume have different languages? 
3. What is the distribution and average page length of these books? For all pages that contain text, some books contain no pages that have text only pages that contain maps. In cases where the book contains pages with no text the page length will be zero.
4. Does the number of pages change the distribution of languages?

## Distribution of languages in the books

In [2]:
def create_language_dataframe(language_count: Dict[str, str]) -> pd.DataFrame:
    '''
    :param language_count: A dictionary where the keys are language names and 
                           the values are the number of times the language 
                           has occured.
    :returns: A dataframe with three columns: `Language`, `Percentage (%)`, and 
              3. `Number of books`. Whereby this describes the number of 
              books that are in that language and the percentage of all books 
              that are in that language.
    '''
    total_language_count = sum(language_count.values())
    normalised_language_count = {key: (value / total_language_count) * 100 
                             for key, value in language_count.items()}
    language_count_data = {"Language": [], "Percentage (%)": [], 
                        "Number of books": []}
    for language, percentage in normalised_language_count.items():
        language_count_data['Language'].append(language)
        language_count_data['Percentage (%)'].append(round(percentage, 2))
        language_count_data['Number of books'].append(language_count[language])
    language_count_data_df = pd.DataFrame(language_count_data)
    return language_count_data_df.sort_values('Number of books', ascending=False, 
                                              ignore_index=True)
    


# import the language id results
language_id_results_path = Path('.', 'language_results.json').resolve()
results: List[Dict[str, Any]] = []
language_count = Counter()
with language_id_results_path.open('r') as results_fp:
    for line in results_fp:
        line = line.strip()
        if line:
            result = json.loads(line)
            results.append(result)
            language_count.update([result['language']])
number_books_processed = len(results)
print(f'Number of books that have been processed: {number_books_processed}')
create_language_dataframe(language_count)

Number of books that have been processed: 63984


Unnamed: 0,Language,Percentage (%),Number of books
0,English,76.53,48968
1,French,8.2,5244
2,German,6.68,4277
3,Spanish,1.48,948
4,Italian,1.37,874
5,Dutch,1.3,832
6,Russian,1.25,800
7,Hungarian,0.61,392
8,Swedish,0.58,369
9,Danish,0.54,343


As we can see from the results above, the majority of the books are English, but ~23% of the books are in another language, of which French is the second highest with 8.20% of the books. **Note** we can also see that 3 of the books have a language of `None`, which means that those books did not have any text.

## Do any of the books have volumes that are in different identified languages?

In [3]:
book_identifiers: List[str] = []
book_volumes: List[str] = []
languages: List[str] = []

for result in results:
    languages.append(result['language'])
    file_name = result['filename']
    identifier, volumne = file_name.split('_')[:2]
    book_identifiers.append(identifier)
    book_volumes.append(volumne)
volume_language_book_df = pd.DataFrame({'Language': languages, 
                                        'Identifier': book_identifiers, 
                                        'Volume': book_volumes})
volume_language_book_df = volume_language_book_df.groupby('Identifier').nunique()
volume_language_book_df[volume_language_book_df['Language'] > 1].sort_values('Language')

Unnamed: 0_level_0,Language,Volume
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1
000097249,2,2
001931087,2,2
002046329,2,3
002069803,2,15
002377189,2,2
...,...,...
001401151,2,3
001427051,3,11
003508992,3,3
003156564,3,4


As we can see, there are 69 books that have volumes that have more than one identified language. Some of the books have at least 3 different languages identified based on their volumes.

## Distribution and average page length of the books

In these distribution statistics we do not count pages that contain no text, any books that contain no textual pages will be removed from the statistics.

In [4]:
rounded_page_counter = Counter()
list_of_page_numbers = []
for result in results:
    page_count = 0
    # Do not add page counts of books that have no text.
    if result['language'] == None:
        continue 
    else:
        for language_details in result['language_extras'].values():
            page_count += int(language_details['count'])
        list_of_page_numbers.append(page_count)
        page_count = (page_count // 10) * 10
        rounded_page_counter.update([page_count])

total_page_count = sum(rounded_page_counter.values())
df_number_pages = []
page_counts = []
page_count_percentage = []
for number_pages, page_count in rounded_page_counter.items():
    df_number_pages.append(number_pages)
    page_counts.append(page_count)
    page_count_percentage.append(round((page_count / total_page_count) * 100, 2))

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    page_count_df = pd.DataFrame({'Number pages': df_number_pages, 'Count': page_counts, 'Percentage (%)': page_count_percentage})
    page_count_df = page_count_df.sort_values('Number pages')
    page_count_df['Cumulative Percentage (%)'] = page_count_df['Percentage (%)'].cumsum()
    page_count_df = page_count_df.set_index('Number pages')
    display(page_count_df.T)

Number pages,0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,430,440,450,460,470,480,490,500,510,520,530,540,550,560,570,580,590,600,610,620,630,640,650,660,670,680,690,700,710,720,730,740,750,760,770,780,790,800,810,820,830,840,850,860,870,880,890,900,910,920,930,940,950,960,970,980,990,1000,1010,1020,1030,1040,1050,1060,1070,1080,1090,1100,1110,1120,1130,1140,1150,1160,1170,1180,1190,1200,1210,1220,1230,1240,1250,1260,1270,1280,1290,1300,1310,1320,1330,1340,1350,1360,1370,1380,1390,1400,1410,1420,1430,1440,1450,1460,1470,1480,1490,1500,1510,1520,1530,1540,1550,1560,1570,1580,1590,1600,1620,1640,1650,1670,1680,1690,1730,1740,1760,1770,1790,1820,1830,1910,1940,2170
Count,788.0,1626.0,1519.0,1360.0,1135.0,1032.0,1099.0,1096.0,947.0,894.0,855.0,757.0,832.0,786.0,687.0,748.0,701.0,679.0,714.0,751.0,737.0,764.0,819.0,932.0,1027.0,1300.0,1218.0,1512.0,1671.0,1999.0,2270.0,2435.0,1811.0,1554.0,1242.0,1243.0,1026.0,904.0,945.0,898.0,820.0,842.0,796.0,759.0,762.0,674.0,654.0,673.0,651.0,612.0,537.0,566.0,512.0,491.0,434.0,378.0,378.0,387.0,345.0,389.0,294.0,305.0,295.0,301.0,299.0,272.0,241.0,253.0,243.0,207.0,176.0,175.0,163.0,149.0,125.0,132.0,137.0,137.0,127.0,139.0,138.0,170.0,128.0,125.0,110.0,114.0,95.0,99.0,82.0,99.0,76.0,92.0,86.0,81.0,87.0,87.0,77.0,59.0,53.0,54.0,59.0,63.0,44.0,46.0,44.0,37.0,37.0,36.0,46.0,34.0,30.0,25.0,26.0,25.0,25.0,18.0,21.0,18.0,11.0,21.0,15.0,20.0,15.0,9.0,12.0,10.0,15.0,16.0,7.0,10.0,5.0,6.0,1.0,6.0,6.0,5.0,4.0,4.0,6.0,5.0,4.0,3.0,6.0,3.0,2.0,6.0,3.0,3.0,3.0,5.0,1.0,2.0,4.0,3.0,2.0,2.0,4.0,2.0,1.0,2.0,3.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Percentage (%),1.23,2.54,2.37,2.13,1.77,1.61,1.72,1.71,1.48,1.4,1.34,1.18,1.3,1.23,1.07,1.17,1.1,1.06,1.12,1.17,1.15,1.19,1.28,1.46,1.61,2.03,1.9,2.36,2.61,3.12,3.55,3.81,2.83,2.43,1.94,1.94,1.6,1.41,1.48,1.4,1.28,1.32,1.24,1.19,1.19,1.05,1.02,1.05,1.02,0.96,0.84,0.88,0.8,0.77,0.68,0.59,0.59,0.6,0.54,0.61,0.46,0.48,0.46,0.47,0.47,0.43,0.38,0.4,0.38,0.32,0.28,0.27,0.25,0.23,0.2,0.21,0.21,0.21,0.2,0.22,0.22,0.27,0.2,0.2,0.17,0.18,0.15,0.15,0.13,0.15,0.12,0.14,0.13,0.13,0.14,0.14,0.12,0.09,0.08,0.08,0.09,0.1,0.07,0.07,0.07,0.06,0.06,0.06,0.07,0.05,0.05,0.04,0.04,0.04,0.04,0.03,0.03,0.03,0.02,0.03,0.02,0.03,0.02,0.01,0.02,0.02,0.02,0.03,0.01,0.02,0.01,0.01,0.0,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cumulative Percentage (%),1.23,3.77,6.14,8.27,10.04,11.65,13.37,15.08,16.56,17.96,19.3,20.48,21.78,23.01,24.08,25.25,26.35,27.41,28.53,29.7,30.85,32.04,33.32,34.78,36.39,38.42,40.32,42.68,45.29,48.41,51.96,55.77,58.6,61.03,62.97,64.91,66.51,67.92,69.4,70.8,72.08,73.4,74.64,75.83,77.02,78.07,79.09,80.14,81.16,82.12,82.96,83.84,84.64,85.41,86.09,86.68,87.27,87.87,88.41,89.02,89.48,89.96,90.42,90.89,91.36,91.79,92.17,92.57,92.95,93.27,93.55,93.82,94.07,94.3,94.5,94.71,94.92,95.13,95.33,95.55,95.77,96.04,96.24,96.44,96.61,96.79,96.94,97.09,97.22,97.37,97.49,97.63,97.76,97.89,98.03,98.17,98.29,98.38,98.46,98.54,98.63,98.73,98.8,98.87,98.94,99.0,99.06,99.12,99.19,99.24,99.29,99.33,99.37,99.41,99.45,99.48,99.51,99.54,99.56,99.59,99.61,99.64,99.66,99.67,99.69,99.71,99.73,99.76,99.77,99.79,99.8,99.81,99.81,99.82,99.83,99.84,99.85,99.86,99.87,99.88,99.89,99.89,99.9,99.9,99.9,99.91,99.91,99.91,99.91,99.92,99.92,99.92,99.93,99.93,99.93,99.93,99.94,99.94,99.94,99.94,99.94,99.94,99.94,99.94,99.94,99.94,99.94,99.94,99.94,99.94,99.94,99.94,99.94,99.94,99.94,99.94,99.94


In [5]:
pd.Series(list_of_page_numbers).describe()

count    63981.000000
mean       325.985058
std        229.184900
min          1.000000
25%        158.000000
50%        304.000000
75%        432.000000
max       2173.000000
dtype: float64

As we can see the mean number of pages in a book is around 325 and 50% of books have between 158 and 432 pages.

However we can see that around 10.04% of all books have less than 50 pages, which I found unusal, but more unusual is that 1.23% of books have less than 10 pages. Below we plot the number of books by page count for books with less than 10 pages:



In [6]:
page_counts_less_than_10 = [page_number for page_number in list_of_page_numbers if page_number < 10]
for page_number, count in sorted(Counter(page_counts_less_than_10).items()):
    print(f'Number pages: {page_number}, Count: {count}')

Number pages: 1, Count: 20
Number pages: 2, Count: 21
Number pages: 3, Count: 56
Number pages: 4, Count: 100
Number pages: 5, Count: 41
Number pages: 6, Count: 84
Number pages: 7, Count: 166
Number pages: 8, Count: 206
Number pages: 9, Count: 94


From this we can see that 20 books have 1 page, which is unusual I think for a book, the identifiers for the books and their volumne number can be seen below:

In [8]:
identifier_for_1_page_books = []
volumne_number_for_1_page_books = []
for result in results:
    page_count = 0
    if result['language'] == None:
        continue
    for language_details in result['language_extras'].values():
        page_count += int(language_details['count'])
    if page_count == 1:
        file_name = result['filename']
        identifier, volumne, _ = file_name.split('_')
        identifier_for_1_page_books.append(identifier)
        volumne_number_for_1_page_books.append(volumne)

for identifier, volumne_number in zip(identifier_for_1_page_books, volumne_number_for_1_page_books):
    print(f'Identifier: {identifier}, Volumne number: {volumne_number}')

Identifier: 000610007, Volumne number: 01
Identifier: 000624024, Volumne number: 02
Identifier: 000667603, Volumne number: 01
Identifier: 000667614, Volumne number: 01
Identifier: 001246393, Volumne number: 01
Identifier: 000396864, Volumne number: 01
Identifier: 001539973, Volumne number: 01
Identifier: 002296883, Volumne number: 01
Identifier: 000104417, Volumne number: 01
Identifier: 002895827, Volumne number: 01
Identifier: 003836687, Volumne number: 01
Identifier: 003859321, Volumne number: 01
Identifier: 002227296, Volumne number: 01
Identifier: 002244147, Volumne number: 01
Identifier: 002244151, Volumne number: 01
Identifier: 003275369, Volumne number: 01
Identifier: 003448079, Volumne number: 01
Identifier: 000522164, Volumne number: 01
Identifier: 001944271, Volumne number: 01
Identifier: 003052245, Volumne number: 01


Exploring the British Library collection online:

1. 000522164 -- is "In Memory of Prince Albert Victor Edward of England, Duke of Clarence, died 14 Jan. 1892. [Verses.]", which is indeed one page long and can be found [here](http://explore.bl.uk/primo_library/libweb/action/display.do?frbrVersion=2&tabs=moreTab&ct=display&fn=search&doc=BLL01014608183&indx=1&recIds=BLL01014608183&recIdxs=0&elementId=0&renderMode=poppedOut&displayMode=full&frbrVersion=2&frbg=&&dscnt=0&scp.scps=scope%3A%28BLCONTENT%29&vl(2084770704UI0)=any&tb=t&vid=BLVU1&mode=Basic&srt=rank&tab=local_tab&dum=true&vl(freeText0)=In%20Memory%20of%20Prince%20Albert%20Victor%20Edward%20of%20England%2C%20Duke%20of%20Clarence%2C%20died%2014%20Jan.%201892.&dstmp=1627839394107)
2. 000396864 -- is "Song. The Life of Love is but a Day", which again is one page long and can be found [here](http://explore.bl.uk/primo_library/libweb/action/display.do?frbrVersion=2&tabs=moreTab&ct=display&fn=search&doc=BLL01014635279&indx=1&recIds=BLL01014635279&recIdxs=0&elementId=0&renderMode=poppedOut&displayMode=full&frbrVersion=2&frbg=&&dscnt=0&scp.scps=scope%3A%28BLCONTENT%29&vl(2084770704UI0)=any&tb=t&vid=BLVU1&mode=Basic&srt=rank&tab=local_tab&dum=true&vl(freeText0)=Song.%20The%20Life%20of%20Love%20is%20but%20a%20Day&dstmp=1627839546003)
3. 000624024 -- is "Dieci anni in Equatoria e ritorno con Emin Pascia ... Con ... illustrazioni e ... carte, etc. [With a portrait.]']", but as it is in volume 2 I cannot verify if it is one page as I can only find volume one [online](http://explore.bl.uk/primo_library/libweb/action/display.do?tabs=moreTab&ct=display&fn=search&doc=BLL01014839572&indx=1&recIds=BLL01014839572&recIdxs=0&elementId=0&renderMode=poppedOut&displayMode=full&frbrVersion=&dscnt=0&vl(2084770704UI0)=any&mode=Basic&vid=BLVU1&tab=available_online&dstmp=1627842397007&frbg=&frbrVersion=&viewAllItemsClicked=false&scp.scps=scope%3A%28BLWEBSITE%29%2Cscope%3A%28BLO_WA%29%2Cscope%3A%28BLO_Aleph%29%2Cscope%3A%28BLO_SFX%29%2Cscope%3A%28BLO_SAMI%29&tb=t&srt=rank&dum=true&selectedLocation=&vl(freeText0)=Dieci%20anni%20in%20Equatoria%20e%20ritorno%20con%20Emin%20Pascia%20...%20Con%20...%20illustrazioni%20e%20...%20carte%2C%20etc.%20%5BWith%20a%20portrait.).

## Page length and the affect it may cause on language identification

Having fewer pages in a book might be affecting the outcome of the language identification model. To test this we are going to group the books by page counts, where page counts will be round down to the neareast 10 pages e.g. less than 9 pages will be grouped to 0 and less than 19 grouped to 10, this grouping will be done for all book with less than 50 pages. Once grouped we will see that the distribution of languages are for those page count groups:

In [9]:
def filter_books_by_page_count(books: List[Dict[str, Any]], 
                               min_number_pages: int,
                               max_number_pages: int
                               ) -> List[Dict[str, Any]]:
    filtered_results: List[Dict[str, Any]] = []
    for book in books:
        page_count = 0
        if book['language'] == None:
            continue
        for language_details in book['language_extras'].values():
            page_count += int(language_details['count'])
        if (page_count >= min_number_pages) and (page_count <= max_number_pages):
            filtered_results.append(book)
    return filtered_results

def books_by_page_count_group(books: List[Dict[str, Any]], min_page_count: int, 
                              max_page_count: int) -> List[str]:
    language_of_books: List[str] = []
    for book in books:
        page_count = 0
        if book['language'] == None:
            continue
        for language_details in book['language_extras'].values():
            page_count += int(language_details['count'])
        if (page_count >= min_page_count) and (page_count <= max_page_count):
            language_of_books.append(book['language'])
    return language_of_books

filtered_results = filter_books_by_page_count(results, 0, 49)

### 0-9 pages

In [10]:

create_language_dataframe(Counter(books_by_page_count_group(filtered_results, 0, 9)))

Unnamed: 0,Language,Percentage (%),Number of books
0,English,93.15,734
1,French,2.03,16
2,Italian,1.14,9
3,German,1.02,8
4,Spanish,1.02,8
5,Latin,0.51,4
6,Russian,0.13,1
7,Finnish,0.13,1
8,Greek,0.13,1
9,Dutch,0.13,1


### 10-19 pages

In [11]:
create_language_dataframe(Counter(books_by_page_count_group(filtered_results, 10, 19)))

Unnamed: 0,Language,Percentage (%),Number of books
0,English,89.24,1451
1,French,4.55,74
2,German,2.58,42
3,Italian,1.48,24
4,Dutch,0.62,10
5,Spanish,0.43,7
6,Russian,0.31,5
7,Latin,0.25,4
8,Polish,0.18,3
9,Portuguese,0.12,2


### 20-29 pages

In [12]:
create_language_dataframe(Counter(books_by_page_count_group(filtered_results, 20, 29)))

Unnamed: 0,Language,Percentage (%),Number of books
0,English,86.44,1313
1,French,5.33,81
2,German,3.75,57
3,Italian,1.78,27
4,Spanish,0.79,12
5,Latin,0.46,7
6,Dutch,0.39,6
7,Russian,0.26,4
8,Swedish,0.26,4
9,Greek,0.2,3


### 30-39 pages

In [13]:
create_language_dataframe(Counter(books_by_page_count_group(filtered_results, 30, 39)))

Unnamed: 0,Language,Percentage (%),Number of books
0,English,85.96,1169
1,German,4.41,60
2,French,4.04,55
3,Italian,1.99,27
4,Spanish,0.96,13
5,Dutch,0.74,10
6,Russian,0.51,7
7,Latin,0.51,7
8,Polish,0.22,3
9,Swedish,0.22,3


### 40-49 pages

In [14]:
create_language_dataframe(Counter(books_by_page_count_group(filtered_results, 40, 49)))

Unnamed: 0,Language,Percentage (%),Number of books
0,English,84.49,959
1,German,4.76,54
2,French,4.32,49
3,Italian,1.59,18
4,Dutch,1.15,13
5,Spanish,0.97,11
6,Latin,0.88,10
7,Russian,0.44,5
8,Danish,0.44,5
9,Swedish,0.35,4


### All other pages

In [15]:
create_language_dataframe(Counter(books_by_page_count_group(results, 50, 2000)))

Unnamed: 0,Language,Percentage (%),Number of books
0,English,75.31,43342
1,French,8.63,4969
2,German,7.05,4056
3,Spanish,1.56,897
4,Dutch,1.38,792
5,Russian,1.35,778
6,Italian,1.33,768
7,Hungarian,0.68,391
8,Swedish,0.62,356
9,Danish,0.58,333


### Conclusion of page length affecting language identification

We can see that the language idenitifcation distribution does change a bit, whereby the identification model appears to identify more English books, however this could be the case that there are more English books with fewer pages. Lastly this type of analysis does not determine that the language identification model is more accurate on fewer page it just shows that there is nothing abnormal occuring in books with fewer pages such as OCR errors on the title and index pages (which might make up a larger percentage of the book) affecting the language idenitification result. 