# Gender in Norwegian novels

* How to find the distribution of gender in novels
* Are females more likely to be referred to again than males?


In [1]:
# uncomment the line below (remove #) and run if gender-guesser is not installed
#!pip install gender-guesser

In [3]:
import gender_guesser.detector as gender
import dhlab.module_update as mu
import dhlab.nbtext as nb
import requests
import pandas as pd
import json
from collections import Counter
# Refresh the local copy of the `wordbank` helper module before importing it
# (the cell output reports the path of the updated wordbank.py).
mu.update("wordbank")
import wordbank as wb
# NOTE(review): presumably applies the dhlab notebook stylesheet - confirm.
mu.css()

Updated file `/mnt/disk1/Github/LingPhil-course-2021/wordbank.py`

In [4]:
%%HTML
<style>
p {font-size:1.5em}
</style>

In [5]:
# Detector used below to map a first name to a gender label
# (labels seen in this notebook: 'male', 'female', 'andy', 'unknown').
detect = gender.Detector()

## Build a corpus using metadata

Search using metadata such as author, Dewey classification, subject, translation, etc.

In [6]:
# List books whose author matches the pattern.
# NOTE(review): `%` presumably acts as a wildcard (SQL LIKE style) - confirm in the dhlab docs.
nb.book_corpus(author="knaus%karl%")

Unnamed: 0,urn,author,title,year
0,2012052905085,"Knausgård, Karl Ove",Ute av verden,2006
1,2012112638153,"Knausgård, Karl Ove",Min kamp,2011
2,2013082806088,"Knausgård, Karl Ove",Min kamp,2010
3,2011090606104,"Knausgård, Karl Ove",Ute av verden,2005
4,2014013105041,"Knausgård, Karl Ove",Min kamp,2009
5,2012111508072,"Knausgård, Karl Ove",Min kamp,2011
6,2013091306040,"Knausgård, Karl Ove",Min kamp,2009
7,2008091001001,"Knausgård, Karl Ove",Ute av verden,1999
8,2014120108004,"Knausgård, Karl Ove",Ute av verden,2002
9,2012111508022,"Knausgård, Karl Ove",Min kamp,2011


### Look up metadata for `2012112638153`

In [9]:
# Fetch the metadata record for the book with URN 2012112638153
# (one of the "Min kamp" rows in the corpus listing above).
nb.metadata(2012112638153)

[[2012112638153,
  'Knausgård, Karl Ove',
  2011,
  'Min kamp',
  'roman',
  'Oktober;Oslo : Oktober, 2009-2011',
  'novel',
  'bf0b43404b0670299aa63f1d7897b268',
  'nob',
  '']]

### Collect the frequency for this book

In [10]:
# Word-frequency table for the book: one row per token, column 0 holds the count.
# NOTE(review): top=0 and cutoff=0 presumably mean "no limit / no minimum count" - confirm.
book = nb.frame(nb.get_freq(2012112638153, top=0, cutoff=0))
# Preview the 20 first rows of the frequency table
book.head(20)

Unnamed: 0,0
",",37246
.,21018
det,15460
og,15363
i,9999
som,9011
jeg,8107
var,7246
er,6598
på,6577


### Initial gender distribution with pronouns

Here we use the subject (nominative) forms `han` and `hun`, which are the most frequent.

In [11]:
# Frequencies of the pronouns 'han' (he) and 'hun' (she)
book.loc[['han', 'hun']]

Unnamed: 0,0
han,3768
hun,2373


### Find words with capital letters

Heuristics for a name candidate:

1. Starts with a capital letter
1. Is not written entirely in capital letters
1. It won't occur without a capital letter

In [12]:
# Name candidates: tokens from the frequency list that
#   * consist only of letters - checked first, so that an empty token is
#     rejected before its first character is inspected (str.isalpha() is
#     False for the empty string),
#   * have a first character that is unchanged by upper-casing
#     (i.e. the token starts with a capital letter),
#   * are not written entirely in capitals, and
#   * do not also occur in the book in all-lowercase form.
capitals = [x for x in book.index
            if x.isalpha()
            and x.upper()[0] == x[0]
            and x.upper() != x
            and x.lower() not in book.index]

### Take a quick look at wordbank

In [13]:
# Look up the name candidates in the word bank; each hit in the output is a
# [word form, grammatical tags] pair.
wb.word_form_many(capitals)

[['Herren', 'subst prop normert'],
 ['Vårherre', 'subst prop normert'],
 ['Østen', 'subst nøyt prop normert'],
 ['Østen', 'subst prop normert'],
 ['Mosebøkene', 'subst fem appell fl be normert'],
 ['Mosebøkene', 'subst mask appell fl be normert'],
 ['Pascal', 'subst prop normert'],
 ['Ola', 'subst mask prop normert'],
 ['Na', 'symb subst normert'],
 ['Moseloven', 'subst mask prop normert'],
 ['Mosebok', 'subst fem appell ent ub normert'],
 ['Mosebok', 'subst mask appell ent ub normert'],
 ['Kr', 'symb subst normert'],
 ['Au', 'symb subst normert']]

In [14]:
# Spot-check a few hand-picked forms; forms without an entry are simply
# missing from the result (see output).
wb.word_form_many(['Ask', "Per", "Lars", "Bjørn", "bjørn"])

[['Per', 'subst mask appell ent ub normert'],
 ['Per', 'subst mask prop normert'],
 ['bjørn', 'subst mask appell ent ub normert']]

### Collect gender data for words in the book

In [15]:
# Pair every name candidate with the label the detector assigns to it
gender_data = list(zip(capitals, map(detect.get_gender, capitals)))

In [16]:
# Tabulate the (name, gender) pairs, then use the name as row index
gf = pd.DataFrame(data=gender_data, columns=['name', 'gender'])
gf = gf.set_index('name')
gf.head(30)

Unnamed: 0_level_0,gender
name,Unnamed: 1_level_1
Linda,female
Hitler,unknown
Vanja,andy
Heidi,female
Geir,male
John,male
Kubizek,unknown
Hitlers,unknown
Njaal,unknown
Gunnar,male


### Count the distinct female and male names

In [17]:
# Number of distinct name candidates labelled 'female'
gf.loc[gf['gender'].eq('female')].count()

gender    138
dtype: int64

In [18]:
# Number of distinct name candidates labelled 'male'
gf.loc[gf['gender'].eq('male')].count()

gender    303
dtype: int64

### Find names

In [19]:
# First 20 name candidates labelled 'male'
gf.loc[gf['gender'].eq('male')].head(20)

Unnamed: 0_level_0,gender
name,Unnamed: 1_level_1
Geir,male
John,male
Gunnar,male
Yngve,male
Karl,male
Adolf,male
Thomas,male
Tore,male
Hamlet,male
Moses,male


### Fetch some numbers

Look up the pronoun counts, then store them in variables for later use.

In [20]:
# Frequencies of 'han' (he), 'hun' (she) and 'jeg' (I)
book.loc[['han', 'hun', 'jeg']]

Unnamed: 0,0
han,3768
hun,2373
jeg,8107


In [21]:
# Keep the frequency row of each pronoun in its own variable
han, hun, jeg = (book.loc[word] for word in ('han', 'hun', 'jeg'))

### Count the occurrences

Each name may occur several times. Here we count how often the different names occur. First, let's have a look at the frequency list restricted to the name candidates. Note a possible source of error: a person's first name and last name may both be counted, so a single mention can count double.

In [22]:
# Frequencies of all name candidates (every row of gf, whatever its gender label)
book.loc[gf.index].head(20)

Unnamed: 0_level_0,0
name,Unnamed: 1_level_1
Linda,774
Hitler,604
Vanja,512
Heidi,456
Geir,359
John,297
Kubizek,172
Hitlers,172
Njaal,124
Gunnar,123


### Sum up males and females

In [23]:
# Total number of occurrences of all names labelled 'male'
males = (
    book
    .loc[gf.loc[gf['gender'].eq('male')].index]
    .sum()
)
males

0    2151
dtype: int64

In [24]:
# Total number of occurrences of all names labelled 'female'
females = (
    book
    .loc[gf.loc[gf['gender'].eq('female')].index]
    .sum()
)
females

0    1777
dtype: int64

### Compare with the pronouns

In [25]:
# Ratio of male-name occurrences to female-name occurrences
males/females

0    1.210467
dtype: float64

In [26]:
# Ratio of 'han' occurrences to 'hun' occurrences
han/hun

0    1.587863
dtype: float64

### Greater chance of referring to males than females

In [27]:
# Occurrences of 'han' per occurrence of a male name
han/males

0    1.751743
dtype: float64

In [28]:
# Occurrences of 'hun' per occurrence of a female name
hun/females

0    1.335397
dtype: float64

## Exercise

Change the metadata and choose a different book

In [40]:
# Check how the detector labels a single name
detect.get_gender("Gisle")

'male'