# Gender in Norwegian novels

* How to find the distribution of gender in novels
* Are females more likely to be referred to again than males?


In [1]:
# Uncomment the line below (remove the #) and run it if gender-guesser is not installed
#!pip install gender-guesser

In [2]:
import gender_guesser.detector as gender
import dhlab.module_update as mu
import dhlab.nbtext as nb
import requests
import pandas as pd
import json
from collections import Counter
mu.update("wordbank")
import wordbank as wb
mu.css()

Updated file `C:\Users\larsj\Documents\GitHub\LingPhil-course-2021\wordbank.py`

In [3]:
%%HTML
<style>
p {font-size:1.5em}
</style>

In [4]:
# One shared gender-guesser detector; its get_gender() is called once per name candidate further down.
detect = gender.Detector()

## Build a corpus using metadata

Search using author, Dewey number, subject, translation, etc.

In [5]:
# List books by author. NOTE(review): "%" looks like a wildcard here (the pattern
# matches "Hjorth, Vigdis" in the result) — confirm against the dhlab documentation.
nb.book_corpus(author="hjort%vigdis%")

Unnamed: 0,urn,author,title,year
0,2013092606085,"Hjorth, Vigdis",Fordeler og ulemper ved å være til,2005
1,2008030300078,"Hjorth, Vigdis",Død sheriff,1995
2,2009042004056,"Hjorth, Vigdis",Hysj,1997
3,2011032508155,"Hjorth, Vigdis",17.15 til Tønsberg,2004
4,2016070748067,"Hjorth, Vigdis",Jørgen + Anne er sant,1986
5,2014012906003,"Hjorth, Vigdis",Hjulskift,2009
6,2011040808036,"Hjorth, Vigdis",17.15 til Tønsberg,2003
7,2014051906169,"Hjorth, Vigdis",Tredve dager i Sandefjord,2011
8,2009052704021,"Hjorth, Vigdis",Jørgen + Anne er sant,2003
9,2008072204021,"Hjorth, Vigdis",Død sheriff,1996


### Look up metadata

In [27]:
# "bokvalg" = book choice: the urn of the book to analyse, copied from the
# urn column of the corpus listing (here: "Død sheriff", 1996).
bokvalg = 2008072204021

In [28]:
# Show the metadata record for the chosen book as a sanity check on the urn.
nb.metadata(bokvalg)

[[2008072204021,
  'Hjorth, Vigdis',
  1996,
  'Død sheriff',
  'roman',
  'Cappelen',
  'fiction',
  'a66ba223efc91e91ca04fdc380db7909',
  'nob',
  '']]

### Collect the frequency for this book

In [29]:
# Word-frequency table for the chosen book, indexed by token with the counts in column 0.
# NOTE(review): top=0 and cutoff=0 presumably disable truncation so that the whole
# vocabulary is returned — confirm against the dhlab.nbtext documentation.
book = nb.frame(nb.get_freq(bokvalg, top=0, cutoff=0))
# Preview the 20 first rows (the most frequent tokens in the saved run).
book.head(20)

Unnamed: 0,0
?,13560
",",4548
.,2587
og,2075
det,1446
jeg,1359
i,1278
p,1218
var,1027
ikke,822


### Initial gender distribution with pronouns

Here with the subject forms of the pronouns (*han*, *hun*), which are the most frequent ones

In [30]:
# Frequencies of the pronouns 'han' (he) and 'hun' (she) in the book.
book.loc[['han', 'hun']]

Unnamed: 0,0
han,550
hun,577


### Find words with capital letters

Heuristics for a name candidate:

1. Starts with a capital letter
1. Only first letter is capital
1. It won't occur without a capital letter

In [31]:
# Name candidates, following the three heuristics listed in the markdown cell:
#   1. the token starts with a capital letter,
#   2. only the first letter is capital (the rest is lowercase),
#   3. the token never occurs in the book in all-lowercase form.
# The isinstance/isalpha checks come first so that empty or non-string index
# entries are skipped instead of raising on x[0].
# Heuristic 2 is enforced with x[1:].islower(); this also rejects single-letter
# tokens and mixed-case tokens such as "McDonald" or OCR noise like "HeI".
capitals = [x for x in book.index
            if isinstance(x, str)
            and x.isalpha()
            and x[0].isupper()
            and x[1:].islower()
            and x.lower() not in book.index]

### Take a quick look at wordbank

In [32]:
# Look the name candidates up in the word bank; in the saved run this returned an empty list.
wb.word_form_many(capitals)

[]

In [33]:
# Spot-check a few forms by hand; in the saved run only 'Per' and lowercase 'bjørn' got entries.
wb.word_form_many(['Ask', "Per", "Lars", "Bjørn", "bjørn"])

[['Per', 'subst mask appell ent ub normert'],
 ['Per', 'subst mask prop normert'],
 ['bjørn', 'subst mask appell ent ub normert']]

### Collect gender data for words in the book

In [34]:
# Pair every name candidate with the label gender-guesser assigns to it.
gender_data = list(zip(capitals, map(detect.get_gender, capitals)))

In [35]:
# Gender table: one row per name candidate, indexed by the name itself.
name_gender = pd.DataFrame(gender_data, columns = ['name', 'gender'])
gf = name_gender.set_index('name')
gf.head(30)

Unnamed: 0_level_0,gender
name,Unnamed: 1_level_1
Goran,male
Jorun,female
Atle,male
Lilly,female
Margot,female
Ulf,male
Morten,male
Bergljot,female
Judith,female
Kjell,male


### Count the different females and males

In [36]:
# Number of distinct names labelled 'female'.
is_female = gf['gender'] == 'female'
gf[is_female].count()

gender    28
dtype: int64

In [37]:
# Number of distinct names labelled 'male'.
is_male = gf['gender'] == 'male'
gf[is_male].count()

gender    46
dtype: int64

### Find names

In [38]:
# Preview the names labelled 'male'.
male_only = gf.loc[gf['gender'] == 'male']
male_only.head(20)

Unnamed: 0_level_0,gender
name,Unnamed: 1_level_1
Goran,male
Atle,male
Ulf,male
Morten,male
Kjell,male
Ingvar,male
Lars,male
Peter,male
Marshall,male
Roy,male


### Fetch some numbers

Look at the pronoun frequencies first, then transfer them to variables

In [39]:
# Frequencies of 'han' (he), 'hun' (she) and 'jeg' (I) side by side.
book.loc[['han', 'hun', 'jeg']]

Unnamed: 0,0
han,550
hun,577
jeg,1359


In [40]:
# Keep the three pronoun rows as variables for the ratio computations further down.
han, hun, jeg = (book.loc[pronoun] for pronoun in ('han', 'hun', 'jeg'))

### Count the occurrences

Each name occurs several times. Here we count how often the different males and females occur. First, let's have a look at the frequency list restricted to the name candidates (both genders). Note a possible source of error: a first name and a last name that refer to the same person may be counted twice.

In [41]:
# Frequencies of all name candidates, in the order of the gender table.
name_freqs = book.loc[gf.index]
name_freqs.head(20)

Unnamed: 0_level_0,0
name,Unnamed: 1_level_1
Goran,198
Jorun,157
Atle,110
Lilly,108
Margot,106
Ulf,104
Morten,87
Bergljot,82
Judith,72
Kjell,64


### Sum up males and females

In [42]:
# Total number of mentions of names labelled 'male'.
male_rows = gf[gf['gender'] == 'male']
males = book.loc[male_rows.index].sum()
males

0    660
dtype: int64

In [43]:
# Total number of mentions of names labelled 'female'.
female_rows = gf[gf['gender'] == 'female']
females = book.loc[female_rows.index].sum()
females

0    660
dtype: int64

### Compare with the pronouns

In [44]:
# Male name mentions per female name mention.
males/females

0    1.0
dtype: float64

In [45]:
# 'han' pronouns per 'hun' pronoun.
han/hun

0    0.953206
dtype: float64

### Are males or females more likely to be referred to by a pronoun?

In [46]:
# 'han' pronouns per male name mention.
han/males

0    0.833333
dtype: float64

In [47]:
# 'hun' pronouns per female name mention.
hun/females

0    0.874242
dtype: float64

## Exercise

Change the metadata and choose a different book