# Gender in norwegian novels

* How to find the distribution of gender in novels
* Are females more likely to be referred to again than males?


In [1]:
# uncomment the line below (remove #) and run of gender-guesser is not installed
#!pip install gender-guesser

In [2]:
import gender_guesser.detector as gender
import dhlab.module_update as mu
import dhlab.nbtext as nb
import requests
import pandas as pd
import json
from collections import Counter
mu.update("wordbank")
import wordbank as wb
mu.css()

Updated file `C:\Users\larsj\Documents\GitHub\LingPhil-course-2021\wordbank.py`

In [3]:
%%HTML
<style>
p {font-size:1.5em}
</style>

In [4]:
detect = gender.Detector()

## Build a corpus using metadata

search using author dewey, subject translation etc.

In [29]:
nb.book_corpus(author="jacobsen%roy%")

Unnamed: 0,urn,author,title,year
0,2014090908205,"Jacobsen, Roy",Hoggerne,2006
1,2013010806024,"Jacobsen, Roy",Marions slør,2007
2,2008011404093,"Jacobsen, Roy",Seierherrene,1993
3,2010110808153,"Jacobsen, Roy",Ursula,1990
4,2014101408132,"Jacobsen, Roy",Seierherrene,2009
5,2010110908095,"Jacobsen, Roy",Tommy,1990
6,2010121406046,"Jacobsen, Roy",Virgo,1992
7,2014052108045,"Jacobsen, Roy",Det nye vinduet,2004
8,2012062205031,"Jacobsen, Roy",Hjertetrøbbel,1984
9,2008112400050,"Jacobsen, Roy",Fugler og soldater,2001


### Look up metadata

In [30]:
bokvalg = 2008011404093

In [31]:
nb.metadata(bokvalg)

[[2008011404093,
  'Jacobsen, Roy',
  1993,
  'Seierherrene',
  '',
  'Cappelen',
  'fiction',
  '3e9b300b3744bbd289b5e7a9abb4b153',
  'nob',
  '']]

### Collect the frequency for this book

In [32]:
book = nb.frame(nb.get_freq(bokvalg, top=0, cutoff=0))
book.head(20)

Unnamed: 0,0
",",15402
og,7893
.,6941
i,5082
det,4973
på,4126
som,3823
-,3769
er,3231
ikke,3052


### Initial gender distribution with pronouns

Her with the accusative forms - most frequent

In [33]:
book.loc[['han', 'hun']]

Unnamed: 0,0
han,2091
hun,1164


### Find words with capital letters

Heuristics for a name candidate:

1. Starts with a capital letter
1. Only first letter is capital
1. It won't occur without a capital letter

In [34]:
capitals = [x for x in book.index 
            if x.upper()[0] == x[0] 
            and x.upper() != x 
            and x.isalpha() 
            and not x.lower() in book.index]

### Take a quick look at wordbank

In [35]:
wb.word_form_many(capitals)

[['Mo', 'symb subst normert'],
 ['Diss', 'subst mask appell ent ub normert'],
 ['LPene', 'subst mask appell fl be normert'],
 ['Fortran', 'subst prop normert'],
 ['Fm', 'symb subst normert'],
 ['Co', 'symb subst normert'],
 ['Am', 'symb subst normert']]

In [36]:
wb.word_form_many(['Ask', "Per", "Lars", "Bjørn", "bjørn"])

[['Per', 'subst mask appell ent ub normert'],
 ['Per', 'subst mask prop normert'],
 ['bjørn', 'subst mask appell ent ub normert']]

### Collect gender data for words in the book

In [37]:
gender_data = [(c, detect.get_gender(c)) for c in capitals]

In [38]:
gf = pd.DataFrame(gender_data, columns = ['name', 'gender']).set_index('name')
gf.head(30)

Unnamed: 0_level_0,gender
name,Unnamed: 1_level_1
Marta,female
Harald,male
Jannik,mostly_male
Rogern,unknown
Frank,male
Gunnar,male
Snefnugget,unknown
Arvid,male
Liljan,male
Sigøyneren,unknown


### Count the different males

In [39]:
gf[gf['gender'] == 'female'].count()

gender    60
dtype: int64

In [40]:
gf[gf['gender'] == 'male'].count()

gender    177
dtype: int64

### Find names

In [41]:
gf[gf['gender'] == 'male'].head(20)

Unnamed: 0_level_0,gender
name,Unnamed: 1_level_1
Harald,male
Frank,male
Gunnar,male
Arvid,male
Liljan,male
Johan,male
Ove,male
Peer,male
Jan,male
Oscar,male


### Fetch some numbers

Have a look at transfer to variables

In [42]:
book.loc[['han', 'hun', 'jeg']]

Unnamed: 0,0
han,2091
hun,1164
jeg,1471


In [43]:
han = book.loc['han']
hun = book.loc['hun']
jeg = book.loc['jeg']

### Count the occurrences

Each name occurs a couple of times. Here we count how often the different males occur. First. let's have a look at the dataframe of male names, the frequency list. Note possible source of error, first name and last name may count double

In [44]:
book.loc[gf.index].head(20)

Unnamed: 0_level_0,0
name,Unnamed: 1_level_1
Marta,545
Harald,446
Jannik,336
Rogern,198
Frank,146
Gunnar,133
Snefnugget,132
Arvid,122
Liljan,106
Sigøyneren,102


### Sum up males and females

In [45]:
males = book.loc[gf[gf['gender'] == 'male'].index].sum()
males

0    2138
dtype: int64

In [46]:
females = book.loc[gf[gf['gender'] == 'female'].index].sum()
females

0    1072
dtype: int64

### Compare with the pronouns

In [47]:
males/females

0    1.994403
dtype: float64

In [48]:
han/hun

0    1.796392
dtype: float64

### Greater chance of referring to males than females

In [49]:
han/males

0    0.978017
dtype: float64

In [50]:
hun/females

0    1.085821
dtype: float64

## Exercise

Change the metadata and choose a different book