In [1]:
from os.path import join
import pandas as pd
from constants import *

# 2011
------------

<br>

### DATASETS

BLOGS: 120,000 articles

BOOKS: 1,000 books

NEWS: 29,000 articles

PUBMED: 77,000 articles

<br>

### TOPIC MODELING pipeline:

- tokenize -> lemmatize -> remove stop words -> filter out tokens w/ count < 10 in vocabulary
- BOW for each document
- 100 topic models
- average PMI score for each topic (Newman 2010b) -> filter out topics w/ score < 0.4
- filter out topics w/ fewer than 5 terms that are default nominal in Wikipedia

<br>

### RESULTS

| **topics**        | **dataset**       |
| ------------- |:-------------:|
| 45            | BLOGS         |
| 38            | BOOKS         |
| 60            | NEWS          |
| 85            | PUBMED        |
| *228*            | *sum*         |


=> 6000 labels (~27 per topic)

<br> 

#### annotation task:

per topic:
- 10 top topic terms
- 10 label suggestions
- 4 answer options

10 annotations (at least) per label candidate

-> filtered out malicious annotators

<br>

45,533 label ratings => ~4,500 successful HITs (Amazon Mechanical Turk)



In [2]:
# Load the 2011 topic-labelling data: the topics and the crowd-sourced
# ratings of each label candidate.
data_2011 = '../data/topiclabel/2011'
f_topics2011 = join(data_2011, 'topics.csv')
f_annotation2011 = join(data_2011, 'topiclabels.csv')
readme = join(data_2011, 'README.txt')
topics2011 = pd.read_csv(f_topics2011)
annotation2011 = pd.read_csv(f_annotation2011)

# Pick the rating columns once, by name (rate0, rate1, ...), and reuse that
# selection for both the count and the mean. Selecting them positionally
# (`iloc[:, 2:]`) in one place and by label slice (`'rate0':`) in another only
# agrees while the column order of the CSV stays exactly as it is.
ratings2011 = annotation2011.filter(regex=r'^rate\d+$')

# Total number of individual ratings; missing cells are not counted.
print(ratings2011.count().sum())

examples = ['2008 summer olympics', 'gothic architecture', 'israeli–palestinian conflict', 'immune system']
mask11 = annotation2011['label'].isin(examples)
# The mean is computed for the example labels only; index alignment on
# assignment leaves NaN in 'mean' for every other row.
annotation2011['mean'] = ratings2011.loc[mask11].mean(axis=1)
annotation2011.loc[mask11]

45533


Unnamed: 0,topic_id,label,rate0,rate1,rate2,rate3,rate4,rate5,rate6,rate7,rate8,rate9,rate10,rate11,rate12,rate13,mean
244,6,immune system,0,1,1,1.0,1.0,2.0,2.0,,,,,,,,1.142857
323,9,2008 summer olympics,2,2,3,3.0,3.0,,,,,,,,,,2.6
1551,45,gothic architecture,2,2,2,3.0,3.0,,,,,,,,,,2.4
1948,61,gothic architecture,1,1,2,2.0,2.0,,,,,,,,,,1.6
2334,75,gothic architecture,1,1,2,2.0,2.0,2.0,,,,,,,,,1.666667
2592,84,israeli–palestinian conflict,2,2,2,3.0,3.0,3.0,3.0,3.0,,,,,,,2.625
2662,86,immune system,1,1,1,1.0,1.0,2.0,,,,,,,,,1.166667
4615,145,immune system,0,1,2,2.0,2.0,3.0,,,,,,,,,1.666667
4645,146,immune system,1,2,2,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,,,,2.363636
6000,195,immune system,1,1,1,2.0,2.0,,,,,,,,,,1.4


In [3]:
# Every row whose label text occurs more than once (all occurrences kept),
# ordered by label so that the repeated labels sit next to each other.
(
    annotation2011
    .loc[lambda frame: frame.duplicated(subset='label', keep=False)]
    .sort_values(by='label')
)

Unnamed: 0,topic_id,label,rate0,rate1,rate2,rate3,rate4,rate5,rate6,rate7,rate8,rate9,rate10,rate11,rate12,rate13,mean
258,7,1/2 cup,1,1,1,1.0,2.0,,,,,,,,,,
381,11,1/2 cup,0,0,0,0.0,0.0,0.0,,,,,,,,,
2740,89,1/2 cup,0,0,0,0.0,1.0,1.0,,,,,,,,,
3321,105,1/2 cup,0,0,0,1.0,1.0,1.0,1.0,2.0,2.0,3.0,,,,,
259,7,1/4 cup,1,1,1,1.0,2.0,,,,,,,,,,
382,11,1/4 cup,0,0,0,0.0,0.0,0.0,0.0,,,,,,,,
2741,89,1/4 cup,0,0,0,0.0,0.0,1.0,1.0,,,,,,,,
3323,105,1/4 cup,0,0,1,1.0,1.0,,,,,,,,,,
3898,122,1960s in fashion,0,1,1,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,,,
1138,34,1960s in fashion,1,1,1,1.0,1.0,2.0,,,,,,,,,


In [5]:
# Number of topics in the 2011 set.
print(len(topics2011))
# Rows at positions 7, 11, 89 and 105 -- the topic_ids listed for the
# duplicated '1/2 cup' label candidate in the 2011 annotations.
topics2011.take([7, 11, 89, 105])

228


Unnamed: 0,topic_id,domain,term0,term1,term2,term3,term4,term5,term6,term7,term8,term9
7,7,blogs,food,eat,cook,chicken,recipe,cup,cheese,add,taste,tomato
11,11,blogs,look,white,color,black,red,hair,blue,light,green,eye
89,89,news,food,wine,restaurant,eat,drink,bottle,beer,coffee,bar,ice
105,105,news,cup,cook,minute,add,pepper,salt,serve,tablespoon,oil,sauce


# 2016
------

<br>

### DATASETS and annotation

- same datasets and topics as 2011
- very similar annotation method
- annotation quality controlled by 2011 dataset mean ratings

<br>

- 19 top candidates from unsupervised relevance ranking
- 10 annotations per label candidate (pre-filtering)
- 6.4 annotations per label candidate on average (post-filtering)


<br> 

#### annotation task:

per topic:
- 10 top topic terms
- 10 label suggestions
- 4 answer options

<br>

27,788 label ratings => ~2,800 successful HITs (CrowdFlower)

<br>

compute mean of ratings and rank accordingly


In [6]:
# Load the 2016 topic-labelling data. The annotation file is tab-separated.
data_2016 = '../data/topiclabel/2016'
f_topics2016 = join(data_2016, 'topics.csv')
f_annotation2016 = join(data_2016, 'annotated_dataset.csv')
topics2016 = pd.read_csv(f_topics2016)
annotation2016 = pd.read_csv(f_annotation2016, sep='\t')

# Pick the rating columns once, by name (annotator1, annotator2, ...), and
# reuse that selection for both the count and the mean, instead of relying on
# column position (`iloc[:, 2:]`) in one place and a label slice in another.
ratings2016 = annotation2016.filter(regex=r'^annotator\d+$')

# Total number of individual ratings; missing cells are not counted.
print(ratings2016.count().sum())

# Use a mask name of its own for the 2016 frame. Rebinding `mask11` here would
# silently replace the 2011 mask with one that indexes a different DataFrame.
# `examples` is the label list defined in the 2011 cell.
mask16 = annotation2016['label'].isin(examples)
# The mean is computed for the example labels only; index alignment on
# assignment leaves NaN in 'mean' for every other row.
annotation2016['mean'] = ratings2016.loc[mask16].mean(axis=1)
annotation2016.loc[mask16]

27788


Unnamed: 0,label,topic_id,annotator1,annotator2,annotator3,annotator4,annotator5,annotator6,annotator7,annotator8,annotator9,annotator10,mean
175,2008 summer olympics,9.0,3,3,2.0,2.0,,,,,,,2.5
855,gothic architecture,45.0,3,1,2.0,1.0,3.0,1.0,2.0,,,,1.857143
1177,gothic architecture,61.0,1,0,2.0,2.0,2.0,3.0,,,,,1.666667
1430,gothic architecture,75.0,3,2,3.0,,,,,,,,2.666667
2027,immune system,106.0,0,0,0.0,2.0,1.0,,,,,,0.6
2783,immune system,146.0,2,3,3.0,2.0,1.0,3.0,2.0,,,,2.285714
2833,immune system,149.0,1,1,2.0,2.0,1.0,1.0,,,,,1.333333
2899,immune system,152.0,2,3,2.0,1.0,1.0,,,,,,1.8
2960,immune system,155.0,1,2,1.0,2.0,1.0,,,,,,1.4
2989,immune system,157.0,2,2,2.0,2.0,1.0,2.0,,,,,1.833333


In [7]:
# Every row whose label text occurs more than once (all occurrences kept),
# ordered by label so that the repeated labels sit next to each other.
(
    annotation2016
    .loc[lambda frame: frame.duplicated(subset='label', keep=False)]
    .sort_values(by='label')
)

Unnamed: 0,label,topic_id,annotator1,annotator2,annotator3,annotator4,annotator5,annotator6,annotator7,annotator8,annotator9,annotator10,mean
683,a woman,35.0,1,2,1.0,1.0,1.0,2.0,,,,,
1383,a woman,72.0,2,2,2.0,0.0,1.0,0.0,0.0,,,,
2680,a woman,141.0,3,1,2.0,,,,,,,,
656,a woman,34.0,3,2,1.0,2.0,2.0,0.0,0.0,,,,
3404,abdominal surgery,179.0,2,2,1.0,2.0,0.0,1.0,,,,,
1018,abdominal surgery,53.0,2,1,1.0,0.0,3.0,3.0,2.0,2.0,,,
3692,acetylcholine,194.0,2,1,2.0,2.0,1.0,0.0,2.0,2.0,,,
2883,acetylcholine,151.0,2,1,3.0,0.0,2.0,,,,,,
3512,acetylcholine,184.0,2,3,2.0,2.0,2.0,1.0,1.0,1.0,,,
3853,acetylcysteine,202.0,2,0,1.0,1.0,1.0,1.0,1.0,,,,


In [8]:
# Number of topics in the 2016 set.
print(len(topics2016))
# Rows at positions 35, 72, 141 and 34 -- the topic_ids listed for the
# duplicated 'a woman' label candidate in the 2016 annotations.
topics2016.take([35, 72, 141, 34])

228


Unnamed: 0,topic_id,domain,term0,term1,term2,term3,term4,term5,term6,term7,term8,term9
35,35,blogs,child,woman,family,kid,parent,baby,mother,home,husband,young
72,72,iabooks,mr.,mrs.,young,lady,look,friend,tell,mother,miss,father
141,141,news,woman,ms.,sex,sexual,male,female,age,bill,begin,husband
34,34,blogs,wear,baby,dress,wedding,look,love,clothes,girl,kid,need
