# Descriptive analysis of GermEval 2017 data (latest version from 2017-09-15)

In [1]:
#!/usr/bin/env python3
import os
import pandas as pd
import numpy as np
from utils import flatten_list
from seqeval_metrics import get_entities
from data_prep import bio_tagging_df

df_path = "./data/"


def _read_tsv(stem):
    """Read the tab-separated file ``<df_path><stem>.tsv`` into a DataFrame."""
    return pd.read_csv(df_path + stem + ".tsv", delimiter='\t')


# Complete pre-processed data, one frame per split.
train_df = _read_tsv("train_df")
dev_df = _read_tsv("dev_df")
# Only the synchronic test split has rows with a missing "text" removed.
test_syn_df = _read_tsv("test_syn_df").dropna(subset=["text"])
test_dia_df = _read_tsv("test_dia_df")

# "_opinion" variants of each split.
train_df_op = _read_tsv("train_df_opinion")
dev_df_op = _read_tsv("dev_df_opinion")
test_syn_df_op = _read_tsv("test_syn_df_opinion")
test_dia_df_op = _read_tsv("test_dia_df_opinion")

# "_cat" variants of each split.
train_df_cat = _read_tsv("train_df_cat")
dev_df_cat = _read_tsv("dev_df_cat")
test_syn_df_cat = _read_tsv("test_syn_df_cat")
test_dia_df_cat = _read_tsv("test_dia_df_cat")

# "_cat_pol" variants of each split.
train_df_cat_pol = _read_tsv("train_df_cat_pol")
dev_df_cat_pol = _read_tsv("dev_df_cat_pol")
test_syn_df_cat_pol = _read_tsv("test_syn_df_cat_pol")
test_dia_df_cat_pol = _read_tsv("test_dia_df_cat_pol")

# Label columns are taken from the training frames: everything from the
# sixth column onward.
# NOTE(review): presumably the first five columns are document metadata --
# confirm against the files written by data_prep.
cats = train_df_cat.columns[5:]
cats_pol = train_df_cat_pol.columns[5:]

FileNotFoundError: [Errno 2] No such file or directory: './data/train_df.tsv'

In [3]:
def count_bio_tags(df):
    """Count how often each entity label occurs in the BIO tags of ``df``.

    ``df`` is first passed through ``bio_tagging_df``; the resulting
    ``bio_tags`` column is flattened and handed to ``get_entities``. The
    first element of every item returned by ``get_entities`` is collected
    and tallied.

    Returns:
        The ``value_counts()`` Series of those first elements.
    """
    tagged = bio_tagging_df(df)
    flat_tags = flatten_list(tagged.bio_tags)
    # NOTE(review): presumably each chunk is (entity_type, start, end), so
    # chunk[0] is the entity type -- confirm in seqeval_metrics.get_entities.
    entity_labels = [chunk[0] for chunk in get_entities(flat_tags)]
    label_frame = pd.DataFrame({'bio_tags': entity_labels})
    return label_frame['bio_tags'].value_counts()


def desc_analysis(df, df_op, df_cat, df_cat_pol, cats, cats_pol):
    """Print descriptive statistics for one data split.

    All sections are computed from the frames passed in, so the function
    can be reused for the train, dev and test splits alike.

    Args:
        df: Complete frame of the split; must have the columns
            ``relevance`` and ``sentiment``.
        df_op: Opinion-level frame of the split; must have the columns
            ``aspect`` and ``aspect_polarity`` and be accepted by
            ``count_bio_tags``.
        df_cat: Frame containing the columns listed in ``cats``.
        df_cat_pol: Frame containing the columns listed in ``cats_pol``.
        cats: Column labels of ``df_cat`` to sum.
        cats_pol: Column labels of ``df_cat_pol`` to sum.

    Returns:
        None. Everything is written to stdout.
    """
    print("complete number of observations")
    print(len(df))
    print()

    print("relevance distribution") #subtask A
    print(pd.crosstab(df["relevance"], columns = "count").sort_values(by = "count", ascending = False))
    print()

    print("sentiment distribution") # subtask B
    print(pd.crosstab(df["sentiment"], columns = "count").sort_values(by = "count", ascending = False))
    print()

    print("aspect distribution of documents with opinion") # Subtask C
    print(pd.crosstab(df_op["aspect"], columns = "count").sort_values(by = "count", ascending = False))
    print()
    print(pd.crosstab(df_op['aspect_polarity'], columns = "count").sort_values(by = "count", ascending = False))
    # Use the frame of the split being analysed, not the global training
    # frame, so dev/test reports show their own row count.
    print("total documents: ", len(df_op))
    print()

    print("aspect distribution without multiple mentions")
    print(df_cat[cats].sum(axis = 0, skipna = True))
    print()
    print(df_cat_pol[cats_pol].sum(axis = 0, skipna = True))
    print()

    print("entity distribution") # Subtask D
    # Likewise count entities in the split that was passed in.
    print(count_bio_tags(df_op))
    print()


## Train data

In [4]:
# Descriptive statistics for the training split.
desc_analysis(
    df=train_df,
    df_op=train_df_op,
    df_cat=train_df_cat,
    df_cat_pol=train_df_cat_pol,
    cats=cats,
    cats_pol=cats_pol,
)



complete number of observations
19432

relevance distribution
col_0      count
relevance       
True       16201
False       3231

sentiment distribution
col_0      count
sentiment       
neutral    13208
negative    5045
positive    1179

aspect distribution of documents with opinion
col_0                         count
aspect                             
Allgemein                     12385
Zugfahrt                       2189
Sonstige_Unregelmässigkeiten   1918
Atmosphäre                     1405
Sicherheit                      859
Ticketkauf                      661
Service_und_Kundenbetreuung     504
Connectivity                    360
Informationen                   354
Auslastung_und_Platzangebot     272
DB_App_und_Website              228
Komfort_und_Ausstattung         148
Barrierefreiheit                 72
Toiletten                        49
Image                            48
Reisen_mit_Kindern               43
Gastronomisches_Angebot          43
Design                        

## Development data

In [5]:
# Descriptive statistics for the development split.
desc_analysis(
    df=dev_df,
    df_op=dev_df_op,
    df_cat=dev_df_cat,
    df_cat_pol=dev_df_cat_pol,
    cats=cats,
    cats_pol=cats_pol,
)


complete number of observations
2369

relevance distribution
col_0      count
relevance       
True        1931
False        438

sentiment distribution
col_0      count
sentiment       
neutral     1632
negative     589
positive     148

aspect distribution of documents with opinion
col_0                         count
aspect                             
Allgemein                      1507
Zugfahrt                        232
Sonstige_Unregelmässigkeiten    194
Atmosphäre                      171
Sicherheit                      103
Ticketkauf                       80
Service_und_Kundenbetreuung      47
Informationen                    34
Auslastung_und_Platzangebot      32
Connectivity                     30
DB_App_und_Website               24
Komfort_und_Ausstattung          18
Barrierefreiheit                 17
Image                             6
Toiletten                         5
Gastronomisches_Angebot           4
Design                            4
Reisen_mit_Kindern             

## Synchronic test data

In [6]:
# Descriptive statistics for the synchronic test split.
desc_analysis(
    df=test_syn_df,
    df_op=test_syn_df_op,
    df_cat=test_syn_df_cat,
    df_cat_pol=test_syn_df_cat_pol,
    cats=cats,
    cats_pol=cats_pol,
)


complete number of observations
2555

relevance distribution
col_0      count
relevance       
True        2095
False        460

sentiment distribution
col_0      count
sentiment       
neutral     1670
negative     780
positive     105

aspect distribution of documents with opinion
col_0                         count
aspect                             
Allgemein                      1573
Sonstige_Unregelmässigkeiten    373
Zugfahrt                        325
Atmosphäre                      272
Sicherheit                      197
Ticketkauf                      158
Service_und_Kundenbetreuung      79
Informationen                    79
Connectivity                     51
Auslastung_und_Platzangebot      44
DB_App_und_Website               35
Komfort_und_Ausstattung          29
Barrierefreiheit                 13
Reisen_mit_Kindern               10
Toiletten                         8
Design                            6
Gastronomisches_Angebot           6
Gepäck                         

## Diachronic test data

In [7]:
# Descriptive statistics for the diachronic test split.
desc_analysis(
    df=test_dia_df,
    df_op=test_dia_df_op,
    df_cat=test_dia_df_cat,
    df_cat_pol=test_dia_df_cat_pol,
    cats=cats,
    cats_pol=cats_pol,
)

complete number of observations
1842

relevance distribution
col_0      count
relevance       
True        1547
False        295

sentiment distribution
col_0      count
sentiment       
neutral     1237
negative     497
positive     108

aspect distribution of documents with opinion
col_0                         count
aspect                             
Allgemein                      1042
Sonstige_Unregelmässigkeiten    276
Zugfahrt                        250
Connectivity                    108
Atmosphäre                      101
Sicherheit                       80
Ticketkauf                       63
Informationen                    39
Service_und_Kundenbetreuung      29
DB_App_und_Website               26
Auslastung_und_Platzangebot      23
Toiletten                        12
Komfort_und_Ausstattung          11
Gepäck                            6
Gastronomisches_Angebot           4
Image                             4
Barrierefreiheit                  2
Design                         