### Importing the Packages

In [1]:
import time
import pickle
import sys
import re
import string
import collections
import codecs
from io import BytesIO

import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import matplotlib as mpl

import nltk
from nltk.corpus import stopwords
from nltk.tag.sequential import ClassifierBasedTagger
#from nltk.stem.wordnet import WordNetLemmatizer
#from nltk.tokenize import word_tokenize

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'
# Display up to 120 characters per column so long question texts are not cut off too early.
pd.set_option('display.max_colwidth', 120)

### Importing the Frage database

In [2]:
# Load the Frage (questions) table from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
frage_complete = pd.read_pickle('./frage.pickle')

In [3]:
# Preview the first rows of the identifier and question-text columns.
frage_complete.loc[
    :, ['id', 'fragebogen_num', 'frage_num', 'original_frage', 'kurzfrage']
].head()

Unnamed: 0,id,fragebogen_num,frage_num,original_frage,kurzfrage
0,1,1,A1,"Kopf: Kopf, Haupt; auch scherzh./übertr.","Kopf, Haupt; auch scherzh./übertr."
1,2,1,A2,"Kopf: Kopf/Haupt (in urspr. Bed.) in Vergl./Ra. (Kopf stehn, der Kopf möchte einem zerspringen)",Kopf/Haupt (urspr.Bed.) in Vergl./Ra.*
2,3,1,A3,"Kopf: Kopf/Haupt (übertr.) in Vergl./Ra. (das ist ein feiner/offener Kopf, jem. den Kopf verdrehen)",Kopf/Haupt (übertr.) in Vergl./Ra.*
3,4,1,A4,"Kopf: schöner, ebenmäßig gebauter Kopf (Christuskopf, Tituskopf)","schöner, ebenmäßiger Kopf (Christus-, Titusk.)"
4,5,1,A4a,"Kopf: häßlicher, unebenmäßiger Kopf","häßlicher, unebenmäßiger Kopf"


In [4]:
# Summarise dtypes and non-null counts of the identifier and question-text columns.
frage_complete.loc[
    :, ['id', 'fragebogen_num', 'frage_num', 'original_frage', 'kurzfrage']
].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16738 entries, 0 to 16743
Data columns (total 5 columns):
id                16738 non-null int64
fragebogen_num    16738 non-null int64
frage_num         16738 non-null object
original_frage    16738 non-null object
kurzfrage         16738 non-null object
dtypes: int64(2), object(3)
memory usage: 784.6+ KB


### Question patterns

See:  
- [Document1](https://docs.google.com/document/d/1x_7nZmPZRva5LI_C8KgUzloSryaqwnaQsGlP7G-_VS0/edit#)  
- [Document2](https://docs.google.com/document/d/1Tpr4TvvIwqQLS7t9xQK0EPLErbTDuF5c4a0BdwsJ9wA/edit)  
- [Document3](https://docs.google.com/spreadsheets/d/1xxb-CjJwU_P44cCFr4II-7QFXaR5FB6ofcwMZRAhSmQ/edit#gid=1846077890)  

#### Examining general patterns

In [5]:
# Alternative filter kept for reference: questions containing a literal question mark.
#frage_complete[frage_complete.original_frage.str.contains(pat = '\?')]['original_frage']

# Questions containing one of the lower-case interrogatives 'wie ', 'wer ' or 'was '
# (each followed by a space; this match is case-sensitive).
frage_complete.loc[
    frage_complete['original_frage'].str.contains('wie |wer |was ', regex=True),
    'original_frage',
]

116      Palmso.: Palmbuschen, Sachl.: woraus besteht er? (Kranewit, Wachholder, Gromelsträußlein, Waxlaub, Stechpalme, Schrä...
118      Palmso.: was geschieht m. d. Palmbuschen/-gerten? (Befestigg./Aufbewahrg. hinter Stubenkruzifixen, unter d. Dache, i...
124                                                       Palmso.: Palmesel; auch als SpottN?; Ra. (aufgeputzt wie ein Palmesel)
126      Palmso.: wird bei d. Palmeselfahrt e. Christbild in Holzplastik mitgeführt und wie heißt es?; wird Christus durch Pe...
129            Palmso.: wie heißt Woche vor d. Palmsonntag? haben Tage dieser Woche bes. Namen? (Palmfreitag, schmerzh. Freitag)
                                                                  ...                                                           
16466                                          Gewerbe: Goldarbeiter, Goldschmied; Ra. wie: Der macht's wie des Goldschmieds Bub
16472    Gewerbe: Gold; Ra. wie: Es ist nicht alles Gold, was glänzt; das ist nicht mit Gold zu b

### Defining the patterns for classification

In [6]:
# Pattern lists used to classify the questions.
#
# Every list holds substrings that are searched for in the question text.
# assign_type compares them case-insensitively, so e.g. 'Wie ' also matches a
# lower-case 'wie ' in the middle of a question; trailing spaces are part of
# the pattern. Lists that are still empty never match.

# --- Linguistic patterns ---------------------------------------------------

# Onomasiological / semasiological / illustration
onomasiological_naming = [
    'Bez.', '-bez.', 'Bezeichng.', 'wie nennt man', 'was bezeichnet',
    'wie heißen', 'wie heißt', 'Benennung', 'Benenng',
]
onomasiological_drawing = []
semasiological_meaning = ['Bedeutung', 'Bed.', 'was bedeutet']
illustration_drawing = []

# Morphology
morphological_compound_1 = ['Komp.']
morphological_compound_2 = ['Gw.']
morphological_compound_3 = ['Bw.']
morphological_diminutive = ['Dem.']
morphological_plural = ['Plural', 'Pl.']
morphological_forms = []
morphological_derivatives = ['Abl.', 'Ableitung']

# Syntax, phonology, metaphor
syntactic_constructions = ['Fügungen', 'Wendungen', 'Füg.']
syntactic_idioms = ['Sprichw.', 'Ra.']
# NOTE(review): 'Ltg.' is also listed under agreement_pronunciation, so a question
# containing it receives both tags -- confirm this is intended.
phonological_pronunciation = ['Aussprache', 'Ausspr.', 'Ltg.']
metaphorical_conveyed_meanings = ['Übertr.', 'übertragener Bedeutung']

# Synonymy
synonyms = ['Syn.', 'Synonym']
synonyms_contrast = ['Ggs.', 'Gegensatz']
synonyms_comparison = ['Vgl.', 'Vergl.', 'Vergleich']

# Agreement
agreement_yes_no = []
agreement_pronunciation = ['Ltg.', 'Lautung', 'Lautungen']

# Question words (the trailing space is part of each pattern)
temporal_question = ['Wann ']
location_question = ['Wo ']
modal_question = ['Wie ']
agent_question = ['Wer ']
causal_question = ['Wozu ']


# --- Cultural patterns -----------------------------------------------------

description_events = ['Beschr.', 'Beschreibg.', 'Beschreibung']
# NOTE(review): 'Ausdrücke' appears in description_sayings, cultural_event and
# cultural_games_2, so any question containing it receives all three tags.
# Each list is matched as "any of", not "all of" -- confirm this is intended.
description_sayings = ['Ausdrücke']
cultural_beliefs = ['Aberglaube', 'Volksaberglaube', 'Volksglaube']
cultural_event = ['Bräuche', 'Brauch', 'dabei', 'Ausdrücke']
cultural_medicine = [
    'Volksheilmittel', 'Heilmittel', 'Blutreinigungsmittel', 'volkstüml',
    'Gegenmittel',
]
cultural_prayer = ['Gebete']
cultural_songs = ['Lieder', 'Gesangs']
cultural_humor = ['Volkswitze', 'Mesnerwitze']
cultural_games_1 = ['Finger', 'Spiel', 'Kinderspiel', 'Laufspiele', 'spielen']
cultural_games_2 = ['Ausdrücke', 'Spiel']
cultural_food = [
    'zutat ', 'speise ', 'braten ', 'Gebäck ', 'Apfel ', 'Kartoffel ',
    'Essen ', 'Bier ', 'trinken ',
]
cultural_living_organisms = []
cultural_dances = ['Tänze ', 'Tanz ']


# The two lists below contain the *names* of the pattern lists above, as strings.
# assign_type looks each list up by its name and uses the name as the class label;
# the order here is the order in which labels appear in the result.
patterns_ling = [
    'onomasiological_naming',
    'onomasiological_drawing',
    'semasiological_meaning',
    'agreement_yes_no',
    'agreement_pronunciation',
    'illustration_drawing',
    'morphological_compound_1',
    'morphological_compound_2',
    'morphological_compound_3',
    'morphological_diminutive',
    'morphological_plural',
    'morphological_forms',
    'morphological_derivatives',
    'syntactic_constructions',
    'syntactic_idioms',
    'phonological_pronunciation',
    'metaphorical_conveyed_meanings',
    'synonyms',
    'synonyms_contrast',
    'synonyms_comparison',
    'temporal_question',
    'location_question',
    'modal_question',
    'agent_question',
    'causal_question',
]

patterns_cult = [
    'description_events',
    'description_sayings',
    'cultural_beliefs',
    'cultural_event',
    'cultural_medicine',
    'cultural_prayer',
    'cultural_songs',
    'cultural_humor',
    'cultural_games_1',
    'cultural_games_2',
    'cultural_food',
    'cultural_living_organisms',
    'cultural_dances',
]

### Creating function to apply patterns

In [7]:
def assign_type(entry, patterns_list):
    """Return the names of all pattern lists that match ``entry``.

    Parameters
    ----------
    entry : str
        Question text to classify.
    patterns_list : list of str
        Names of module-level lists of substrings. Each name is looked up in
        the module namespace and doubles as the class label in the result.

    Returns
    -------
    list of str or None
        The names whose list contains at least one substring found in
        ``entry`` (compared case-insensitively), in the order of
        ``patterns_list``; ``None`` when nothing matches or when ``entry`` is
        not a string (e.g. a missing value). When ``patterns_list`` equals
        ``patterns_ling`` the result is additionally passed through
        ``check_size``.

    Raises
    ------
    NameError
        If a name in ``patterns_list`` is not defined at module level.
    """
    # Missing values (NaN/None) cannot be matched; treat them as unclassified.
    if not isinstance(entry, str):
        return None

    entry_lower = entry.lower()  # lower-case once instead of once per pattern
    namespace = globals()
    question_type = None

    # Applying patterns
    for patterns in patterns_list:
        # Plain dictionary lookup instead of eval(): only names are accepted,
        # arbitrary expressions are never executed.
        if patterns not in namespace:
            raise NameError(f"name '{patterns}' is not defined")
        # An empty pattern list never matches.
        if any(pattern.lower() in entry_lower for pattern in namespace[patterns]):
            if question_type is None:
                question_type = [patterns]
            elif patterns not in question_type:
                question_type.append(patterns)

    if patterns_list == patterns_ling:
        question_type = check_size(entry, question_type)
    return question_type

def check_size(entry, question_type):
    """Tag very short questions as 'onomasiological_naming'.

    The text after the first colon of ``entry`` (the whole entry when there is
    no colon) is split on whitespace; if it has fewer than three words, the
    label 'onomasiological_naming' is added.

    Parameters
    ----------
    entry : str
        Question text.
    question_type : list of str or None
        Labels assigned so far. A list is extended in place.

    Returns
    -------
    list of str or None
        ``question_type`` with 'onomasiological_naming' included for short
        questions; otherwise ``question_type`` unchanged (possibly ``None``).
    """
    # str.find returns -1 when there is no colon, so the slice starts at 0.
    words_after_colon = entry[entry.find(':') + 1:].split()
    if len(words_after_colon) < 3:
        if question_type is None:
            question_type = ['onomasiological_naming']
        elif 'onomasiological_naming' not in question_type:
            # Do not add the label twice when a pattern already assigned it;
            # duplicates would inflate the per-class counts.
            question_type.append('onomasiological_naming')
    return question_type

### Applying patterns:

In [8]:
# Tag every question with the linguistic classes whose patterns it contains
# (None when no class applies), then report how many questions received a tag.
frage_complete['frage_ling_type'] = frage_complete['original_frage'].apply(
    lambda question: assign_type(question, patterns_ling)
)
print(f"{frage_complete['frage_ling_type'].count()} questions were classified out of {len(frage_complete)}")

9557 questions were classified out of 16738


In [9]:
# Tag every question with the cultural classes whose patterns it contains
# (None when no class applies), then report how many questions received a tag.
frage_complete['frage_cult_type'] = frage_complete['original_frage'].apply(
    lambda question: assign_type(question, patterns_cult)
)
print(f"{frage_complete['frage_cult_type'].count()} questions were classified out of {len(frage_complete)}")

1404 questions were classified out of 16738


### Analysing linguistic classes:

In [10]:
print('Number of questions per number of linguistic classes:')
# Count the labels per question; unclassified questions stay missing and
# are reported in the NaN row.
(
    frage_complete['frage_ling_type']
    .map(lambda labels: len(labels) if labels else None)
    .value_counts(dropna=False)
)

Number of questions per number of linguistic classes:


1.0    7325
NaN    7181
2.0    1398
3.0     627
4.0     165
5.0      29
6.0       8
7.0       5
Name: frage_ling_type, dtype: int64

In [11]:
print('Number of total classes assigned:\n')
# Flatten the per-question label lists (skipping unclassified questions)
# and count how often each linguistic class was assigned.
tags = collections.Counter(
    tag
    for taglist in frage_complete['frage_ling_type']
    if taglist
    for tag in taglist
)
for label, n_assigned in tags.most_common(33):
    print(f"{n_assigned} \t {label}")

Number of total classes assigned:

5198 	 onomasiological_naming
1662 	 morphological_compound_1
1491 	 syntactic_idioms
966 	 semasiological_meaning
698 	 morphological_plural
623 	 modal_question
478 	 morphological_diminutive
394 	 syntactic_constructions
242 	 phonological_pronunciation
219 	 morphological_derivatives
199 	 synonyms
156 	 agreement_pronunciation
131 	 metaphorical_conveyed_meanings
87 	 morphological_compound_2
83 	 agent_question
68 	 morphological_compound_3
61 	 location_question
60 	 temporal_question
41 	 synonyms_comparison
21 	 synonyms_contrast
12 	 causal_question


In [12]:
# Examining questions with few (<3) words:
#id_short_questions = frage_complete['original_frage'].str.split().apply(lambda x:len(x) < 3)
#frage_complete.loc[id_short_questions, ['original_frage','frage_ling_type', 'frage_cult_type']].head(10)

### Analysing cultural classes:

In [13]:
print('Number of questions per number of cultural classes:')
# Count the labels per question; unclassified questions stay missing and
# are reported in the NaN row.
(
    frage_complete['frage_cult_type']
    .map(lambda labels: len(labels) if labels else None)
    .value_counts(dropna=False)
)

Number of questions per number of cultural classes:


NaN    15334
1.0      737
3.0      444
2.0      161
4.0       56
5.0        6
Name: frage_cult_type, dtype: int64

In [14]:
print('Number of total classes assigned:\n')
# Flatten the per-question label lists (skipping unclassified questions)
# and count how often each cultural class was assigned.
tags = collections.Counter(
    tag
    for taglist in frage_complete['frage_cult_type']
    if taglist
    for tag in taglist
)
for label, n_assigned in tags.most_common(33):
    print(f"{n_assigned} \t {label}")

Number of total classes assigned:

759 	 cultural_event
605 	 cultural_games_2
467 	 description_sayings
243 	 cultural_games_1
172 	 cultural_beliefs
166 	 cultural_food
104 	 cultural_medicine
54 	 description_events
44 	 cultural_songs
16 	 cultural_prayer
8 	 cultural_dances
7 	 cultural_humor


### Examining classified questions

In [15]:
# First 20 questions that received at least one linguistic or cultural label.
(
    frage_complete
    .dropna(subset=['frage_ling_type', 'frage_cult_type'], how='all')
    .loc[:, ['original_frage', 'frage_ling_type', 'frage_cult_type']]
    .head(20)
)

Unnamed: 0,original_frage,frage_ling_type,frage_cult_type
0,"Kopf: Kopf, Haupt; auch scherzh./übertr.",[metaphorical_conveyed_meanings],
1,"Kopf: Kopf/Haupt (in urspr. Bed.) in Vergl./Ra. (Kopf stehn, der Kopf möchte einem zerspringen)","[semasiological_meaning, syntactic_idioms, synonyms_comparison]",
2,"Kopf: Kopf/Haupt (übertr.) in Vergl./Ra. (das ist ein feiner/offener Kopf, jem. den Kopf verdrehen)","[syntactic_idioms, metaphorical_conveyed_meanings, synonyms_comparison]",
6,Kopf: Wasserkopf,[onomasiological_naming],
7,Kopf: kleiner Kopf,[onomasiological_naming],
9,Kopf: langer Kopf,[onomasiological_naming],
11,Kopf: runder Kopf,[onomasiological_naming],
12,Kopf: breiter Kopf,[onomasiological_naming],
13,Kopf: schmaler Kopf,[onomasiological_naming],
14,Kopf: schiefer Kopf,[onomasiological_naming],


### Examining unclassified questions

In [16]:
#frage_complete.style.set_properties(subset=['original_frage'], **{'width': '400px'})
# First 50 questions that received neither a linguistic nor a cultural label.
frage_complete.loc[
    frage_complete[['frage_ling_type', 'frage_cult_type']].isna().all(axis=1),
    ['original_frage'],
].head(50)

Unnamed: 0,original_frage
3,"Kopf: schöner, ebenmäßig gebauter Kopf (Christuskopf, Tituskopf)"
4,"Kopf: häßlicher, unebenmäßiger Kopf"
5,Kopf: großer Kopf; großkopfig
8,Kopf: oben zugespitzter Kopf
10,"Kopf: kurzer, platter Kopf"
15,Kopf: Kopf mit dunkler Gesichtsfarbe und krausem Haar (Mohrenkopf)
16,"Kopf: mißgestalteter Kopf (Fischkopf ""Kindskopf mit fischartig aufgezogener Nase und Oberlippe"", Hasenkopf ""Kopf mit..."
17,"Kopf: Kopf mit Ungeziefer behaftet (Laus-, Nißkopf)"
18,"Kopf: Krankheiten des Kopfes (Kopfweh, -typhus)"
19,"Kopf: durch Krankheiten entstellter Kopf (Krätzen-, Grindkopf)"


### Exporting processed files 

In [17]:
# Save the question text together with both classification columns as an Excel workbook.
frage_complete[['original_frage','frage_ling_type','frage_cult_type']].to_excel('./questions_class.xlsx')

In [18]:
# Save the same three columns as a pickle file.
frage_complete[['original_frage','frage_ling_type','frage_cult_type']].to_pickle('./questions_class.pickle')