In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x11de33790>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x11dd16d00>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x11dd16ca0>)]

In [4]:
nlp = spacy.load("en_core_web_sm", disable=['ner'])

In [5]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x11ee97af0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x11d9f4a00>)]

In [6]:
nlp.pipe_names


['tagger', 'parser']

In [7]:
text = "What kind of sentence is this?"


In [8]:
doc = nlp(text)


In [9]:
parse = [(t.i, t, t.pos_, t.tag_, t.dep_, t.head) for t in doc]


In [10]:
parse

[(0, What, 'DET', 'WDT', 'det', kind),
 (1, kind, 'NOUN', 'NN', 'attr', is),
 (2, of, 'ADP', 'IN', 'prep', kind),
 (3, sentence, 'NOUN', 'NN', 'pobj', of),
 (4, is, 'AUX', 'VBZ', 'ROOT', is),
 (5, this, 'DET', 'DT', 'nsubj', is),
 (6, ?, 'PUNCT', '.', 'punct', is)]

In [11]:
def is_wh_question_v1(doc):
    wh_phrases = ["what", "where", "when", "who", "whom", "how", "why"]
    sent_init_token = doc[0]
    return sent_init_token.text in wh_phrases

def is_polar_question_v1(doc):
    sent_init_token = doc[0]
    return sent_init_token.dep_ == "aux"  # covers both auxiliary & modal verbs

In [12]:
def is_wh_question_v2(doc):
    # "What is your name?"
    wh_tags = ["WDT", "WP", "WP$", "WRB"]
    wh_words = [t for t in doc if t.tag_ in wh_tags]
    sent_initial_is_wh = wh_words and wh_words[0].i == 0

    # Include pied-piped constructions: "To whom did she read the article?"
    pied_piped = wh_words and wh_words[0].head.dep_ == "prep"

    # Exclude pseudoclefts: "What you say is impossible."
    pseudocleft = wh_words and wh_words[0].head.dep_ in ["csubj", "advcl"]
    if pseudocleft:
        return False

    return sent_initial_is_wh or pied_piped


In [13]:
def _is_subject(tok):
    subject_deps = {"csubj", "nsubj", "nsubjpass"}
    return tok.dep_ in subject_deps

def is_polar_question_v2(doc):
    root = [t for t in doc if t.dep_ == "ROOT"][0]  # every spaCy parse as a root token!
    subj = [t for t in root.children if _is_subject(t)]

    if is_wh_question_v2(doc):
        return False

    # Type I: In a non-copular sentence, "is" is an aux.
    # "Is she using spaCy?" or "Can you read that article?"
    aux = [t for t in root.lefts if t.dep_ == "aux"]
    if subj and aux:
        return aux[0].i < subj[0].i

    # Type II: In a copular sentence, "is" is the main verb.
    # "Is the mouse dead?"
    root_is_inflected_copula = root.pos_ == "VERB" and root.tag_ != "VB"
    if subj and root_is_inflected_copula:
        return root.i < subj[0].i

    return False


In [14]:
def get_question_type(sentences):
    for sent in sentences:
        doc = nlp(sent)
        is_wh = is_wh_question_v2(doc)
        is_polar = is_polar_question_v2(doc)
        if is_wh:
            print(sent, "-- wh")
        elif is_polar:
            print(sent, "-- polar")
        else:
            print(sent, "-- not a question")


In [17]:
sentences = ["Is that for real?", "Can you stop?", "Do you love John?", "How do you know him?"]

In [18]:
get_question_type(sentences)

Is that for real? -- not a question
Can you stop? -- polar
Do you love John? -- polar
How do you know him? -- wh
