In [None]:
import spacy
import pandas as pd

# Load the spaCy pipeline once at module level; every extract_* function in
# this notebook parses its input by calling this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [3]:
def extract_mode(text):
    """Flag whether ``text`` contains a modality cue.

    The text is parsed with the module-level ``nlp`` pipeline and scanned
    token by token. A token counts as a cue when it is

    * tagged ``AUX`` (any auxiliary, whatever its lemma),
    * a ``VERB`` whose lemma is one of the listed modal verbs,
    * an ``ADV`` whose lemma is one of the listed adverbs, or
    * a ``VERB`` that is the dependency ROOT and the first token of the doc.

    Args:
        text: The sentence to analyse.

    Returns:
        int: 1 if at least one token is a cue, otherwise 0.
    """
    modal_lemmas = ("can", "could", "may", "might", "must", "shall", "should", "will", "would")
    modal_adverbs = ("necessarily", "probably", "possibly", "perhaps", "certainly")

    def is_cue(tok):
        part_of_speech = tok.pos_
        if part_of_speech == "AUX":
            return True
        if part_of_speech == "VERB":
            if tok.lemma_ in modal_lemmas:
                return True
            # Root verb in first position (presumably an imperative).
            return tok.dep_ == "ROOT" and tok.i == 0
        return part_of_speech == "ADV" and tok.lemma_ in modal_adverbs

    return int(any(is_cue(tok) for tok in nlp(text)))

In [4]:
def extract_intention(text):
    """Flag whether ``text`` contains an intention cue.

    Each token of the parsed text is tested, in this order, for being

    1. a ``VERB`` whose lemma is one of the listed volition verbs;
    2. the word ``to`` used as an ``aux`` dependent of a ``VERB``;
    3. one of ``so`` / ``that`` / ``order`` / ``as`` with dependency
       ``mark`` or ``advmod`` -- this counts only when its head is an
       ``advcl`` whose own head is a ``VERB``;
    4. an ``AUX`` with lemma ``will`` or ``shall``.

    A token that enters test 3 is never considered for test 4.

    Args:
        text: The sentence to analyse.

    Returns:
        int: 1 if any token passes, otherwise 0.
    """
    volition_verbs = ("want", "need", "desire", "intend", "wish", "plan", "aim", "decide", "hope")
    purpose_markers = ("so", "that", "order", "as")

    def shows_intent(tok):
        if tok.pos_ == "VERB" and tok.lemma_ in volition_verbs:
            return True
        if tok.text == "to" and tok.head.pos_ == "VERB" and tok.dep_ == "aux":
            return True
        if tok.text in purpose_markers and tok.dep_ in ("mark", "advmod"):
            clause = tok.head
            return clause.dep_ == "advcl" and clause.head.pos_ == "VERB"
        return tok.pos_ == "AUX" and tok.lemma_ in ("will", "shall")

    for tok in nlp(text):
        if shows_intent(tok):
            return 1
    return 0

In [5]:
def extract_result(text):
    """Flag whether ``text`` contains a result cue.

    The text is parsed with the module-level ``nlp`` pipeline. It is flagged
    when any of the following holds:

    * the raw text contains the phrase "as a result" (case-insensitive);
    * a token with lemma ``have`` is an ``aux`` of a past participle
      (head tag ``VBN``);
    * a ``VERB`` has a child tagged ``ADJ``, ``PART`` or ``PROPN``;
    * an ``ADV`` with lemma so/therefore/thus/hence either has dependency
      ``cc`` or hangs off a ``conj`` head;
    * an ``SCONJ`` has lemma after/because/since/as.

    Args:
        text: The sentence to analyse.

    Returns:
        int: 1 if a cue is found, otherwise 0.
    """
    doc = nlp(text)

    # This is a plain string test, so it runs once rather than once per token.
    # Lower-casing makes a sentence-initial "As a result" count too; the
    # previous case-sensitive comparison missed it.
    if "as a result" in text.lower():
        return 1

    for token in doc:
        if token.lemma_ in ("have", "has", "had") and token.dep_ == "aux" and token.head.tag_ == "VBN":
            return 1

        # NOTE(review): the earlier code also listed dependency labels for
        # PROPN children, but OR-ed them with `child.head == token`, which is
        # true for every child. Any PROPN child therefore qualifies; that
        # behaviour is kept here. Confirm whether the label filter was meant
        # to be enforced.
        if token.pos_ == "VERB" and any(
            child.pos_ in ("ADJ", "PART", "PROPN") for child in token.children
        ):
            return 1

        if token.pos_ == "ADV" and token.lemma_ in ("so", "therefore", "thus", "hence"):
            if token.dep_ == "cc" or token.head.dep_ == "conj":
                return 1
        elif token.pos_ == "SCONJ" and token.lemma_ in ("after", "because", "since", "as"):
            return 1

    return 0

In [6]:
def extract_manner(text):
    """Flag whether ``text`` contains a manner cue.

    A cue is either

    * a token tagged ``ADV`` with dependency ``advmod``, or
    * an ``ADP`` with dependency ``prep`` that has a ``pobj`` / ``obj`` child.

    The earlier version ran a third pass looking for an ``ADV``/``advmod``
    token whose head is also an ``ADV``. That pass could never add a match,
    because any such token already satisfies the first test, so it has been
    removed. Results are unchanged.

    Args:
        text: The sentence to analyse.

    Returns:
        int: 1 if a cue is found, otherwise 0.
    """
    for token in nlp(text):
        if token.pos_ == "ADV" and token.dep_ == "advmod":
            return 1

        if token.pos_ == "ADP" and token.dep_ == "prep":
            if any(child.dep_ in ("pobj", "obj") for child in token.children):
                return 1

    return 0

In [7]:
def extract_aspect(text):
    """Flag whether ``text`` contains an aspect cue.

    A token counts as a cue when it is

    * a ``VERB`` with lemma start/finish/continue/begin/stop,
    * an ``ADV`` with lemma already/yet/still,
    * an ``AUX`` with lemma will/have/be,
    * an ``aux`` dependent with a "have" lemma whose head is tagged ``VBN``, or
    * an ``aux`` dependent with lemma ``be`` whose head tag starts with ``VBG``.

    Args:
        text: The sentence to analyse.

    Returns:
        int: 1 if any token is a cue, otherwise 0.
    """
    phase_verbs = ("start", "finish", "continue", "begin", "stop")
    aspect_adverbs = ("already", "yet", "still")
    aspect_auxiliaries = ("will", "have", "be")
    have_forms = ("have", "has", "had")

    for tok in nlp(text):
        lemma = tok.lemma_
        pos = tok.pos_

        # Tests that depend only on the token's own tag and lemma.
        lexical_hit = (
            (pos == "VERB" and lemma in phase_verbs)
            or (pos == "ADV" and lemma in aspect_adverbs)
            or (pos == "AUX" and lemma in aspect_auxiliaries)
        )
        if lexical_hit:
            return 1

        # Tests that look at the verb form governed by an auxiliary dependent.
        if tok.dep_ == "aux":
            governed_tag = tok.head.tag_
            if lemma in have_forms and governed_tag == "VBN":
                return 1
            if lemma == "be" and governed_tag.startswith("VBG"):
                return 1

    return 0

In [8]:
def extract_status(text):
    """Flag whether ``text`` contains a negation cue.

    The text is flagged when

    * it contains one of the listed multi-word negations (matched on the
      lower-cased raw text, so "No longer ..." at the start of a sentence
      is recognised),
    * any token's lemma is not/never/no, or
    * any token carries the ``neg`` dependency label.

    Args:
        text: The sentence to analyse.

    Returns:
        int: 1 if a negation cue is found, otherwise 0.
    """
    doc = nlp(text)

    multi_word_negations = (
        "no longer", "not at all", "never again", "not really", "not yet", "not sure", "don't know"
    )

    # Case-insensitive substring test; the phrases themselves are lower-case.
    lowered = text.lower()
    if any(phrase in lowered for phrase in multi_word_negations):
        return 1

    for token in doc:
        # A bare `neg` label is sufficient. The former extra test that also
        # required an AUX/VERB head was a strict subset of this one.
        if token.dep_ == "neg" or token.lemma_.lower() in ("not", "never", "no"):
            return 1

    return 0

In [9]:
def extract_appearance(text):
    """Flag whether ``text`` contains an appearance/change cue.

    A token counts as a cue when it is

    * tagged ``CCONJ``,
    * a ``PRON`` with lemma ``which`` or ``that``, or
    * a ``VERB`` with lemma become/turn/transform/change.

    The earlier version ran a further pass for a ``neg`` token attached to
    one of those change verbs. That pass could never fire on its own, because
    the verb it points at is itself a token of the same doc and already
    satisfies the third test. It has been dropped; results are unchanged.

    Args:
        text: The sentence to analyse.

    Returns:
        int: 1 if any token is a cue, otherwise 0.
    """
    change_verbs = ("become", "turn", "transform", "change")

    for token in nlp(text):
        if token.pos_ == "CCONJ":
            return 1
        if token.pos_ == "PRON" and token.lemma_ in ("which", "that"):
            return 1
        if token.pos_ == "VERB" and token.lemma_ in change_verbs:
            return 1

    return 0

In [10]:
def extract_knowledge(text):
    """Flag whether ``text`` contains a knowledge cue.

    A token counts as a cue when it is

    * a ``VERB`` whose lemma is in ``knowledge_verbs``, or
    * the word "that" (any casing) with dependency ``mark``.

    The earlier version also tested for a ``dobj`` whose head is a knowledge
    verb. That head is a token of the same doc and already passes the first
    test, so the extra branch could not change the result and was removed.

    Args:
        text: The sentence to analyse.

    Returns:
        int: 1 if any token is a cue, otherwise 0.
    """
    knowledge_verbs = {"know", "realize", "remember", "learn", "recognize", "understand", "believe", "think", "see", "hear", "feel", "notice", "say", "tell", "inform", "report", "observe"}

    for token in nlp(text):
        if token.pos_ == "VERB" and token.lemma_ in knowledge_verbs:
            return 1

        if token.dep_ == "mark" and token.text.lower() == "that":
            return 1

    return 0

In [11]:
def extract_description(text):
    """Flag whether ``text`` contains a reporting (description) verb.

    The text is flagged when any token is a ``VERB`` whose lemma is one of
    say/tell/explain/describe/report/narrate/inform.

    The earlier version followed this test with five more branches (ccomp /
    xcomp / amod / advmod dependents of a reporting verb, quoted NOUN
    children, ROOT-only variants). Each of them required a reporting verb to
    be present, either as the token itself or as its head, and that verb
    already satisfies the test above. None of those branches could alter the
    outcome, so they were removed; results are unchanged.

    Args:
        text: The sentence to analyse.

    Returns:
        int: 1 if a reporting verb is present, otherwise 0.
    """
    reporting_verbs = ("say", "tell", "explain", "describe", "report", "narrate", "inform")

    for token in nlp(text):
        if token.pos_ == "VERB" and token.lemma_ in reporting_verbs:
            return 1

    return 0

In [12]:
def extract_supposition(text):
    """Flag whether ``text`` contains a supposition cue.

    A token counts as a cue when

    * its lemma is will/would/might/may/could/should (any part of speech),
    * it is an ``SCONJ`` with lemma ``if``,
    * it is a ``VERB`` with lemma expect/predict/assume/suppose/anticipate,
    * it is an ``ADV`` with lemma probably/possibly/maybe/likely, or
    * its dependency label is ``aux``, ``advmod`` or ``ccomp``.

    Args:
        text: The sentence to analyse.

    Returns:
        int: 1 if any token is a cue, otherwise 0.
    """
    modal_lemmas = ("will", "would", "might", "may", "could", "should")
    # Lemmas that only count when paired with a specific part of speech.
    lemmas_by_pos = {
        "SCONJ": ("if",),
        "VERB": ("expect", "predict", "assume", "suppose", "anticipate"),
        "ADV": ("probably", "possibly", "maybe", "likely"),
    }
    broad_dependencies = ("aux", "advmod", "ccomp")

    def is_cue(tok):
        lemma = tok.lemma_
        if lemma in modal_lemmas:
            return True
        if lemma in lemmas_by_pos.get(tok.pos_, ()):
            return True
        return tok.dep_ in broad_dependencies

    return 1 if any(is_cue(tok) for tok in nlp(text)) else 0

In [13]:
def extract_subjectivation(text):
    """Flag whether ``text`` contains a subjectivation cue.

    A token counts as a cue when it is

    * a ``PRON`` whose lower-cased lemma is i/you/he/she/it/we/they,
    * a ``VERB`` with lemma think/believe/feel/perceive/consider,
    * an ``ADJ`` with dependency ``ccomp``,
    * an ``ADJ`` with dependency ``amod`` whose head is a ``PRON``, or
    * any token with dependency ``nsubj`` or ``csubj``.

    Two pieces of the earlier version were removed without changing results.
    The opinion-verb branch set the flag for every match but only left the
    loop for ROOT verbs, which made the ROOT test look significant when it
    was not. A separate branch for a VBZ ROOT verb with a pronoun ``nsubj``
    child was already covered by the generic ``nsubj`` test.

    Args:
        text: The sentence to analyse.

    Returns:
        int: 1 if any token is a cue, otherwise 0.
    """
    personal_pronouns = ("i", "you", "he", "she", "it", "we", "they")
    opinion_verbs = ("think", "believe", "feel", "perceive", "consider")

    for token in nlp(text):
        pos = token.pos_

        if pos == "PRON" and token.lemma_.lower() in personal_pronouns:
            return 1

        if pos == "VERB" and token.lemma_ in opinion_verbs:
            return 1

        if pos == "ADJ":
            if token.dep_ == "ccomp":
                return 1
            if token.dep_ == "amod" and token.head.pos_ == "PRON":
                return 1

        if token.dep_ in ("nsubj", "csubj"):
            return 1

    return 0

In [14]:
def extract_attitude(text):
    """Flag whether ``text`` contains an attitude cue.

    A token counts as a cue when it is

    * a ``VERB`` whose lemma is in ``emotion_verbs``,
    * an ``ADJ`` whose lemma is in ``emotion_adjectives``,
    * a ``VERB`` with lemma ``see`` or ``hear`` whose head is an ``ADJ``,
    * tagged ``INTJ``, or
    * any token with dependency ``nsubj``, ``amod`` or ``advmod``.

    Branches of the earlier version that could not change the result were
    removed: a second copy of the emotion-verb test, an ``advmod``-of-emotion-
    verb test (covered by both the emotion-verb and the generic ``advmod``
    tests), an ``amod``-under-subject test (covered by the generic ``amod``
    test), and "feel" in the perception list (already an emotion verb).

    Args:
        text: The sentence to analyse.

    Returns:
        int: 1 if any token is a cue, otherwise 0.
    """
    emotion_verbs = {
        "feel", "love", "hate", "enjoy", "fear", "worry",
        "regret", "like", "dislike", "admire", "appreciate",
        "resent", "cherish", "despise", "adore", "savor",
        "lament", "yearn", "long", "speak", "disappoint"}

    emotion_adjectives = {
        "happy", "sad", "angry", "excited", "anxious",
        "disappointed", "elated", "frustrated", "content",
        "nervous", "guilty", "hopeful", "relieved",
        "pleased", "joyful", "upset", "bored",
        "embarrassed", "pessimistic", "optimistic",
        "euphoric", "distraught", "jubilant",
        "melancholic", "overjoyed"
    }

    for token in nlp(text):
        pos = token.pos_

        if pos == "INTJ":
            return 1

        if pos == "VERB":
            if token.lemma_ in emotion_verbs:
                return 1
            # Perception verb attached to an adjective.
            if token.lemma_ in ("see", "hear") and token.head.pos_ == "ADJ":
                return 1

        if pos == "ADJ" and token.lemma_ in emotion_adjectives:
            return 1

        # Very broad: any subject or adjectival/adverbial modifier counts.
        if token.dep_ in ("nsubj", "amod", "advmod"):
            return 1

    return 0

In [15]:
def extract_comparative(text):
    """Flag whether ``text`` contains a comparative or superlative cue.

    The text is flagged when

    * any token carries a comparative or superlative fine-grained tag
      (``JJR``, ``RBR``, ``JJS``, ``RBS``), or
    * one of the listed marker words or phrases occurs as a run of whole,
      lower-cased tokens (so multi-word entries such as "compared to" or
      "instead of" can match).

    Changes from the earlier version:

    * It tested ``lemma_.endswith("er")`` / ``endswith("est")``. The lemma of
      a comparative is normally its base form ("bigger" -> "big"), so that
      test missed real comparatives and fired on unrelated words whose
      lemma happens to end that way ("never", "other", "honest"). The
      fine-grained tag is used instead.
    * Multi-word phrases were compared with a single token's text and could
      never match; they are now matched against the token sequence.
    * A final branch inspecting the head of amod/advmod tokens was dropped,
      since that head is itself a token that gets checked directly.

    Args:
        text: The sentence to analyse.

    Returns:
        int: 1 if a cue is found, otherwise 0.
    """
    doc = nlp(text)

    comparative_phrases = [
        "than", "compared to", "in comparison with", "versus", "in relation to",
        "as opposed to", "more than", "less than", "greater than", "smaller than",
        "better than", "worse than", "superior to", "inferior to", "like", "unlike",
        "rather than", "instead of"
    ]
    comparative_words = {"more", "less", "better", "worse"}
    superlative_words = {"most", "least", "best", "worst"}
    degree_tags = {"JJR", "RBR", "JJS", "RBS"}

    if any(token.tag_ in degree_tags for token in doc):
        return 1

    # Space-pad the lower-cased token sequence so that " phrase " can only
    # match on whole-token boundaries.
    padded = " " + " ".join(token.text.lower() for token in doc) + " "
    markers = list(comparative_phrases) + sorted(comparative_words) + sorted(superlative_words)
    if any((" " + marker + " ") in padded for marker in markers):
        return 1

    return 0

In [16]:
def extract_quantifier(text):
    """Flag whether ``text`` contains a quantifier cue.

    The text is flagged when

    * one of the listed degree / proportion expressions occurs as a run of
      whole, lower-cased tokens ("a lot of", "enough", "majority of", ...),
    * a ``DET`` or ``ADJ`` token has lemma
      all/some/many/few/several/much/little/none,
    * a token is tagged ``NUM``,
    * a token has dependency ``nummod`` or ``det``, or
    * an ``ADV`` token has lemma almost/nearly/approximately/about.

    Changes from the earlier version: degree expressions were compared with
    the joined subtree of a few trigger words, and multi-word proportion
    phrases with a single token's text. In practice entries such as
    "a lot of", "enough" or "majority of" could not match, and matching was
    case-sensitive. They are now matched against the lower-cased token
    sequence.

    Args:
        text: The sentence to analyse.

    Returns:
        int: 1 if a cue is found, otherwise 0.
    """
    doc = nlp(text)

    degree_expressions = ["a lot of", "a little", "enough", "plenty of"]
    proportional_phrases = ["half", "most", "majority of", "part of", "fraction of"]
    quantifier_lemmas = ("all", "some", "many", "few", "several", "much", "little", "none")
    approximators = ("almost", "nearly", "approximately", "about")

    # Space-pad the lower-cased token sequence so that " phrase " can only
    # match on whole-token boundaries (e.g. "half" does not match "halfway").
    padded = " " + " ".join(token.text.lower() for token in doc) + " "
    if any((" " + phrase + " ") in padded for phrase in degree_expressions + proportional_phrases):
        return 1

    for token in doc:
        if token.pos_ in ("DET", "ADJ") and token.lemma_ in quantifier_lemmas:
            return 1

        if token.pos_ == "NUM":
            return 1

        # Note: `det` includes plain articles, so this test is very broad.
        if token.dep_ in ("nummod", "det"):
            return 1

        if token.pos_ == "ADV" and token.lemma_ in approximators:
            return 1

    return 0

In [17]:
def extract_qualification(text):
    """Flag whether ``text`` contains a qualification cue.

    A token counts as a cue when

    * its dependency label is ``amod`` or ``advmod``,
    * its dependency label is ``relcl``, or
    * it is an ``ADJ`` whose fine-grained tag is ``VBN``, ``VBG`` or ``VBP``.

    The earlier version had three further branches (ADJ+amod, ADV+advmod
    under an ADJ, ADJ+amod under a NOUN). Each is a special case of the
    generic amod/advmod test, so they were removed; results are unchanged.

    Args:
        text: The sentence to analyse.

    Returns:
        int: 1 if any token is a cue, otherwise 0.
    """
    for token in nlp(text):
        if token.dep_ in ("amod", "advmod", "relcl"):
            return 1

        # Adjective-tagged token that still carries a verbal fine-grained tag.
        if token.pos_ == "ADJ" and token.tag_ in {"VBN", "VBG", "VBP"}:
            return 1

    return 0

In [18]:
def extract_explanation(text):
    """Flag whether ``text`` contains an explanation cue.

    The text is flagged when

    * one of the explicative phrases ("in other words", "namely") occurs as
      a run of whole, lower-cased tokens,
    * a token has dependency ``acl``, ``relcl`` or ``appos``,
    * a parenthesis token with dependency ``punct`` has a subtree of more
      than one token, or
    * an ``SCONJ`` token has lemma because/since/therefore/so.

    Changes from the earlier version: "in other words" was compared with a
    single token's text and could never match, so phrases are now matched
    against the token sequence. The unused ``span`` locals were removed.

    Args:
        text: The sentence to analyse.

    Returns:
        int: 1 if a cue is found, otherwise 0.
    """
    doc = nlp(text)

    explanatory_conjunctions = ["because", "since", "therefore", "so"]
    explicative_phrases = ["in other words", "namely"]

    # Space-pad the lower-cased token sequence so that " phrase " can only
    # match on whole-token boundaries.
    padded = " " + " ".join(token.text.lower() for token in doc) + " "
    if any((" " + phrase + " ") in padded for phrase in explicative_phrases):
        return 1

    for token in doc:
        if token.dep_ in ("acl", "relcl", "appos"):
            return 1

        if token.dep_ == "punct" and token.text in ["(", ")"]:
            # Only counts when the parenthesis governs at least one other token.
            if len(list(token.subtree)) > 1:
                return 1

        if token.pos_ == "SCONJ" and token.lemma_ in explanatory_conjunctions:
            return 1

    return 0

In [19]:
def create_feature_vector(text):
    """Run every feature extractor on ``text`` and collect the flags.

    The extractors are called one after another with the same ``text``; the
    position of each flag in the returned list follows the order of the
    ``extractors`` tuple below (mode first, explanation last).

    Args:
        text: The sentence to analyse.

    Returns:
        list: Sixteen values, one per extractor, in extractor order.
    """
    extractors = (
        extract_mode,
        extract_intention,
        extract_result,
        extract_manner,
        extract_aspect,
        extract_status,
        extract_appearance,
        extract_knowledge,
        extract_description,
        extract_supposition,
        extract_subjectivation,
        extract_attitude,
        extract_comparative,
        extract_quantifier,
        extract_qualification,
        extract_explanation,
    )
    return [extract(text) for extract in extractors]

In [None]:
# Column labels for the exported table, listed in the same order in which
# create_feature_vector returns its flags.
feature_columns = [
    'mode', 'intention', 'result', 'manner',
    'aspect', 'status', 'appearance', 'knowledge',
    'description', 'supposition', 'subjectivation', 'attitude',
    'comparative', 'quantifier', 'qualification', 'explanation',
]

# Read the input sentences and attach one feature vector (a list) per row.
df = pd.read_csv('../data/sample_dataset_eng.csv')
df['features'] = df['sentence'].apply(create_feature_vector)

# Expand the per-row lists into one column per feature.
features_df = pd.DataFrame(list(df['features']), columns=feature_columns)

print(features_df)

# Persist the feature table without the positional index.
features_df.to_csv('../data/feature_vectors_eng.csv', index=False)