In [None]:
import pandas as pd
import os
import numpy as np

import re

import nltk
# Fetch the NLTK data this notebook relies on: the VADER lexicon for sentiment
# scoring, the Punkt models behind sent_tokenize/word_tokenize, and the
# perceptron tagger behind pos_tag. Both the classic and the newer
# ("_tab" / "_eng") resource names are requested because the expected name
# depends on the installed NLTK version; an id the installed version does not
# know is reported by nltk.download() and otherwise ignored.
for _resource in (
    'vader_lexicon',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_resource)

# For sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [None]:
# Input CSV, resolved relative to the notebook's working directory.
DATA_PATH = "aita_clean.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# These are the features from https://www.aclweb.org/anthology/W18-1110.pdf

# In general, every function whose name starts with 'get' is applied to the data frame; all other functions are helpers.

In [None]:
# Get average sentence length as a feature
def calcSentenceLength(text):
    """Return the mean number of word tokens per sentence in ``text``.

    Sentences are produced by ``nltk.sent_tokenize`` and each sentence is
    split into tokens with ``nltk.word_tokenize``.

    Args:
        text: The string to analyse.

    Returns:
        Total token count divided by the number of sentences, or ``0.0`` when
        no sentence is found (e.g. empty text), so that such rows do not raise
        ``ZeroDivisionError``.
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        # Nothing to average over; avoid dividing by zero.
        return 0.0
    token_total = sum(len(nltk.word_tokenize(sentence)) for sentence in sentences)
    return token_total / len(sentences)

def getSentLength(df):
    """Add a ``sentence_length`` column computed from each row's ``text``.

    Every value of ``df['text']`` is passed to ``calcSentenceLength``. The
    frame is modified in place and nothing is returned.
    """
    texts = df['text']
    df['sentence_length'] = texts.apply(calcSentenceLength)

In [None]:
# Negation as a feature
def calcNegationCount(text):
    """Count the negation tokens in ``text``.

    The text is split on whitespace. A token counts as a negation when it is
    exactly "no" or "not", or when it contains "n't" (e.g. "can't"). Matching
    is case-insensitive so that sentence-initial "No"/"Not" are counted too.

    Note: because of the ``^...$`` anchors, "no"/"not" with punctuation
    attached (e.g. "no,") are not counted.

    Args:
        text: The string to analyse.

    Returns:
        The number of whitespace-separated tokens that match.
    """
    # The flag is re.VERBOSE; the lowercase spelling `re.verbose` does not
    # exist and raised AttributeError on every call.
    negation = re.compile(r"""(?:^(?:no|not)$)|n't""", re.VERBOSE | re.IGNORECASE)
    return sum(1 for word in text.split() if negation.search(word))

def getNegationCount(df):
    """Add a ``negations`` column computed from each row's ``text``.

    Every value of ``df['text']`` is passed to ``calcNegationCount``. The
    frame is modified in place and nothing is returned.
    """
    texts = df['text']
    df['negations'] = texts.apply(calcNegationCount)


In [None]:
# Question words as a feature - as per Durmus et al:
# "why", "when", "how", "what", "who", "whose", "whom", "where", "whether"
def getQuestionWords(df):
    """Add a ``question_words`` column counting question words in ``text``.

    Only whole words are counted (``\\b`` boundaries), so "somehow", "nowhere"
    or "whoever" do not contribute. Matching is case-insensitive so that
    sentence-initial "Why"/"Who" are included.

    The frame is modified in place and nothing is returned.
    """
    question_word = re.compile(
        r'\b(?:where|why|when|how|what|who|whose|whom|whether)\b',
        re.IGNORECASE,
    )
    df['question_words'] = df['text'].apply(
        lambda text: len(question_word.findall(text))
    )
    

In [None]:
# POS tag types
# Proper nouns
# Cardinals
# Existential there
# Personal pronouns - this was listed separately in the paper

# list of tags: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

# Get POS given some text
def addPosTags(text):
    """Tokenize ``text`` and tag each token with its part of speech.

    Args:
        text: The string to analyse.

    Returns:
        The result of ``nltk.pos_tag`` on the tokens: a sequence of
        (token, tag) pairs.
    """
    # word_tokenize is reached through the nltk module; the bare name was never
    # imported in this notebook and raised NameError.
    tokens = nltk.word_tokenize(text)
    return nltk.pos_tag(tokens)

# Counts parts of speech in text
def findPOSCount(text, pos_strings):
    """Count the tokens in ``text`` whose POS tag is listed in ``pos_strings``.

    Args:
        text: The string to analyse; it is tagged with ``addPosTags``.
        pos_strings: Collection of tag names to count (e.g. ``['NNP', 'NNPS']``).

    Returns:
        Number of tagged tokens whose tag is a member of ``pos_strings``.
    """
    # Membership test is `in`; the original `is in` was a syntax error.
    return sum(1 for _, tag in addPosTags(text) if tag in pos_strings)

def getPOSCounts(df):
    """Add one count column per POS tag group, computed from ``text``.

    Columns written (frame is modified in place, nothing is returned):
        proper_nouns       - tags NNP, NNPS
        cardinals          - tag CD
        existential_there  - tag EX
        personal_pronouns  - tag PRP

    Each column is produced by a separate ``findPOSCount`` pass over the text.
    """
    # Output column -> tags to search for
    tag_groups = {
        'proper_nouns': ['NNP', 'NNPS'],
        'cardinals': ['CD'],
        'existential_there': ['EX'],
        'personal_pronouns': ['PRP'],
    }
    for column, tags in tag_groups.items():
        # `tags=tags` binds the current group to this lambda. The helper is
        # spelled findPOSCount; the original call used findPosCount, which is
        # undefined.
        df[column] = df['text'].apply(lambda text, tags=tags: findPOSCount(text, tags))

In [None]:
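# Negative, positive and ambiguous emotions (currently only a pos/neg label derived from the VADER compound score; there is no 'ambiguous' class yet).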
def getSentiment(df):
    """Add sentiment columns derived from each row's ``text``.

    Columns written (frame is modified in place, nothing is returned):
        scores   - the dictionary returned by ``sid.polarity_scores``
        compound - the ``'compound'`` entry of that dictionary
        pos_neg  - ``'pos'`` when compound >= 0, otherwise ``'neg'``
    """
    def compound_of(score_dict):
        return score_dict['compound']

    def label_for(compound):
        # A compound score of exactly zero is labelled positive.
        if compound >= 0:
            return 'pos'
        return 'neg'

    df['scores'] = df['text'].apply(sid.polarity_scores)
    df['compound'] = df['scores'].apply(compound_of)
    df['pos_neg'] = df['compound'].apply(label_for)

In [None]:
# Named entity mentions
# TODO