In [16]:
import pandas as pd
import os
import numpy as np

import re
import pprint

import nltk

# NLTK resources needed by this notebook:
#   vader_lexicon              - used by SentimentIntensityAnalyzer
#   punkt                      - used by nltk.sent_tokenize / nltk.word_tokenize
#   averaged_perceptron_tagger - used by nltk.pos_tag
# 'punkt' was previously not downloaded here, so the tokenizer calls in later
# cells only worked on machines where it happened to be installed already.
# NOTE(review): recent NLTK releases may look up 'punkt_tab' /
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
nltk.download('vader_lexicon')

# For sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Sentence / word tokenizers
nltk.download('punkt')

# Parts of speech
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Load aita_processed.csv into the data frame that every later cell reads and extends
df = pd.read_csv("aita_processed.csv")


In [3]:
# Report how many rows have no 'text' (87 when this was last run).
# The count is printed explicitly: a bare expression that is not the last line
# of a cell is evaluated and silently discarded.
n_missing_text = df['text'].isna().sum()
print("Rows with missing 'text':", n_missing_text)

# Have to drop them for the feature extraction below to run successfully
df = df.dropna(subset=["text"])

In [5]:
# These are the features from https://www.aclweb.org/anthology/W18-1110.pdf

# In general, every function starting with 'get' is applied to the data frame; the others are helpers.

In [6]:
# Get average sentence length as a feature
def calcSentenceLength(text):
    """Return the average number of word tokens per sentence in ``text``.

    The text is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into tokens with ``nltk.word_tokenize``.

    Args:
        text: The string to measure.

    Returns:
        Total token count divided by the number of sentences, or ``0.0`` when
        the tokenizer finds no sentences (e.g. empty text). Previously that
        case raised ``ZeroDivisionError``.
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        # Nothing to average over; avoid dividing by zero
        return 0.0
    total_tokens = sum(len(nltk.word_tokenize(sentence)) for sentence in sentences)
    return total_tokens / len(sentences)

def getSentLength(df):
    """Add a 'sentence_length' column to ``df`` (in place).

    Each value is ``calcSentenceLength`` applied to that row's 'text'.
    """
    avg_lengths = df['text'].apply(calcSentenceLength)
    df['sentence_length'] = avg_lengths

In [13]:
# Negation as a feature
# A whitespace-separated token counts as a negation when it is the word
# "no" or "not" (any letter case, optionally wrapped in punctuation such as
# "not," or "(No)"), or when it contains the contraction "n't" written with a
# straight or a curly apostrophe (e.g. "don't", "Can’t").
_NEGATION_RE = re.compile(r"^\W*(?:no|not)\W*$|n['\u2019]t", re.IGNORECASE)


def calcNegationCount(text):
    """Count the negation tokens in ``text``.

    The text is split on whitespace and every token is tested against
    ``_NEGATION_RE``; a token is counted at most once.

    Compared with the earlier case-sensitive, unpunctuated match, this also
    counts capitalised negations ("Not", "NO"), negations followed or preceded
    by punctuation ("no,", "not."), and upper-case or curly-apostrophe
    contractions ("DON'T", "can’t").

    Args:
        text: The string to scan.

    Returns:
        The number of negation tokens (0 for empty text).
    """
    return sum(1 for token in text.split() if _NEGATION_RE.search(token))

def getNegationCount(df):
    """Add a 'negations' column to ``df`` (in place).

    Each value is ``calcNegationCount`` applied to that row's 'text'.
    """
    negation_counts = df['text'].apply(calcNegationCount)
    df['negations'] = negation_counts


In [8]:
# Question words as a feature - as per Durmus et al
# “why”, “when”, “how”, “what”, “who”, “whose”, “whom”, “where”, “whether”
def getQuestionWords(df):
    """Add a 'question_words' column to ``df`` (in place).

    Each value is the number of question words found in that row's 'text'.
    Matching is on whole words and ignores letter case, so "Why" at the start
    of a sentence is counted, while words that merely contain a question word
    ("however", "show", "somewhat", "whole") are not.
    """
    question_word = re.compile(
        r'\b(?:where|why|when|how|what|who|whose|whom|whether)\b',
        re.IGNORECASE,
    )
    df['question_words'] = df['text'].apply(lambda text: len(question_word.findall(text)))
    

In [9]:
# POS tag types
# Proper nouns
# Cardinals
# Existential there
# Personal pronouns - this was listed separately in the paper

# list of tags: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

# Get POS given some text
def addPosTags(text):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return the result of
    ``nltk.pos_tag`` on those tokens."""
    return nltk.pos_tag(nltk.word_tokenize(text))

# Counts parts of speech in text
def findPOSCount(text, pos_strings):
    """Return how many tagged tokens of ``text`` carry a tag in ``pos_strings``.

    ``text`` is tagged with ``addPosTags``; the second element of each tagged
    item is compared against ``pos_strings``.
    """
    tagged_tokens = addPosTags(text)
    matching = [item for item in tagged_tokens if item[1] in pos_strings]
    return len(matching)

def getPOSCounts(df):
    """Add part-of-speech count columns to ``df`` (in place).

    Columns added, each counting the tokens of the row's 'text' whose tag is
    in the listed set:
        proper_nouns       - 'NNP', 'NNPS'
        cardinals          - 'CD'
        existential_there  - 'EX'
        personal_pronouns  - 'PRP'

    Every text is tokenized and tagged once (via ``addPosTags``) and all four
    counts are taken from that single tagging, instead of re-tagging the same
    text once per column.
    """
    # (column name, tags counted for that column), in the order the columns are added
    tag_groups = [
        ('proper_nouns', ('NNP', 'NNPS')),
        ('cardinals', ('CD',)),
        ('existential_there', ('EX',)),
        ('personal_pronouns', ('PRP',)),
    ]

    def countTagGroups(text):
        # One tagging pass per text; returns one count per entry of tag_groups
        tags = [item[1] for item in addPosTags(text)]
        return [sum(1 for tag in tags if tag in group) for _column, group in tag_groups]

    counts = df['text'].apply(countTagGroups)

    # Adding counts of tags to the data frame
    for position, (column, _group) in enumerate(tag_groups):
        df[column] = counts.apply(lambda row, position=position: row[position])

In [10]:
# negative,positive and ambiguous emotions.
def getSentiment(df):
    """Add sentiment columns to ``df`` (in place), printing progress as it goes.

    Columns added:
        scores   - the dictionary returned by ``sid.polarity_scores`` for the row's 'text'
        compound - the 'compound' entry of that dictionary
        pos_neg  - 'pos' when compound >= 0, otherwise 'neg'
    """
    def pickCompound(score_dict):
        return score_dict['compound']

    def labelPolarity(compound):
        if compound >= 0:
            return 'pos'
        return 'neg'

    # Dictionary with all scores
    print("Getting sentiment scores...")
    df['scores'] = df['text'].apply(sid.polarity_scores)

    # Compound score
    print("Getting compound scores...")
    df['compound'] = df['scores'].apply(pickCompound)

    # Add pos/neg labels
    print("Getting positive/negative labels...")
    df['pos_neg'] = df['compound'].apply(labelPolarity)

In [14]:
# Extract the features
print(type(df))

# Each step prints its progress message and then adds its columns to df in place
_feature_steps = [
    ("Extracting Sentiment", getSentiment),
    ("Extracting POS counts", getPOSCounts),
    ("Extracting Question words", getQuestionWords),
    ("Extracting Negations", getNegationCount),
    ("Extracting average sentence length", getSentLength),
]
for _message, _extract in _feature_steps:
    print(_message)
    _extract(df)

<class 'pandas.core.frame.DataFrame'>
Extracting Sentiment
Getting sentiment scores...
Getting compound scores...
Getting positive/negative labels...
Extracting POS counts
Extracting Question words
Extracting Negations
Extracting average sentence length


In [24]:
# Spot-check the first row, which should now include the feature columns added above
print(df.iloc[0])

Unnamed: 0                                                           1
id                                                              1yu29c
timestamp                                                  1.39328e+09
title                             [AITA] Threw my parent's donuts away
body                 My parents are diabetic, morbidly obese, and a...
edited                                                    1393290576.0
verdict                                                        asshole
score                                                              140
num_comments                                                        27
is_asshole                                                           1
text                 [AITA] Threw my parent's donuts away My parent...
age                                                                NaN
gender                                                             NaN
scores               {'neg': 0.05, 'neu': 0.861, 'pos': 0.089, 'com...
compou

In [None]:
# Write the data frame, including all the feature columns added above, to a new CSV.
# index=False keeps the pandas row index out of the file; otherwise it is read
# back as an extra "Unnamed: 0" column (the loaded data already carries one).
df.to_csv("aita_features.csv", index=False)