# LIWC Test
*(Always run this section first: it sets up the environment the rest of the notebook depends on.)*

In [7]:
# Edit the LIWC dictionary in place (-i), deleting every line that contains '<' or '('.
# The path is relative, so it resolves against the notebook's working directory.
# NOTE(review): presumably these are entries the liwc parser cannot handle — confirm.
!sed -i -e '/[<(]/d' dictionaries/LIWC07-EN.dic

sed: can't read /dictionaries/LIWC07-EN.dic: No such file or directory


In [6]:
import liwc
# Load the LIWC dictionary once for the whole notebook.
# `parse(token)` is iterated to get the LIWC categories of a single token;
# `category_names` is iterated to get the category names used for zero-filling counts.
parse, category_names = liwc.load_token_parser('dictionaries/LIWC07-EN.dic')


In [13]:
import re
from collections import Counter

def tokenize(text):
    """Yield the word tokens of ``text`` one at a time, in order of appearance.

    A token is a maximal run of word characters; anything else acts as a
    separator. This is deliberately simple; a smarter tokenizer could be
    swapped in here.
    """
    word_pattern = re.compile(r'\w+', re.UNICODE)
    yield from (found.group(0) for found in word_pattern.finditer(text))

def count_categories(text):
    """Tally LIWC category hits for ``text``.

    The text is lower-cased and tokenized; each token is looked up with
    ``parse`` and every category it yields is counted. Categories listed in
    ``category_names`` that never occur are still present, with a count of 0.

    Returns a ``Counter`` keyed by category name.
    """
    tally = Counter()
    for word in tokenize(text.lower()):
        for hit in parse(word):
            tally[hit] += 1

    # Zero-fill the categories the text never touched so every result
    # carries the same set of keys.
    for name in category_names:
        tally.setdefault(name, 0)

    return tally

# NOTE: despite its name, this variable holds a one-word sample, not the Gettysburg Address.
gettysburg = "Hello"

print(count_categories(gettysburg))
#=> Counter({'social': 1, 'funct': 0, 'pronoun': 0, ...})

Counter({'social': 1, 'funct': 0, 'pronoun': 0, 'ppron': 0, 'i': 0, 'we': 0, 'you': 0, 'shehe': 0, 'they': 0, 'ipron': 0, 'article': 0, 'verb': 0, 'auxverb': 0, 'past': 0, 'present': 0, 'future': 0, 'adverb': 0, 'preps': 0, 'conj': 0, 'negate': 0, 'quant': 0, 'number': 0, 'swear': 0, 'family': 0, 'friend': 0, 'humans': 0, 'affect': 0, 'posemo': 0, 'negemo': 0, 'anx': 0, 'anger': 0, 'sad': 0, 'cogmech': 0, 'insight': 0, 'cause': 0, 'discrep': 0, 'tentat': 0, 'certain': 0, 'inhib': 0, 'incl': 0, 'excl': 0, 'percept': 0, 'see': 0, 'hear': 0, 'feel': 0, 'bio': 0, 'body': 0, 'health': 0, 'sexual': 0, 'ingest': 0, 'relativ': 0, 'motion': 0, 'space': 0, 'time': 0, 'work': 0, 'achieve': 0, 'leisure': 0, 'home': 0, 'money': 0, 'relig': 0, 'death': 0, 'assent': 0, 'nonfl': 0, 'filler': 0})


# Text Analysis
Go through each entry in the text, and for each entry, add the following information:
- The publish utc timestamp of the comment/post
- The counter of the dimensions

The result is a pandas DataFrame whose first column is the UTC timestamp and whose second column is a `Counter` of the LIWC categories for that entry.

E.g.: ```1000 | Counter({'social': 1, ...})```

The DataFrame is then saved to a CSV file: the first column is the UTC timestamp and the second column is the category counter.

In [22]:
# Set the Reddit Data File Path (JSON)
data_file_path = "data/texas_comments.json"


import tqdm
import pandas as pd
import json
import os

# Count the lines up front so the progress bar knows its total.
with open(data_file_path) as f:
    num_lines = sum(1 for line in f)

# Read the data file (one JSON object per line) and collect one
# [utc_timestamp, LIWC Counter] record per entry. The records are gathered in
# a plain list and turned into a DataFrame once at the end: concatenating
# inside the loop copies the whole frame on every iteration (quadratic time)
# and leaves every row with index 0.
records = []
with open(data_file_path) as f, tqdm.tqdm(total=num_lines) as pbar:
    # The `with` block closes the progress bar even if a line fails to parse.
    for line in f:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        if line.strip():
            data = json.loads(line)
            records.append([data['created_utc'], count_categories(data['body'])])
        pbar.update(1)

df = pd.DataFrame(records, columns=['utc_timestamp', 'LIWC_categories'])

# Save the dataframe to a CSV file named after the input file.
# splitext drops the extension whatever its length (the old `[:-5]` slice only
# worked for ".json"), and the output folder is created if it is missing.
os.makedirs('text_liwc_dimensions', exist_ok=True)
output_stem = os.path.splitext(os.path.basename(data_file_path))[0]
df.to_csv(f'text_liwc_dimensions/{output_stem}_LIWC.csv', index=False)

df.head()
