# Feature Selection

* Coleman-Liau score (CLScore)
* RIX and LIX indices
* Formality measure (fmeasure)
* The number of uppercase words, the presence of question marks and exclamation marks in headlines (titles), and the length of the title (number of words) are the most important content features


* The character n-gram features and the word 1-gram feature appear to contribute most to performance
    * Character n-grams are known to capture writing style


* headline: word count
* body: 1. Informality: We compute the frequencies of two informality indicators, namely internet slang and bait words. Additionally, the length of news bodies is also an input feature.


* Sentence length, word length, ratio of stop words to content words

In [1]:
# get the data

import json
import os
import pandas as pd

# https://github.com/ipython/ipython/issues/10123
# Paths are resolved against the current working directory, so the data is
# only found when the notebook runs from a folder that has ../data next to it.
directory_path = os.getcwd()
dataset_no_figures_path = directory_path + '/../data/dataset_no_figures/'

# Maps instance id -> binary label: 0 for 'no-clickbait', 1 for any other truthClass.
is_clickbait = {}

with open(dataset_no_figures_path + 'truth_train.jsonl', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        truth = json.loads(line)
        is_clickbait[truth['id']] = 0 if truth['truthClass'] == 'no-clickbait' else 1

# Collect one single-row frame per instance and concatenate them once at the end.
# Growing the frame inside the loop copies it on every iteration, and
# DataFrame.append no longer exists in pandas >= 2.0.
instance_frames = []

with open(dataset_no_figures_path + 'instances_train.jsonl', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        instance = json.loads(line)
        data = pd.DataFrame(
            {'post_text': instance['postText'], 'is_clickbait': is_clickbait[instance['id']]},
            index=[instance['id']],
        )
        instance_frames.append(data)

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(instance_frames) if instance_frames else pd.DataFrame()

print(df)

                                               post_text  is_clickbait
0      Apple's iOS 9 'App thinning' feature will give...             0
1      RT @kenbrown12: Emerging market investors are ...             0
2      U.S. Soccer should start answering tough quest...             1
3      How theme parks like Disney World left the mid...             0
4      Could light bulbs hurt your health? One compan...             1
5      13 classic ’00s songs that were actually meant...             1
6      Dez Bryant is reportedly considering skipping ...             0
7      Pregnant mother of 12 accused of keeping kids ...             0
8      RT @fionamatthias: 10 ways the expat life Is l...             0
9      House #GOP plans two days of debate, Friday sh...             0
10     Azeri government behind foreign media ban, say...             0
11     Only one in three of us complain when we are u...             0
12     An open letter to Jerry Seinfeld from a "polit...             0
13    

In [2]:
# preprocess the data

# TODO remove newlines from postText? (e.g., \n in 17560)

In [9]:
# get the features

from collections import Counter
from nltk import ngrams
# TODO
# from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize
from string import ascii_lowercase, ascii_uppercase
import nltk


# https://stackoverflow.com/questions/10677020/real-word-count-in-nltk
def number_of_words(text):
    """Return the number of tokens that ``word_tokenize`` produces for ``text``."""
    # TODO: evaluate RegexpTokenizer(r'\w+').tokenize(text) as an alternative
    # tokenizer (see the Stack Overflow link above this function).
    return len(word_tokenize(text))


def character_1_grams(text):
    """Return the character 1-grams of ``text`` as a list, in order of appearance."""
    return list(ngrams(list(text), 1))


def character_2_grams(text):
    """Return the character 2-grams of ``text`` as a list, in order of appearance.

    An empty ``text`` short-circuits to an empty list without calling ``ngrams``.
    """
    if len(text) < 1:
        return []
    return list(ngrams(list(text), 2))


def character_3_grams(text):
    """Return the character 3-grams of ``text`` as a list, in order of appearance.

    Texts of fewer than two characters short-circuit to an empty list without
    calling ``ngrams``.
    """
    if len(text) < 2:
        return []
    return list(ngrams(list(text), 3))


# https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
def clindex(text):
    """Return the Coleman-Liau index ``0.0588 * L - 0.296 * S - 15.8`` of ``text``.

    ``L`` is the number of ASCII letters per 100 tokens and ``S`` the number of
    sentences per 100 tokens, where tokens are counted by ``number_of_words``.
    When the text has no tokens, ``L`` and ``S`` both stay 0, so the result is
    the constant term (-15.8).
    """
    # Only a-z (after lowercasing) count as letters; digits, punctuation and
    # non-ASCII characters are ignored.
    letter_count = sum(1 for ch in text.lower() if ch in ascii_lowercase)
    sentence_count = len(sent_tokenize(text))
    word_count = number_of_words(text)

    # TODO should l and s be 0?
    if word_count == 0:
        letters_per_100 = 0
        sentences_per_100 = 0
    else:
        # L = Letters / Words * 100
        letters_per_100 = letter_count / word_count * 100
        # S = Sentences / Words * 100
        sentences_per_100 = sentence_count / word_count * 100

    return 0.0588 * letters_per_100 - 0.296 * sentences_per_100 - 15.8


# https://stackoverflow.com/questions/10674832/count-verbs-nouns-and-other-parts-of-speech-with-pythons-nltk
def formality_measure(text):
    """Return a formality score from the part-of-speech tag counts of ``text``.

    The lowercased text is tokenized and POS-tagged with NLTK. The score is
    ``(formal - informal + 100) / 2``, where ``formal`` is the number of tokens
    tagged as noun, adjective, preposition or determiner and ``informal`` the
    number tagged as pronoun, verb, adverb or interjection (tag lists below).

    NOTE(review): the counts are raw token counts, not percentages of the
    token total -- confirm this matches the intended formality formula.
    NOTE(review): 'NNPS' is not among the noun tags -- confirm it was left out
    on purpose.
    """
    formal_tags = ('NN', 'NNP', 'NNS', 'JJ', 'JJR', 'JJS', 'IN', 'DT', 'PDT', 'WDT')
    informal_tags = (
        'PRP', 'PRP$', 'WP', 'WP$',
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
        'RB', 'RBR', 'RBS', 'WRB',
        'UH',
    )

    tokens = nltk.word_tokenize(text.lower())
    tagged_tokens = nltk.pos_tag(nltk.Text(tokens))
    tag_counts = Counter(tag for _, tag in tagged_tokens)

    # Counter returns 0 for tags that never occurred.
    formal_total = sum(tag_counts[tag] for tag in formal_tags)
    informal_total = sum(tag_counts[tag] for tag in informal_tags)
    return (formal_total - informal_total + 100) / 2


def is_exclamation_question_mark_present(text):
    """Return 1 if ``text`` contains a '!' or a '?', otherwise 0."""
    return int('!' in text or '?' in text)


def lix(text):
    """Return the LIX readability index of ``text``.

    LIX = words / sentences + 100 * long_words / words, where a long word has
    more than six characters (the same ``len >= 7`` threshold that ``rix``
    uses). Words are the tokens from ``word_tokenize`` and sentences come from
    ``sent_tokenize``.

    Returns 0 when the text has no sentences or no tokens, so neither division
    can fail.
    """
    sentence_count = len(sent_tokenize(text))
    tokens = word_tokenize(text)
    word_count = len(tokens)

    # TODO should we return 0?
    if sentence_count == 0 or word_count == 0:
        return 0

    long_word_count = sum(1 for token in tokens if len(token) >= 7)
    # Average sentence length plus the percentage of long words.
    return word_count / sentence_count + 100 * long_word_count / word_count


def number_of_uppercase_words(text):
    """Count the tokens of ``text`` whose first character is an ASCII uppercase letter.

    NOTE(review): this counts every capitalised token (e.g. "Apple"), not only
    tokens written entirely in capitals -- confirm that is the intended feature.
    """
    return sum(1 for token in word_tokenize(text) if token[0] in ascii_uppercase)


def rix(text):
    """Return the RIX index of ``text``: long tokens (7+ characters) per sentence.

    Returns 0 when ``sent_tokenize`` finds no sentence.
    """
    long_word_count = sum(1 for token in word_tokenize(text) if len(token) >= 7)
    sentence_count = len(sent_tokenize(text))
    # TODO should we return 0?
    if sentence_count == 0:
        return 0
    return long_word_count / sentence_count


def word_1_grams(text):
    """Return the word 1-grams of ``text`` (tokens from ``word_tokenize``) as a list."""
    tokens = word_tokenize(text)
    return list(ngrams(tokens, 1))


# Feature column name -> function that derives the feature from a post text.
# The order here is the order in which the columns are added to df.
feature_extractors = (
    ('character_1_grams', character_1_grams),
    ('character_2_grams', character_2_grams),
    ('character_3_grams', character_3_grams),
    ('clindex', clindex),
    ('formality_measure', formality_measure),
    ('is_exclamation_question_mark_present', is_exclamation_question_mark_present),
    ('lix', lix),
    ('number_of_uppercase_words', number_of_uppercase_words),
    ('number_of_words', number_of_words),
    ('rix', rix),
    ('word_1_grams', word_1_grams),
)

# Compute every feature column-wise over the post texts. This avoids building a
# row Series with df.loc[i] eleven times per row and does not depend on the
# index labels being unique. Numeric features get a numeric dtype this way
# instead of the object dtype of a None-initialised column.
post_texts = df['post_text']
for feature_name, extractor in feature_extractors:
    df[feature_name] = post_texts.apply(extractor)
print(df)

                                               post_text  is_clickbait  \
0      Apple's iOS 9 'App thinning' feature will give...             0   
1      RT @kenbrown12: Emerging market investors are ...             0   
2      U.S. Soccer should start answering tough quest...             1   
3      How theme parks like Disney World left the mid...             0   
4      Could light bulbs hurt your health? One compan...             1   
5      13 classic ’00s songs that were actually meant...             1   
6      Dez Bryant is reportedly considering skipping ...             0   
7      Pregnant mother of 12 accused of keeping kids ...             0   
8      RT @fionamatthias: 10 ways the expat life Is l...             0   
9      House #GOP plans two days of debate, Friday sh...             0   
10     Azeri government behind foreign media ban, say...             0   
11     Only one in three of us complain when we are u...             0   
12     An open letter to Jerry Seinfel

In [4]:
# normalize features

# TODO get features in range of [0,1] or [-1,1]?
# TODO convert all booleans to 0 and 1?
# TODO convert lix to five levels (0-4) based on this: very easy (0-24), easy (25-34), standard (35-44), difficult (45-54) and very difficult (55 and above)
# TODO convert rix to thirteen levels (0-12) based on these twelve thresholds: 0.2, 0.5, 0.8, 1.3, 1.8, 2.4, 3.0, 3.7, 4.5, 5.3, 6.2, 7.2

In [5]:
# create train and test sets

def create_train_test(df, test_ratio=0.2):
    """Split ``df`` into a leading train part and a trailing test part.

    The split is positional and not shuffled: the last
    ``int(len(df) * test_ratio)`` rows become the test set and all rows before
    them the train set.

    Args:
        df: DataFrame that contains an 'is_clickbait' label column.
        test_ratio: fraction of rows to put in the test set (default 0.2).

    Returns:
        ``(train_set, test_set)``. ``train_set`` keeps every column;
        ``test_set`` has the 'is_clickbait' column removed.
    """
    test_set_size = int(len(df) * test_ratio)
    split_at = len(df) - test_set_size

    train_set = df.iloc[:split_at]
    # Use the keyword form: a positional axis argument is rejected by pandas >= 2.0.
    test_set = df.iloc[split_at:].drop(columns='is_clickbait')

    return train_set, test_set

# Split the feature frame: train_set keeps the 'is_clickbait' labels, test_set has them dropped.
train_set, test_set = create_train_test(df)

In [6]:
# create error functions

In [7]:
# test models

# TODO try https://xgboost.ai