You are provided with starter code that loads the data, extracts a set of features, and then trains a Naïve Bayes classifier using those features and outputs the classifier accuracy.  Your job is to extract additional feature sets using NLTK and report the classifier performance for each set of features.

In [48]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import urllib.request
from nltk.corpus import stopwords
import nltk
import re
import numpy as np
from collections import Counter
from sklearn import preprocessing
import string

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [49]:
# load the data
non_clickbait_url = "http://www.cs.columbia.edu/~sarahita/CL/non_clickbait_data.txt"
clickbait_url = "http://www.cs.columbia.edu/~sarahita/CL/clickbait_data.txt"

# read url .txt file into string "data"
def get_data(url):
  data = urllib.request.urlopen(url).read().decode('utf-8')
  return data

non_clickbait_data = get_data(non_clickbait_url)
clickbait_data = get_data(clickbait_url)

In [51]:
# combine clickbait and non-clickbait data in a single list
non_clickbait_headlines = non_clickbait_data.rstrip('\n').split('\n')
clickbait_headlines = clickbait_data.rstrip('\n').split('\n')
all_headlines = non_clickbait_headlines + clickbait_headlines

In [52]:
# create a list of corresponding labels
non_cb_labels = [0] * len(non_clickbait_headlines)
cb_labels = [1] * len(clickbait_headlines)
all_labels = non_cb_labels + cb_labels

1: Stop words: counts for each function word (from the NLTK stopwords list)

In [53]:
def stop_words(texts):
        bow = []
        eng_stopwords = stopwords.words('english')
        for text in texts:      
                counts = []
                tokens = nltk.word_tokenize(text.lower())
                for sw in eng_stopwords:
                        sw_count = tokens.count(sw)
                        counts.append(sw_count)
                bow.append(counts)
        bow_np = np.array(bow).astype(float)
        return bow_np

stop_words_features = stop_words(all_headlines)
print(stop_words_features)

# convert features and labels to numpy arrays
X = stop_words_features
Y = np.array(all_labels)

# run classifier using 10-fold cross validation
# report mean accuracy 

scores = cross_val_score(MultinomialNB(), X, Y, scoring='accuracy', cv=10)
print(scores.mean())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [2. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
0.8735535323538606


2: Syntactic: counts for the following 10 common POS tags -- ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS','CC','PRP','VB','VBG']


In [54]:
def syntactic(texts):
        bow = []
        POS = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS','CC','PRP','VB','VBG']
        for text in texts:  
                counts = Counter({})
                current_counts = []
                tokens = nltk.word_tokenize(text.lower())
                for tag in POS:
                        tag_count = []
                        pos = nltk.pos_tag(tokens)
                        v = 0
                        counts = Counter(tag for _, tag in pos)
                        for key, value in counts.items():
                          if key == tag:
                            v = value
                        current_counts.append(v)
                bow.append(current_counts)
        bow_np = np.array(bow).astype(float)
        return bow_np

syntactic_features = syntactic(all_headlines)
print(syntactic_features)

# convert features and labels to numpy arrays
X = syntactic_features
Y = np.array(all_labels)

# run classifier using 10-fold cross validation
# report mean accuracy 

scores = cross_val_score(MultinomialNB(), X, Y, scoring='accuracy', cv=10)
print(scores.mean())

[[4. 0. 0. ... 0. 1. 1.]
 [2. 0. 1. ... 0. 0. 0.]
 [3. 0. 0. ... 0. 0. 1.]
 ...
 [1. 0. 1. ... 0. 0. 2.]
 [2. 0. 1. ... 0. 0. 0.]
 [4. 0. 2. ... 0. 0. 0.]]
0.7672044584245077


3: Lexical: counts for 30 most common unigrams in entire corpus (remove stopwords and punctuation for unigram count)

In [55]:
def lexical(texts):
        eng_stopwords = stopwords.words('english')
        punctuation = set(string.punctuation)
        mostcommon = []
        for text in texts:      
                counts = []
                tokens = nltk.word_tokenize(text.lower())
                remove_stopwords = [word for word in tokens if word not in eng_stopwords]
                remove_punct = [word for word in remove_stopwords if word not in punctuation]
                mostcommon.append(remove_punct)
        flat_lex = [item for sublist in mostcommon for item in sublist]
        mostCommon = nltk.FreqDist(flat_lex).most_common(30)

        bow = []
        mostCommon = [x[0] for x in mostCommon]
        for text in texts:
              counts = []
              tokens = nltk.word_tokenize(text.lower())
              for wrd in mostCommon:
                  wrd_count = tokens.count(wrd)
                  counts.append(wrd_count)
              bow.append(counts)
        bow_np = np.array(bow).astype(float)
        return bow_np

lexical_features = lexical(all_headlines)
print(lexical_features)

# convert features and labels to numpy arrays
X = lexical_features
Y = np.array(all_labels)

# run classifier using 10-fold cross validation
# report mean accuracy 

scores = cross_val_score(MultinomialNB(), X, Y, scoring='accuracy', cv=10)
print(scores.mean())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
0.7349466337136605


4. Punctuation:  Counts for each punctuation mark in string.punctuation


In [56]:
def punctuation(texts):
        bow = []
        punctuation = set(string.punctuation)
        for text in texts:      
                counts = []
                tokens = nltk.word_tokenize(text.lower())
                for punct in  punctuation:
                        punct_count = tokens.count(punct)
                        counts.append(punct_count)
                bow.append(counts)
        bow_np = np.array(bow).astype(float)
        return bow_np

punctuation_features = punctuation(all_headlines)
print(punctuation_features)

# convert features and labels to numpy arrays
X = punctuation_features
Y = np.array(all_labels)

# run classifier using 10-fold cross validation
# report mean accuracy 

scores = cross_val_score(MultinomialNB(), X, Y, scoring='accuracy', cv=10)
print(scores.mean())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
0.5012524812441388


5: Complexity: average number of characters per word #unique words/#total words number of words Count of “long” words - words with >= 6 letters

In [57]:
def avg_char_per_word(text):
        avgChar = []
        tokens = nltk.word_tokenize(text)
        for word in tokens:
            avgChar.append(len(tokens))
        return sum(avgChar) / len(avgChar)

def unique_word_percent(text):
        tokens = nltk.word_tokenize(text)
        return len(set(tokens))/len(tokens)

def num_words(text):
        count = 0
        tokens = nltk.word_tokenize(text)
        for word in tokens:      
              count +=1
        return count

def num_long_words(text):
        count = 0
        tokens = nltk.word_tokenize(text)
        for word in tokens:      
            if len(word) > 5:
              count +=1
        return count

def complexity(texts):
        bow = []
        for text in texts:      
                avgCharPerWord = avg_char_per_word(text)
                uniqueWordPercent = unique_word_percent(text)
                numWords = num_words(text)
                numLongWords = num_long_words(text)
                bow.append([avgCharPerWord, uniqueWordPercent, numWords, numLongWords])
        bow_np = np.array(bow).astype(float)
        return bow_np

complexity_features = complexity(all_headlines)
print(complexity_features)

# convert features and labels to numpy arrays
X = complexity_features
Y = np.array(all_labels)

# run classifier using 10-fold cross validation
# report mean accuracy 

scores = cross_val_score(MultinomialNB(), X, Y, scoring='accuracy', cv=10)
print(scores.mean())

# 10.523035441639866
# 0.08085551695336646
# 307697
# 104454

[[13.          1.         13.          4.        ]
 [ 8.          1.          8.          4.        ]
 [12.          1.         12.          4.        ]
 ...
 [14.          1.         14.          4.        ]
 [ 9.          0.77777778  9.          1.        ]
 [13.          1.         13.          6.        ]]
0.7004498769146609


6. Your own proposed feature set: Think about what other features may be useful for clickbait identification and implement them.  You can get ideas from this paper: https://arxiv.org/pdf/1610.09786.pdf  

In [58]:
def possesive(texts):
        bow = []
        possesive_words = ["of", "'s",  "my", "mine", "your", "yours", "his", "her", "hers", "its", "our", "ours", "their", "theirs"]
        for text in texts:      
                counts = []
                tokens = nltk.word_tokenize(text.lower())
                for word in possesive_words:
                        word_count = tokens.count(word)
                        counts.append(word_count)
                bow.append(counts)
        bow_np = np.array(bow).astype(float)
        return bow_np

possesive_features = possesive(all_headlines)
print(possesive_features)

# convert features and labels to numpy arrays
X = possesive_features
Y = np.array(all_labels)

# run classifier using 10-fold cross validation
# report mean accuracy 

scores = cross_val_score(MultinomialNB(), X, Y, scoring='accuracy', cv=10)
print(scores.mean())


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
0.613316807986871


Combine all features!

In [59]:
def combined(texts):
        bow = []    
        stop_words_features = stop_words(texts)
        syntactic_features = syntactic(texts)
        lexical_features = lexical(texts)
        punctuation_features = punctuation(texts)
        complexity_features = complexity(texts)
        possesive_features = possesive(texts)
        bow_np = np.concatenate((stop_words_features, syntactic_features, lexical_features, punctuation_features, complexity_features, possesive_features), axis=1)
        return bow_np

combined_features = combined(all_headlines)
print(combined_features)

# convert features and labels to numpy arrays
X = combined_features
Y = np.array(all_labels)

# run classifier using 10-fold cross validation
# report mean accuracy 

scores = cross_val_score(MultinomialNB(), X, Y, scoring='accuracy', cv=10)
print(scores.mean())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [2. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
0.9185888754298219
