In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_columns',100)
pd.set_option('display.max_rows',100)
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


from wordcloud import WordCloud, STOPWORDS

import string
from string import punctuation
from matplotlib import style

### In this section, I merged all of my datasets and created:
1. 2019/2020 CB vs Non-CB headline dataset (20,000)
    - non-CB data pulled from API and webscraped from NY Times, The Washington Post, The Guardian, Bloomberg, and Reuters 
    - CB data pulled from 6 Twitter profiles (Buzzfeed, Examiner, ThePoliticalInsider, Upworthy, BoredPanda, The Odyssey
    - features - text, class, date
2. 2007 - 2016 CB vs Non-CB headline dataset (30,000)
    - from Kaggle 
    - Non-CB: NY Times, The Guardian, The Hindu, Wikinews
    - CB: BuzzFeed, Upworthy, ViralNova, Thatscoop, Scoopwhoop and ViralStories
    - text, class
3. One total dataset combining both (50,000)
    - text headlines, class

In [5]:
#reading in datasets
clickbait_df=pd.read_csv('clickbait_final.csv',index_col=0)
nytimes_df=pd.read_csv('nytimes_data_final.csv',index_col=0)
guardian_df=pd.read_csv('guardian_headlines.csv',index_col=0)
twp_df=pd.read_csv('TWP_scraped.csv',index_col=0)
reuters_df=pd.read_csv('reuters_100.csv',index_col=0)
bloomberg_df=pd.read_csv('bloomberg_100.csv',index_col=0)

In [56]:
#labeling class to 1 for all clickbait headlines
clickbait_df['class']=1
clickbait_df.shape

(11116, 3)

In [57]:
#labeling 'class' to 0 for non clickbait headlines
nytimes_df['class']=0
nytimes_df.shape

(5299, 3)

In [58]:
guardian_df['class']=0
guardian_df.shape

(3400, 3)

In [36]:
twp_df.rename(columns={'title':"text",'published':'date'},inplace=True)
twp_df['class']=0

In [37]:
reuters_df['class']=0

In [38]:
bloomberg_df['class']=0

In [39]:
all_headlines_df=pd.concat([clickbait_df,nytimes_df,guardian_df,twp_df,reuters_df,bloomberg_df])

In [54]:
all_headlines_df.shape

(20172, 3)

In [60]:
#all_headlines_df.to_csv('2019_2020_all_headlines.csv')

In [71]:
all_headlines_df.drop(columns='date',inplace=True)

In [61]:
dataset2=pd.read_csv('2007_2016_headline_data.csv')

In [68]:
dataset2.rename(columns={'headline':'text','clickbait':'class'},inplace=True)

In [69]:
dataset2

Unnamed: 0,text,class
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1
...,...,...
31995,"To Make Female Hearts Flutter in Iraq, Throw a...",0
31996,"British Liberal Democrat Patsy Calton, 56, die...",0
31997,Drone smartphone app to help heart attack vict...,0
31998,"Netanyahu Urges Pope Benedict, in Israel, to D...",0


In [72]:
df = pd.concat([all_headlines_df,dataset2])

In [74]:
df.shape

(52172, 2)

In [75]:
df['class'].value_counts()

1    27115
0    25057
Name: class, dtype: int64

In [79]:
#df.to_csv('total_headlines.csv')


In [593]:
df['class'].value_counts()

1    27115
0    25057
Name: class, dtype: int64

### In this section, I process the dataset for EDA and create additional features: 

In [664]:
df = pd.read_csv('total_headlines.csv')

In [10]:
up_df=pd.read_csv('../data/upworthy-archive.csv')
up_df['class']=1
up_df['text']=up_df['package_headline'].astype(str).apply(lambda x: x.lower())

In [62]:
gen_df=pd.read_csv('../data/nytstuff.csv')
gen_df2=pd.read_csv('../data/with_engineeredfeat_data.csv')
gen_df2['class']=0
gen_df2['text']=gen_df['title'].astype(str).apply(lambda x: x.lower())
gen_df2

Unnamed: 0.1,Unnamed: 0,text,class,question,exclamation,starts_with_num,headline_words
0,0,"betsy devos confirmed as education secretary, ...",0,0,0,0,10
1,1,melania trump says white house could mean mill...,0,0,0,1,7
2,2,"as trump fears fraud, gop eliminates election ...",0,0,1,0,22
3,3,appeals court to decide on challenge to trump'...,0,0,0,0,13
4,4,at least 4 tornadoes reported in southeast lou...,0,0,0,0,19
...,...,...,...,...,...,...,...
52122,52167,,0,0,0,0,10
52123,52168,,0,0,0,0,9
52124,52169,,0,0,0,0,12
52125,52170,,0,0,0,0,9


In [665]:
#make text lowercase
df['text']=df['text'].apply(lambda x: x.lower())

In [20]:
#function to remove punctuation and non-alphabetical characters and links
import re
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    #text = re.sub('\w*\d\w*', ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('  ', ' ', text)
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('“','',text)
    text = re.sub('”','',text)
    text = re.sub('’','',text)
    text = re.sub('–','',text)
    text = re.sub('‘','',text)
    
    return text

In [11]:
#function to find if string contains a question and if so, update new feature with a 1 for yes or 0 for no
question_words = ['who','what','where','why','when','whose','whom','would','will','how','which','should','could']

def contains_question(headline):
    if not headline:
        return 0
    elif "?" in headline or headline.startswith(('who','what','where','why','when','whose','whom','would','will','how','which','should','could','did','do')):
        return int(1)
    else: 
        return int(0)

df['question_6']=df['text'].apply(contains_question)


NameError: name 'df' is not defined

In [13]:
up_df['question']=up_df['text'].apply(contains_question)
up_df['question'].value_counts()

0    118408
1     32409
Name: question, dtype: int64

In [74]:
def contains_question(headline):
    if not headline:
        return 0
    elif "?" in headline or headline.startswith(('who','what','where','why','when','whose','whom','would','will','how','which','should','could','did','do')):
        return int(1)
    else: 
        return int(0)

gen_df2['question']=gen_df['title'].astype(str).apply(contains_question)
gen_df2 = gen_df2.dropna(subset=['question'])
gen_df2['question'] = gen_df2['question'].astype(int)
gen_df2['question'].value_counts()

0    3604
1     220
Name: question, dtype: int64

In [332]:
#test
#df['text'][0].startswith(('who','what','where','why','when','whose','whom','would','will','how','which','should','could','trey'))

In [667]:
df.rename(columns={'question_6':'question'},inplace=True)

In [668]:
df['question'].value_counts()

0    47388
1     4784
Name: question, dtype: int64

In [669]:
#create function to find if headline contains '!' and create new feature with 1 for yes and 0 for no
def contains_exclamation(headline):
    if "!" in headline: 
        return 1
    else: 
        return 0
df['exclamation']=df['text'].apply(contains_exclamation)

In [76]:
def contains_exclamation(headline):
    if "!" in headline: 
        return 1
    else: 
        return 0

gen_df2['exclamation']=gen_df['title'].astype(str).apply(contains_exclamation)
gen_df2 = gen_df2.dropna(subset=['exclamation'])
gen_df2['exclamation'] = gen_df2['exclamation'].astype(int)
gen_df2['exclamation'].value_counts()

0    3820
1       4
Name: exclamation, dtype: int64

In [52]:
def contains_exclamation(headline):
    if "!" in headline: 
        return 1
    else: 
        return 0

gen_df['exclamation']=gen_df['text'].apply(contains_exclamation)
gen_df['exclamation'].value_counts()

0    3531
1     293
Name: exclamation, dtype: int64

In [670]:
df.exclamation.value_counts()

0    51614
1      558
Name: exclamation, dtype: int64

In [671]:
#clean headlines to remove puncuation and links
#create feature to count words of each (before removing stop words and numbers)
#create feature to count if headline starts with a digit or not

feature_engineering_clean = lambda x: clean_text_round1(x)
df.text = pd.DataFrame(df.text.apply(feature_engineering_clean))


#create function to find if headline starts with a digit and create new feature with 1 for yes and 0 for no
def starts_with_num(headline):
    if headline.startswith(('1','2','3','4','5','6','7','8','9')): 
        return 1
    else: 
        return 0

df['starts_with_num']=df['text'].apply(starts_with_num)

In [22]:
#create function to find if headline starts with a digit and create new feature with 1 for yes and 0 for no
def starts_with_num(headline):
    if headline.startswith(('1','2','3','4','5','6','7','8','9')): 
        return 1
    else: 
        return 0

feature_engineering_clean = lambda x: clean_text_round1(x)
up_df.text = pd.DataFrame(up_df.text.apply(feature_engineering_clean))
up_df['starts_with_num']=up_df['text'].apply(starts_with_num)

up_df.text

0         lets see … hire cops pay teachers buy books fo...
1         people sent this lesbian questions and her rai...
2         3 million is what it takes for a state to lega...
3         the fact that sometimes innocent people are ex...
4         reason 351 to end the death penalty it costs 3...
                                ...                        
150812    an artist animates the ugly truth we should al...
150813    theyre being called walmarts worst nightmare a...
150814    theyre being called walmarts worst nightmare a...
150815    theyre being called walmarts worst nightmare a...
150816    theyre being called walmarts worst nightmare a...
Name: text, Length: 150817, dtype: object

In [80]:
#create function to find if headline starts with a digit and create new feature with 1 for yes and 0 for no
def starts_with_num(headline):
    if headline.startswith(('1','2','3','4','5','6','7','8','9')): 
        return 1
    else: 
        return 0

feature_engineering_clean = lambda x: clean_text_round1(x)
gen_df2.text = pd.DataFrame(gen_df.title.astype(str).apply(feature_engineering_clean))
gen_df2['starts_with_num']=gen_df['title'].astype(str).apply(starts_with_num)

gen_df2

Unnamed: 0.1,Unnamed: 0,text,class,question,exclamation,starts_with_num,headline_words
0,0,betsy devos confirmed as education secretary w...,0,0,0,0,10
1,1,melania trump says white house could mean mill...,0,0,0,0,7
2,2,as trump fears fraud gop eliminates election c...,0,0,0,0,22
3,3,appeals court to decide on challenge to trumps...,0,0,0,0,13
4,4,at least 4 tornadoes reported in southeast lou...,0,0,0,0,19
...,...,...,...,...,...,...,...
3819,3819,12 applications for representation at disclosu...,0,0,0,1,17
3820,3820,conditions preventing attempts to lift rescue ...,0,0,0,0,16
3821,3821,children in northern ireland eating treble rec...,0,0,0,0,17
3822,3822,call for united nations disabilities conventio...,0,0,0,0,16


In [672]:
df.drop(columns='Unnamed: 0',inplace=True)

In [673]:
#create feature that counts words in each headline
df['headline_words'] = df['text'].apply(lambda x: len(x.split()))

In [25]:
up_df['headline_words'] = up_df['text'].apply(lambda x: len(x.split()))

In [82]:
gen_df2['headline_words'] = gen_df['title'].astype(str).apply(lambda x: len(x.split()))

In [675]:
df = df[df['headline_words'] != 0]
df

Unnamed: 0,text,class,question,exclamation,starts_with_num,headline_words
0,trey gowdy just humiliated adam schiff in fron...,1,0,0,0,10
1,60 netflix titles leaving in july 2020,1,0,0,1,7
2,learn how to make a green grape taste like a j...,1,0,1,0,22
3,the new july netflix titles are here and there...,1,0,0,0,13
4,the courts say sex discrimination laws protect...,1,0,0,0,19
...,...,...,...,...,...,...
52167,to make female hearts flutter in iraq throw a ...,0,0,0,0,10
52168,british liberal democrat patsy calton 56 dies ...,0,0,0,0,9
52169,drone smartphone app to help heart attack vict...,0,0,0,0,12
52170,netanyahu urges pope benedict in israel to den...,0,0,0,0,9


In [676]:
df.to_csv('with_engineeredfeat_data.csv')

In [92]:
# up_df.drop(columns='created_at',inplace=True)
# up_df.drop(columns='updated_at',inplace=True)
# up_df.drop(columns='ab_test_id',inplace=True)
# up_df.drop(columns='excerpt',inplace=True)
# up_df.drop(columns='package_headline',inplace=True)
# up_df.drop(columns='lede',inplace=True)
# up_df.drop(columns='slug',inplace=True)
# up_df.drop(columns='package_picture_id',inplace=True)
# up_df.drop(columns='impressions',inplace=True)
# up_df.drop(columns='clicks',inplace=True)
# up_df.drop(columns='score',inplace=True)
# up_df.drop(columns='first_place',inplace=True)
# up_df.drop(columns='winner',inplace=True)
# up_df.drop(columns='share_text',inplace=True)
# up_df.drop(columns='share_img',inplace=True)
# up_df.drop(columns='test_week',inplace=True)
up_df2=pd.read_csv('../data/upworthy-archive.csv')
up_df['winner']=up_df2['winner']
up_df = up_df[up_df.winner == True]
up_df.drop(columns='winner',inplace=True)
up_df.drop(columns='Unnamed: 0',inplace=True)
up_df.to_csv('../data/cleaned_up.csv')

KeyError: "['Unnamed: 0'] not found in axis"

In [83]:
gen_df2.to_csv('../data/clean_gen.csv')
gen_df2

Unnamed: 0.1,Unnamed: 0,text,class,question,exclamation,starts_with_num,headline_words
0,0,betsy devos confirmed as education secretary w...,0,0,0,0,12
1,1,melania trump says white house could mean mill...,0,0,0,0,10
2,2,as trump fears fraud gop eliminates election c...,0,0,0,0,8
3,3,appeals court to decide on challenge to trumps...,0,0,0,0,11
4,4,at least 4 tornadoes reported in southeast lou...,0,0,0,0,8
...,...,...,...,...,...,...,...
3819,3819,12 applications for representation at disclosu...,0,0,0,1,7
3820,3820,conditions preventing attempts to lift rescue ...,0,0,0,0,8
3821,3821,children in northern ireland eating treble rec...,0,0,0,0,9
3822,3822,call for united nations disabilities conventio...,0,0,0,0,9
