# Import required libraries

In [1]:
import numpy as np
import pandas as pd

# For text processing, cleaning
import nltk
from nltk.tag import pos_tag, map_tag
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
import text2emotion as te
import re
from collections import Counter

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aozy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aozy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aozy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Import dataset

In [2]:
# Read the training CSV, using its "id" column as the row index
news_data = pd.read_csv("train.csv", index_col="id")

In [3]:
news_data.shape

(20800, 4)

In [4]:
news_data.head()

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [5]:
# Information about the Variables
news_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20800 entries, 0 to 20799
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   20242 non-null  object
 1   author  18843 non-null  object
 2   text    20761 non-null  object
 3   label   20800 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 812.5+ KB


In [6]:
news_data.dtypes

title     object
author    object
text      object
label      int64
dtype: object

# Data Cleaning (Handling of null rows)

In [7]:
# Check if any of the columns contains NULL value
news_data.isnull().sum(axis=0)

title      558
author    1957
text        39
label        0
dtype: int64

In [8]:
# Missing text falls back to the title (stays null when the title is null too)
news_data["text"] = news_data["text"].fillna(news_data["title"])

# Missing title falls back to the first 50 characters of the text
news_data["title"] = news_data["title"].fillna(news_data["text"].str[:50])

# Articles without an author are attributed to "Unknown"
news_data["author"] = news_data["author"].fillna("Unknown")

In [9]:
# Discard every row that still holds a missing value
news_data = news_data.dropna()

# Row/column count after the drop
news_data.shape

(20800, 4)

## Some data standardization before EDA (For better presentation)

In [10]:
# Human-readable class name: label 1 -> 'fake', anything else -> 'not fake'
is_fake = news_data['label'].eq(1)
news_data['label_translated'] = np.where(is_fake, 'fake', 'not fake')
news_data.head()

Unnamed: 0_level_0,title,author,text,label,label_translated
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,fake
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,not fake
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,fake
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,fake
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,fake


# Generate extra data using text processing techniques (Part 1)

## Character & Word count of Title & Text columns

In [11]:
# Character count of the title and of the text
for column in ('title', 'text'):
    news_data[f'{column}_charcount'] = news_data[column].str.len()

# Word count (whitespace-separated tokens) of the title and of the text
for column in ('title', 'text'):
    news_data[f'{column}_wordcount'] = news_data[column].str.split().str.len()

news_data.head()

Unnamed: 0_level_0,title,author,text,label,label_translated,title_charcount,text_charcount,title_wordcount,text_wordcount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,fake,81,4930,14,820
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,not fake,55,4160,9,710
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,fake,33,7692,7,1266
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,fake,63,3237,10,557
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,fake,93,938,14,154


# Data cleaning

## Removal of unnecessary data (Symbols & Numbers) 

In [12]:
# Strip digits and symbols, which carry no useful signal here
def remove_symbols_numbers(string):
    """Keep only ASCII letters, '!', '?' and the straight apostrophe.

    Every other character (digits, other punctuation, non-ASCII characters,
    any whitespace) is turned into a space, after which runs of whitespace
    are collapsed to a single space and the ends are trimmed.
    """
    # Blank out everything outside the allowed character set
    blanked = re.sub('[^a-zA-Z!?\']', ' ', string)
    # split() with no argument drops empty pieces, so joining normalises spacing
    pieces = blanked.split()
    return " ".join(pieces)

In [13]:
# Clean the article body first, then the headline, with the same function
for column in ('text', 'title'):
    news_data[column] = news_data[column].apply(remove_symbols_numbers)

## Removal of stopwords & generate stopwords count

In [14]:
# Download stopwords data
nltk.download('stopwords')

# Assign to variable
my_stopwords = stopwords.words('english')

# Add some additional stopwords
my_stopwords.extend(["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", 
                     "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", 
                     "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", 
                     "doing", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", 
                     "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", 
                     "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", 
                     "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", 
                     "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", 
                     "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", 
                     "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", 
                     "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", 
                     "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", 
                     "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", 
                     "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"
                    ])
# Converting the list to a set removes the duplicated entries
my_stopwords = set(my_stopwords)
# Show the complete stopword collection
print(my_stopwords)

{'haven', 'for', "she'd", "we're", 'why', 't', 'most', 'some', 'are', "weren't", 'but', "isn't", 'herself', 'between', 'out', 'into', 'few', 'their', 'each', 'below', 'her', 'what', 'through', 'm', 'until', 'at', 've', "you'd", 'where', 'o', "who's", 'all', "hadn't", 'isn', 'were', 'wasn', "wouldn't", "that's", 'by', 'doesn', 'under', 's', 'being', 'had', 'further', 'won', "we'll", 'd', 'weren', 'very', 'there', 'those', 'because', 'and', 'shan', "aren't", "what's", "you're", 'hadn', 'too', 'ma', 'shouldn', 'here', 'am', 'been', 'as', 'didn', 'on', 'how', 'once', 'just', 'so', "you've", 'should', 'do', "she'll", "didn't", 'your', 'after', 'mightn', 'would', 'could', 'ours', "shan't", "it's", 'we', 'can', 'did', 'be', 'my', "he's", "they'll", 'over', 'if', 'down', 'other', 'me', 'mustn', "can't", "haven't", "how's", "don't", 'before', "doesn't", 'that', "let's", 'himself', 'you', "wasn't", 'itself', 'while', 'against', 'only', 'these', 'does', 'having', 'aren', "they've", "i'll", "needn

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aozy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Drop stopwords (words that add no meaning) and report how many were dropped
def remove_stop_words(string):
    """Lower-case ``string``, remove stopwords, and count the removals.

    Returns a ``(cleaned, removed)`` tuple: ``cleaned`` is the remaining
    words joined by single spaces, ``removed`` is the number of tokens that
    were found in ``my_stopwords``.
    """
    # Lower-case first so matching against the stopword collection works
    tokens = string.lower().split()
    # Keep only the tokens that are not stopwords
    kept = [token for token in tokens if token not in my_stopwords]
    # Whatever was not kept was a stopword
    removed = len(tokens) - len(kept)
    return " ".join(kept), removed

In [16]:
# Strip stopwords from the text, then the title, recording the removal counts
for column in ('text', 'title'):
    cleaned, removed_counts = zip(*news_data[column].apply(remove_stop_words))
    news_data[column] = cleaned
    news_data[f'stopwords_count_{column}'] = removed_counts

## Word stemming

In [17]:
# Shared Porter stemmer instance
ps = PorterStemmer()

# Reduce every word of a sentence to its stem
def stem(string):
    """Return ``string`` with each whitespace-separated word passed through ``ps.stem``.

    The stemmed words are re-joined with single spaces.
    """
    stemmed_words = map(ps.stem, string.split())
    return " ".join(stemmed_words)

In [18]:
# Stem the article body first, then the headline
for column in ('text', 'title'):
    news_data[column] = news_data[column].apply(stem)

# Generate extra data using text processing techniques (Part 2)

## Generate sentiment (Polarity)

In [19]:
# Download resources needed
nltk.download('vader_lexicon')

# Init SIA object
sia = SentimentIntensityAnalyzer()

# Expand a short polarity key into its full name
def format_polarity(polarity):
    """Map "neu"/"pos"/"neg" to "neutral"/"positive"/"negative".

    Any other value yields ``None``.
    """
    expansions = (
        ("neu", "neutral"),
        ("pos", "positive"),
        ("neg", "negative"),
    )
    for short_form, full_form in expansions:
        if polarity == short_form:
            return full_form
    # Unknown key: nothing to expand
    return None

# Function to return the polarity & the corresponding confidence score
def get_polarity_score(sentence, max_words=100):
    """Return the dominant polarity of ``sentence`` and its score.

    Parameters
    ----------
    sentence : str
        Text to analyse with the module-level ``sia`` analyzer.
    max_words : int, optional
        Only the first ``max_words`` whitespace-separated words are scored,
        because scoring long texts is slow. Defaults to 100.

    Returns
    -------
    tuple
        ``(polarity, score)`` where ``polarity`` is "negative", "neutral" or
        "positive" and ``score`` is the highest of the neg/neu/pos values.
        When every value is 0.0 (e.g. an empty sentence) the result is
        ``("neutral", 0.0)`` rather than whichever key happens to come first.
    """
    # Split once and truncate only when the sentence is too long
    words = sentence.split()
    if len(words) > max_words:
        sentence = " ".join(words[:max_words])

    scores = sia.polarity_scores(sentence)
    # Ignore the compound score without mutating the analyzer's result
    proportions = {key: value for key, value in scores.items() if key != 'compound'}
    # max() returns the first maximal item, same tie-breaking as a stable descending sort
    polarity, score = max(proportions.items(), key=lambda item: item[1])
    # No signal at all: report neutral instead of the first key in dict order
    if score == 0.0:
        return "neutral", score
    return format_polarity(polarity), score

# Add polarity label & score columns for the title, then for the text
for column in ('title', 'text'):
    labels, scores = zip(*news_data[column].apply(get_polarity_score))
    news_data[f'{column}_polarity'] = labels
    news_data[f'{column}_polarity_score'] = scores

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Aozy\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Generate emotions

In [20]:
# Determine the dominant emotion of a sentence
def get_emotion(sentence):
    """Return the highest-scoring emotion reported by ``te.get_emotion``.

    Only the first 30 whitespace-separated words are analysed, because
    processing long inputs takes too long. If the top score is 0.0 the
    sentence is reported as "neutral".
    """
    tokens = sentence.split()
    # Cap the input length to keep the analysis fast
    if len(tokens) > 30:
        sentence = " ".join(tokens[:30])

    emotion_scores = te.get_emotion(sentence)
    # Rank emotions from highest to lowest score and take the leader
    ranked = sorted(emotion_scores.items(), key=lambda pair: pair[1], reverse=True)
    top_emotion, top_score = ranked[:1][0]
    # A top score of zero means no emotion stood out
    return "neutral" if top_score == 0.0 else top_emotion
    
# Add one emotion column for the title and one for the text
for column in ('title', 'text'):
    news_data[f'{column}_emotion'] = news_data[column].apply(get_emotion)

## Parts of speech tagging (POS)

In [21]:
# Download required resources for POS
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

# Count selected parts of speech in a sentence
def pos_tagging(sentence):
    """Return POS counts for ``sentence`` as ``[ADJ, ADV, NOUN, NUM, PRON, VERB]``.

    The sentence is split on whitespace, tagged with ``nltk.pos_tag``, and
    each tag is mapped from the 'en-ptb' tagset to the 'universal' tagset
    before counting. Tags that never occur count as 0.
    """
    tagged_words = nltk.pos_tag(sentence.split())
    # Collapse the fine-grained tags into the coarse universal tagset
    universal_tags = [map_tag('en-ptb', 'universal', tag) for _, tag in tagged_words]
    tag_counts = Counter(universal_tags)
    # Counter returns 0 for missing keys, so no membership test is needed
    reported_tags = ('ADJ', 'ADV', 'NOUN', 'NUM', 'PRON', 'VERB')
    return [tag_counts[tag] for tag in reported_tags]

# Add six POS-count columns for the title, then six for the text.
# The suffix order matches the list returned by pos_tagging.
pos_suffixes = ('adj', 'adv', 'noun', 'num', 'pron', 'verb')
for column in ('title', 'text'):
    per_tag_counts = zip(*news_data[column].apply(pos_tagging))
    for suffix, counts in zip(pos_suffixes, per_tag_counts):
        news_data[f'{column}_pos_{suffix}'] = counts

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Aozy\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Aozy\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


# Final cleanup

## After cleaning, delete rows which are empty

In [22]:
# Check if any of the columns are empty string "" after cleaning
for col in news_data.columns:
    # Print the number of empty strings in this column
    empty_count = int(news_data[col].eq('').sum())
    print(f"{col}:\t\t\t{empty_count}")
    # Replace empty strings with NaN so dropna() can remove those rows.
    # Assign the result back: an in-place replace on news_data[col] acts on a
    # temporary Series and does not update the frame under pandas Copy-on-Write.
    news_data[col] = news_data[col].replace('', np.nan)

title:			130
author:			0
text:			102
label:			0
label_translated:			0
title_charcount:			0
text_charcount:			0
title_wordcount:			0
text_wordcount:			0
stopwords_count_text:			0
stopwords_count_title:			0
title_polarity:			0
title_polarity_score:			0
text_polarity:			0
text_polarity_score:			0
title_emotion:			0
text_emotion:			0
title_pos_adj:			0
title_pos_adv:			0
title_pos_noun:			0
title_pos_num:			0
title_pos_pron:			0
title_pos_verb:			0
text_pos_adj:			0
text_pos_adv:			0
text_pos_noun:			0
text_pos_num:			0
text_pos_pron:			0
text_pos_verb:			0


In [23]:
# Check if any of the columns contains NULL value after cleaning
news_data.isnull().sum(axis=0)

title                    130
author                     0
text                     102
label                      0
label_translated           0
title_charcount            0
text_charcount             0
title_wordcount            0
text_wordcount             0
stopwords_count_text       0
stopwords_count_title      0
title_polarity             0
title_polarity_score       0
text_polarity              0
text_polarity_score        0
title_emotion              0
text_emotion               0
title_pos_adj              0
title_pos_adv              0
title_pos_noun             0
title_pos_num              0
title_pos_pron             0
title_pos_verb             0
text_pos_adj               0
text_pos_adv               0
text_pos_noun              0
text_pos_num               0
text_pos_pron              0
text_pos_verb              0
dtype: int64

In [24]:
# Remove rows that ended up with missing values, then renumber the index from 0
news_data = news_data.dropna().reset_index(drop=True)
# Give the fresh index its original name back
news_data.index.name = 'id'

# Row/column count of the cleaned data
news_data.shape

(20584, 29)

In [25]:
# View cleaned data
news_data.head()

Unnamed: 0_level_0,title,author,text,label,label_translated,title_charcount,text_charcount,title_wordcount,text_wordcount,stopwords_count_text,...,title_pos_noun,title_pos_num,title_pos_pron,title_pos_verb,text_pos_adj,text_pos_adv,text_pos_noun,text_pos_num,text_pos_pron,text_pos_verb
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,hous dem aid even see comey letter jason chaff...,Darrell Lucus,hous dem aid even see comey letter jason chaff...,1,fake,81,4930,14,820,406,...,6,0,0,1,102,16,234,3,0,68
1,flynn hillari clinton big woman campu breitbart,Daniel J. Flynn,ever get feel life circl roundabout rather hea...,0,not fake,55,4160,9,710,330,...,5,0,0,1,81,13,208,3,3,45
2,truth might get fire,Consortiumnews.com,truth might get fire octob tension intellig an...,1,fake,33,7692,7,1266,575,...,2,0,0,2,155,33,385,4,3,92
3,civilian kill singl us airstrik identifi,Jessica Purkiss,video civilian kill singl us airstrik identifi...,1,fake,63,3237,10,557,244,...,2,0,1,1,74,5,158,6,12,44
4,iranian woman jail fiction unpublish stori wom...,Howard Portnoy,print iranian woman sentenc six year prison ir...,1,fake,93,938,14,154,64,...,7,0,0,1,13,5,54,2,0,11


# Save cleaned data to disk (cleaning takes about an hour, so the result is stored for reuse)

In [26]:
# Save file (Cleaning too long, around 1h)
news_data.to_csv("cleaned_news_data.csv", sep=',', encoding='utf-8')