In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import spacy
from nltk.probability import FreqDist
from wordcloud import WordCloud
# Show up to 100 characters per cell so the long article text stays readable in previews.
pd.set_option('display.max_colwidth',100)

In [2]:
# Load the labelled news articles, print the schema (columns, dtypes, null counts),
# and display the first rows.
df = pd.read_csv('Data/Fake_Real_News_Data.csv')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,A whirlwind day in D.C. showcases Trump’s unorthodox views and shifting tone,Donald Trump endorsed an unabashedly noninterventionist approach to world affairs Monday during ...,REAL
1,1,"In Baltimore's call for federal police probe, a new search for answers (+video)","While some Justice Department investigations are adversarial, a new model of collaborative refor...",REAL
2,2,Trump Proudly Declares: Most Of The People I’ve Insulted Deserved It,Trump Proudly Declares: Most Of The People I’ve Insulted Deserved It By Andrew Bradford on Octob...,FAKE
3,3,Inside the Trump-Bush melodrama: Decades of tension and discomfort,Donald Trump spent a day in January 2014 hobnobbing with politicians at the Trump International ...,REAL
4,4,Shutdown clash to return in force by December,"Notable names include Ray Washburne (Commerce), a Dallas-based investor, is reported to be under...",REAL


In [3]:
# Remove the 'Unnamed: 0' column (it appears to be a row index saved into the CSV).
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been dropped.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# Number of rows that exactly repeat an earlier row across all remaining columns.
df.duplicated().sum()

29

In [5]:
# Discard the repeated rows, then re-count to confirm that none are left.
df = df.drop_duplicates()
df.duplicated().sum()

0

In [6]:
# Add a lower-cased version of each article body alongside the original text.
df = df.assign(lower_text=df['text'].str.lower())
df.head()

Unnamed: 0,title,text,label,lower_text
0,A whirlwind day in D.C. showcases Trump’s unorthodox views and shifting tone,Donald Trump endorsed an unabashedly noninterventionist approach to world affairs Monday during ...,REAL,donald trump endorsed an unabashedly noninterventionist approach to world affairs monday during ...
1,"In Baltimore's call for federal police probe, a new search for answers (+video)","While some Justice Department investigations are adversarial, a new model of collaborative refor...",REAL,"while some justice department investigations are adversarial, a new model of collaborative refor..."
2,Trump Proudly Declares: Most Of The People I’ve Insulted Deserved It,Trump Proudly Declares: Most Of The People I’ve Insulted Deserved It By Andrew Bradford on Octob...,FAKE,trump proudly declares: most of the people i’ve insulted deserved it by andrew bradford on octob...
3,Inside the Trump-Bush melodrama: Decades of tension and discomfort,Donald Trump spent a day in January 2014 hobnobbing with politicians at the Trump International ...,REAL,donald trump spent a day in january 2014 hobnobbing with politicians at the trump international ...
4,Shutdown clash to return in force by December,"Notable names include Ray Washburne (Commerce), a Dallas-based investor, is reported to be under...",REAL,"notable names include ray washburne (commerce), a dallas-based investor, is reported to be under..."


In [7]:
# Fetch the tokenizer data used by nltk.word_tokenize.
# - quiet=True keeps the machine-specific download path out of the saved output.
# - Recent NLTK releases look up 'punkt_tab' instead of 'punkt', so both are
#   requested to keep the cell working on either version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

# Split every lower-cased article into a list of word and punctuation tokens.
df['tokens'] = df['lower_text'].apply(nltk.word_tokenize)
df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\carlo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,title,text,label,lower_text,tokens
0,A whirlwind day in D.C. showcases Trump’s unorthodox views and shifting tone,Donald Trump endorsed an unabashedly noninterventionist approach to world affairs Monday during ...,REAL,donald trump endorsed an unabashedly noninterventionist approach to world affairs monday during ...,"[donald, trump, endorsed, an, unabashedly, noninterventionist, approach, to, world, affairs, mon..."
1,"In Baltimore's call for federal police probe, a new search for answers (+video)","While some Justice Department investigations are adversarial, a new model of collaborative refor...",REAL,"while some justice department investigations are adversarial, a new model of collaborative refor...","[while, some, justice, department, investigations, are, adversarial, ,, a, new, model, of, colla..."
2,Trump Proudly Declares: Most Of The People I’ve Insulted Deserved It,Trump Proudly Declares: Most Of The People I’ve Insulted Deserved It By Andrew Bradford on Octob...,FAKE,trump proudly declares: most of the people i’ve insulted deserved it by andrew bradford on octob...,"[trump, proudly, declares, :, most, of, the, people, i, ’, ve, insulted, deserved, it, by, andre..."
3,Inside the Trump-Bush melodrama: Decades of tension and discomfort,Donald Trump spent a day in January 2014 hobnobbing with politicians at the Trump International ...,REAL,donald trump spent a day in january 2014 hobnobbing with politicians at the trump international ...,"[donald, trump, spent, a, day, in, january, 2014, hobnobbing, with, politicians, at, the, trump,..."
4,Shutdown clash to return in force by December,"Notable names include Ray Washburne (Commerce), a Dallas-based investor, is reported to be under...",REAL,"notable names include ray washburne (commerce), a dallas-based investor, is reported to be under...","[notable, names, include, ray, washburne, (, commerce, ), ,, a, dallas-based, investor, ,, is, r..."


In [8]:
# Load spaCy's small English pipeline with the 'parser' and 'ner' components
# disabled; the last line displays the names of the components that remain.
nlp_model = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
nlp_model.pipe_names

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer']

In [9]:
# Run the pipeline once and show the type of object it returns.
# NOTE(review): the input here is the literal string 'text', not an article
# from df — confirm that this is the intended sample.
doc = nlp_model('text')
type(doc)

spacy.tokens.doc.Doc

In [None]:
def spacy_process(lower_text, remove_spaces=False):
    """Lemmatize a text with spaCy, dropping punctuation and stop words.

    Parameters
    ----------
    lower_text : str
        Text to process. Callers are expected to pass already lower-cased
        text; this function does not change case itself.
    remove_spaces : bool, default False
        When True, whitespace-only tokens (``token.is_space``) are dropped as
        well. The default keeps them, so results are identical to the previous
        version of this function. With True, a text consisting only of
        whitespace, punctuation and stop words yields an empty list.

    Returns
    -------
    list of str
        ``lemma_`` of every token that passes the filters, in document order.
    """
    # nlp_model is the spaCy pipeline defined at notebook level.
    doc = nlp_model(lower_text)
    return [
        token.lemma_
        for token in doc
        if not token.is_punct
        and not token.is_stop
        and not (remove_spaces and token.is_space)
    ]

## lemmatize every lower-cased news article with the spacy function
df['lemmas'] = df['lower_text'].map(spacy_process)
df.head()

In [None]:
# Rebuild one space-separated string per article from its list of lemmas.
df['lemmas_joined'] = df['lemmas'].map(' '.join)
df.head()

In [None]:
# Inspect the Python type stored in the 'lemmas' column, using the first row.
type(df['lemmas'].iloc[0])

In [None]:
# Article length measured as the number of entries in its 'tokens' list.
df['length'] = [len(token_list) for token_list in df['tokens']]
df.head(2)

In [None]:
# Class balance: fraction of rows carrying each label value.
df['label'].value_counts(normalize=True)

In [None]:
# Split the articles by label so the two groups can be inspected side by side.
real = df[df['label'].eq('REAL')]
fake = df[df['label'].eq('FAKE')]

for heading, subset in (('real news', real), ('fake news', fake)):
    print(heading)
    display(subset.head())

In [None]:
# Bars use seaborn's default estimator (the mean) of 'length' per label; the
# y-axis label states the unit so the figure can be read on its own.
ax = sns.barplot(data=df, x='label', y='length')
ax.set_title('Comparing Lengths of Texts')
ax.set_ylabel('Tokens per article')

plt.show()

# 'length' holds the number of tokens per article (len of the 'tokens' list),
# so the summary reports a token count rather than a character length.
# Note that the text reports the median while the bars above show the mean.
real_len = real['length'].median()
fake_len = fake['length'].median()
print(f'The median token count for real news is {real_len} and {fake_len} for fake news.')

In [None]:
# Flatten the per-article lemma lists into one flat word list per class.
# explode() turns an empty lemma list into NaN; dropna() keeps those float
# placeholders out of the frequency counts and out of any later ' '.join(...)
# over these lists, which would otherwise raise TypeError.
real_words = real['lemmas'].explode().dropna().to_list()
real_freq_dist = FreqDist(real_words)
fake_words = fake['lemmas'].explode().dropna().to_list()
fake_freq_dist = FreqDist(fake_words)

## Plot the distribution of the 20 most frequent lemmas for each class
real_freq_dist.plot(20, title='Real News Lemmatized Text Frequency Distribution')

fake_freq_dist.plot(20, title='Fake News Lemmatized Text Frequency Distribution');

In [None]:
def plot_wordclouds(high_cloud, low_cloud, title='Comparing Word Usage',
                    panel_titles=('Real News', 'Fake News')):
    """Plot two word clouds side by side in one figure.

    Parameters
    ----------
    high_cloud, low_cloud : image-like
        Objects accepted by ``Axes.imshow``; drawn in the left and right
        panel respectively.
    title : str, default 'Comparing Word Usage'
        Figure-level title.
    panel_titles : sequence of two str, default ('Real News', 'Fake News')
        Titles of the left and right panels. These were previously fixed to
        'High Ratings' / 'Low Ratings', which mislabelled the real/fake clouds
        this notebook passes in.

    Returns
    -------
    The figure object created by ``plt.subplots``.
    """
    fig, axes = plt.subplots(ncols=2, figsize=(10, 5))

    # Each panel shows one cloud with its own title and no axis ticks.
    clouds = (high_cloud, low_cloud)
    for ax, cloud, panel_title in zip(axes, clouds, panel_titles):
        ax.imshow(cloud)
        ax.set_title(panel_title)
        ax.axis('off')
    fig.tight_layout()

    # The overall title is added after tight_layout, as before.
    fig.suptitle(title, y=1.0, fontsize='xx-large')
    return fig

In [None]:
# Concatenate all lemmas of each class into one space-separated string.
real_lemmas, fake_lemmas = (
    ' '.join(word_list) for word_list in (real_words, fake_words)
)

In [None]:
# Settings shared by both clouds (random_state is fixed at 42); only the colormap differs.
wordcloud_kws = {
    'min_word_length': 2,
    'width': 800,
    'height': 600,
    'random_state': 42,
}
real_cloud = WordCloud(colormap='Greens', **wordcloud_kws).generate(real_lemmas)
fake_cloud = WordCloud(colormap='Reds', **wordcloud_kws).generate(fake_lemmas)

## Draw the real-news and fake-news clouds side by side
fig = plot_wordclouds(real_cloud, fake_cloud, title='Comparing Lemmas')