In [57]:
# Import pandas for data handling
import pandas as pd

# NLTK is our Natural-Language-Took-Kit
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Libraries for helping us with strings
import string
# Regular Expression Library
import re

# Import our text vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# Import our classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier


# Import some ML helper function
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report



# Import our metrics to evaluate our model
from sklearn import metrics


# Library for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# You may need to download these from nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hilolarustamova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hilolarustamova/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hilolarustamova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [58]:
# Import our data
df = pd.read_csv('fake_or_real_news.csv')
print(df.shape)
df.head()

(6335, 4)


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [59]:
#no null values
df.isnull().sum()
#no dublicates
print(df.duplicated().sum())

0


In [60]:
#lowercasing all text, title and label
df['title'] = df['title'].str.lower()
df['text'] = df['text'].str.lower()
df['label'] = df['label'].str.lower()


df.head()


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,you can smell hillary’s fear,"daniel greenfield, a shillman journalism fello...",fake
1,10294,watch the exact moment paul ryan committed pol...,google pinterest digg linkedin reddit stumbleu...,fake
2,3608,kerry to go to paris in gesture of sympathy,u.s. secretary of state john f. kerry said mon...,real
3,10142,bernie supporters on twitter erupt in anger ag...,"— kaydee king (@kaydeeking) november 9, 2016 t...",fake
4,875,the battle of new york: why this primary matters,it's primary day in new york and front-runners...,real


In [74]:
#removing unwanted characters


def remove_unwanted_char(a_string):    
    a_string = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", str(a_string))
    return a_string

remove_unwanted_char(df['title'])
print(df)

      Unnamed: 0                                              title  \
0           8476                       you can smell hillary’s fear   
1          10294  watch the exact moment paul ryan committed pol...   
2           3608        kerry to go to paris in gesture of sympathy   
3          10142  bernie supporters on twitter erupt in anger ag...   
4            875   the battle of new york: why this primary matters   
...          ...                                                ...   
6330        4490  state department says it can't find emails fro...   
6331        8062  the ‘p’ in pbs should stand for ‘plutocratic’ ...   
6332        8622  anti-trump protesters are tools of the oligarc...   
6333        4021  in ethiopia, obama seeks progress on peace, se...   
6334        4330  jeb bush is suddenly attacking trump. here's w...   

                                                   text label  
0     daniel greenfield, a shillman journalism fello...  fake  
1     google pinter

In [75]:
# Remove all punctuation

def remove_punctuation(a_string):    
    a_string = re.sub(r'[^\w\s]','', str(a_string))
    return a_string

remove_punctuation(df['title'])

'0                            you can smell hillarys fear\n1       watch the exact moment paul ryan committed pol\n2             kerry to go to paris in gesture of sympathy\n3       bernie supporters on twitter erupt in anger ag\n4        the battle of new york why this primary matters\n                                                      \n6330    state department says it cant find emails fro\n6331    the p in pbs should stand for plutocratic \n6332    antitrump protesters are tools of the oligarc\n6333    in ethiopia obama seeks progress on peace se\n6334    jeb bush is suddenly attacking trump heres w\nName title Length 6335 dtype object'

In [76]:
# Remove all stopwords

def remove_stopwords(a_string):
    # Break the sentence down into a list of words
    words = word_tokenize(a_string)
    
    # Make a list to append valid words into
    valid_words = []
    
    # Loop through all the words
    for word in words:
        
        # Check if word is not in stopwords
        if word not in stopwords:
            
            # If word not in stopwords, append to our valid_words
            valid_words.append(word)

    # Join the list of words together into a string
    a_string = ' '.join(valid_words)

    return a_string
            
remove_stopwords(str(df['title']))

"0 smell hillary ’ fear 1 watch exact moment paul ryan committed pol ... 2 kerry go paris gesture sympathy 3 bernie supporters twitter erupt anger ag ... 4 battle new york : primary matters ... 6330 state department says ca n't find emails fro ... 6331 ‘ p ’ pbs stand ‘ plutocratic ’ ... 6332 anti-trump protesters tools oligarc ... 6333 ethiopia , obama seeks progress peace , se ... 6334 jeb bush suddenly attacking trump . 's w ... Name : title , Length : 6335 , dtype : object"

In [89]:
def text_pipeline(input_string):
    input_string = make_lower(input_string)
    input_string = remove_punctuation(input_string)
    #input_string = lem_with_pos_tag(input_string)
    input_string = remove_stopwords(input_string)    
    return input_string


df['title_clean'] = df['title']
# df['message_clean'] = df['message_clean'].apply(make_lower)
# df['message_clean'] = df['message_clean'].apply(remove_punctuation)
# df['message_clean'] = df['message_clean'].apply(lem_with_pos_tag)
# df['message_clean'] = df['message_clean'].apply(remove_stopwords)
df['title_clean'] = df['title'].apply(text_pipeline)

print("ORIGINAL TEXT:", df['title'])
print("CLEANDED TEXT:", df['title_clean'])

ORIGINAL TEXT: 0                            you can smell hillary’s fear
1       watch the exact moment paul ryan committed pol...
2             kerry to go to paris in gesture of sympathy
3       bernie supporters on twitter erupt in anger ag...
4        the battle of new york: why this primary matters
                              ...                        
6330    state department says it can't find emails fro...
6331    the ‘p’ in pbs should stand for ‘plutocratic’ ...
6332    anti-trump protesters are tools of the oligarc...
6333    in ethiopia, obama seeks progress on peace, se...
6334    jeb bush is suddenly attacking trump. here's w...
Name: title, Length: 6335, dtype: object
CLEANDED TEXT: 0                                     smell hillarys fear
1       watch exact moment paul ryan committed politic...
2                         kerry go paris gesture sympathy
3       bernie supporters twitter erupt anger dnc trie...
4                         battle new york primary matters
 

In [92]:
# Define our `X` and `y` data. 

X = df['title_clean'].values

y = df['label'].values

In [93]:
# Split our data into testing and training like always. 
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)


# Save the raw text for later just incase
X_train_text = X_train
X_test_text = X_test

In [94]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize our vectorizer
vectorizer = TfidfVectorizer()

# This makes your vocab matrix
vectorizer.fit(X_train)

# This transforms your documents into vectors.
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

print(X_train.shape, type(X))

(5068, 9341) <class 'numpy.ndarray'>


In [96]:
title_clean = vectorizer.get_feature_names()
label = vectorizer.idf_

print(len(title_clean), len(label))

df_idf = pd.DataFrame.from_dict( {'title_clean': title_clean, 'label': label})

df_idf = df_idf.sort_values(by='label', ascending=False)

df_idf

9341 9341


Unnamed: 0,title_clean,label
0,01,8.837752
5338,millennial,8.837752
5358,minnesota,8.837752
5357,minneapolis,8.837752
5354,mining,8.837752
...,...,...
5657,new,4.071313
8837,us,4.005446
4029,hillary,3.467114
1760,clinton,3.336493
