In [2]:
# Import pandas for data handling
import pandas as pd
import pickle

# NLTK is our Natural-Language-Took-Kit
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Libraries for helping us with strings
import string
# Regular Expression Library
import re

# Import our text vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# Import our classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier


# Import some ML helper function
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report



# Import our metrics to evaluate our model
from sklearn import metrics


# Library for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the NLTK resources this notebook relies on.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)
# Rebinds the name of the imported corpus reader to the plain list of
# English stopwords that remove_stopwords() checks against.
stopwords = stopwords.words('english')

# Load the fake/real news dataset and print its (rows, columns) shape
df = pd.read_csv('../data/fake_or_real_news.csv')
print(df.shape)
# NOTE(review): the result of head(20) is neither printed nor assigned
df.head(20)

# Count of null values per column
print(df.isnull().sum())
# Count of fully duplicated rows
print(df.duplicated().sum())

# Relative frequency of each label.
# NOTE(review): this result is neither printed nor assigned either.
df['label'].value_counts(normalize=True)

#removing unwanted characters

def remove_unwanted_char(a_string):
    """Strip mentions, URLs and non-alphanumeric characters from text.

    The argument is converted with ``str()`` first, so non-string values
    are accepted. The following are deleted (replaced by nothing):

    * ``@`` followed by letters/digits (a whole "@mention"),
    * any character that is not an ASCII letter, digit, space or tab,
    * ``scheme://...`` URLs up to the next whitespace,
    * a leading ``rt`` at the very start of the string,
    * ``http`` plus the single character that follows it.

    Returns:
        The cleaned string.
    """
    # The original pattern escaped the bracket ("@\[A-Za-z0-9]+"), which only
    # matched a literal "@[" prefix, so mentions were never removed as a unit.
    # NOTE(review): the trailing lazy "http.+?" consumes just one character
    # after "http"; left as-is because the intended behavior is unclear.
    a_string = re.sub(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?",
        "",
        str(a_string),
    )
    return a_string

# NOTE(review): the whole Series is passed in, so the function str()-converts
# it as a single value instead of cleaning each title, and the return value is
# discarded -- df is printed unmodified. Confirm whether a per-row apply with
# assignment was intended.
remove_unwanted_char(df['title'])
print(df)

# Remove all punctuation

def remove_punctuation(a_string):
    """Drop every character that is neither a word character nor whitespace.

    The argument is converted with ``str()`` first, so non-string values
    are accepted. Underscores count as word characters and are kept.
    """
    text = str(a_string)
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

# NOTE(review): the whole Series is passed (and str()-converted as one value
# inside the function) and the return value is discarded, so this call does
# not change df.
remove_punctuation(df['title'])

def make_lower(a_string):
    """Return a lower-cased copy of ``a_string``."""
    lowered = a_string.lower()
    return lowered

# Remove all stopwords

def remove_stopwords(a_string):
    """Tokenize ``a_string`` and drop every token found in ``stopwords``.

    The surviving tokens are re-joined with single spaces, so the original
    spacing is not preserved. The membership test is case-sensitive.
    """
    kept_tokens = [
        token for token in word_tokenize(a_string) if token not in stopwords
    ]
    return ' '.join(kept_tokens)
            
# NOTE(review): str(df['title']) is the string form of the whole Series rather
# than the individual titles, and the return value is discarded, so this call
# does not change df.
remove_stopwords(str(df['title']))

def text_pipeline(input_string):
    """Clean one document: lower-case, strip punctuation, drop stopwords.

    The steps run in that order; each receives the previous step's output.
    """
    # Lemmatization (lem_with_pos_tag) is currently disabled.
    for step in (make_lower, remove_punctuation, remove_stopwords):
        input_string = step(input_string)
    return input_string


# Run the cleaning pipeline over every article body. (The earlier plain copy
# of df['text'] into the column was immediately overwritten, so it is dropped.)
df['article_clean'] = df['text'].apply(text_pipeline)

# Show the raw column next to the cleaned one; previously both prints showed
# the cleaned column, so the "ORIGINAL TEXT" label was wrong.
print("ORIGINAL TEXT:", df['text'])
print("CLEANDED TEXT:", df['article_clean'])


# Features are the cleaned article bodies; targets are the labels.
X = df['article_clean'].values
y = df['label'].values

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Keep references to the raw-text splits before they are vectorized below.
X_train_text, X_test_text = X_train, X_test

# Initialize our vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training documents only.
vectorizer.fit(X_train)

# Persist the fitted vectorizer. The context manager closes the file handle,
# which the previous bare open() call left open.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Replace the raw-text splits with their vectorized form.
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

# NOTE(review): type(X) reports the un-vectorized array defined earlier;
# type(X_train) may have been intended -- confirm.
print(X_train.shape, type(X))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on old releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    title_clean = vectorizer.get_feature_names_out()
else:
    title_clean = vectorizer.get_feature_names()
# One IDF weight per vocabulary term, in the same order as the terms above.
label = vectorizer.idf_

print(len(title_clean), len(label))

# Table of vocabulary terms and their IDF weights. The column names reuse
# 'article_clean' / 'label' but hold terms and weights respectively.
df_idf = pd.DataFrame.from_dict( {'article_clean': title_clean, 'label': label})

# Highest IDF weight first
df_idf = df_idf.sort_values(by='label', ascending=False)

#Yussef
# Initialize and train the model. Here we are using the MultinomialNB model
# with smoothing parameter alpha=.05.

model = MultinomialNB(alpha=.05)

model.fit(X_train, y_train)

# Persist the trained model. The context manager closes the file handle,
# which the previous bare open() call left open.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yusse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yusse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yusse\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


(6335, 4)
Unnamed: 0    0
title         0
text          0
label         0
dtype: int64
0
      Unnamed: 0                                              title  \
0           8476                       You Can Smell Hillary’s Fear   
1          10294  Watch The Exact Moment Paul Ryan Committed Pol...   
2           3608        Kerry to go to Paris in gesture of sympathy   
3          10142  Bernie supporters on Twitter erupt in anger ag...   
4            875   The Battle of New York: Why This Primary Matters   
...          ...                                                ...   
6330        4490  State Department says it can't find emails fro...   
6331        8062  The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...   
6332        8622  Anti-Trump Protesters Are Tools of the Oligarc...   
6333        4021  In Ethiopia, Obama seeks progress on peace, se...   
6334        4330  Jeb Bush Is Suddenly Attacking Trump. Here's W...   

                                                   text la