In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

In [3]:
# core system imports
import os

import pandas as pd # Managing dataframe
import numpy as np # Handling mathematical operations on array
import pickle # library for saving the state of your dataframe
from collections import Counter

from keras import optimizers
from keras.regularizers import l2
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import (Sequential, load_model)

from keras.layers import (
    Dense,
    Dropout,
    LSTM,
    Conv1D, 
    MaxPooling1D, 
    Flatten,
    TimeDistributed
)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from DataPrep.Clean_Texts import clean_text


ModuleNotFoundError: No module named 'keras'

In [None]:
# Load the dataset from its CSV file into a DataFrame, then preview it.
data = pd.read_csv(filepath_or_buffer='./data/data1.csv')
# Last expression of the cell: rendered as the first five rows.
data.head(n=5)

In [None]:
# Label column read by the class-count plot and the per-label word counts.
# FIXME(review): this used to be three consecutive whole-column assignments
# ('Positive', then 'Negative', then 'Neutral'). Each one overwrote the previous,
# so only the last survived and every row is labelled 'Neutral'. The two dead
# stores are removed here (the resulting column is unchanged), but real per-row
# labels still have to be supplied -- e.g. from a column of the CSV or by
# tagging one source frame per class before concatenating -- otherwise there is
# only a single class to analyse.
data['target'] = 'Neutral'

#Now lets concatenate the data frames:
#data = pd.concat([fake, true]).reset_index(drop = True)

#We will shuffle the rows so that the order of the CSV cannot bias later steps.
#A fixed random_state makes the shuffled order identical on every run; without
#it each "Restart & Run All" produced a different row order.
from sklearn.utils import shuffle
data = shuffle(data, random_state=42).reset_index(drop=True)

#Data cleansing
#Removing the date (we won't use it for the analysis):
#data.drop(["date"],axis=1,inplace=True)

#Removing the title (we will only use the text):
#data.drop(["title"],axis=1,inplace=True)

#Convert the text to lowercase.
#Missing values become '' and every cell is cast to str first, so a NaN or
#numeric cell no longer raises AttributeError (the per-row lambda called
#.lower() on whatever object the cell held).
data['text'] = data['text'].fillna('').astype(str).str.lower()

#Remove punctuation:
import string
def punctuation_removal(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    All other characters (letters, digits, whitespace, non-ASCII symbols) are
    kept in their original order.
    """
    kept = filter(lambda ch: ch not in string.punctuation, text)
    return ''.join(kept)
# Strip punctuation from every document, element by element.
data['text'] = data['text'].map(punctuation_removal)
from matplotlib import pyplot as plt
'''
#Remove stopwords:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))


#Data Exploration
#How many articles per subject?
#%matplotlib inline
from matplotlib import pyplot as plt
print(data.groupby(['subject'])['text'].count())
data.groupby(['subject'])['text'].count().plot(kind="bar")
plt.show()
'''
#How many rows are there for each target label?
# Count the texts per label once, then reuse the result for the printout and the chart.
target_counts = data.groupby(['target'])['text'].count()
print(target_counts)
target_counts.plot(kind="bar")
plt.show()
'''
#Word Cloud for fake news:
from wordcloud import WordCloud
fake_data = data[data["target"] == "fake"]
all_words = ' '.join([text for text in fake_data.text])
wordcloud = WordCloud(width= 800, height= 500,
                          max_font_size = 110,
                          collocations = False).generate(all_words)
plt.figure(figsize=(10,7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

#Word Cloud for real news:
real_data = data[data["target"] == "true"]
all_words = ' '.join([text for text in fake_data.text])
wordcloud = WordCloud(width= 800, height= 500, max_font_size = 110,
 collocations = False).generate(all_words)
plt.figure(figsize=(10,7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
'''
#Most frequent words function:
# Most frequent words counter (Code adapted from https://www.kaggle.com/rodolfoluna/fake-news-detector)   
import nltk
import seaborn as sns
from nltk import tokenize
# Module-level tokenizer instance; counter() below calls token_space.tokenize() on the joined corpus.
token_space = tokenize.WhitespaceTokenizer()
def counter(text, column_text, quantity):
    """Plot a bar chart of the most frequent tokens in one text column.

    Parameters
    ----------
    text : DataFrame-like object indexable by ``column_text``.
    column_text : name of the column whose string values are analysed.
    quantity : number of top tokens to keep (passed to ``nlargest`` as ``n``).

    Returns
    -------
    None. The chart is shown with ``plt.show()``. When no tokens are found, a
    message is printed and nothing is plotted.
    """
    # Join every document into one corpus string. Iterating the column directly
    # avoids the old comprehension whose loop variable shadowed the `text`
    # parameter.
    all_words = ' '.join(text[column_text])
    token_phrase = token_space.tokenize(all_words)
    frequency = nltk.FreqDist(token_phrase)

    # An empty selection would build a frame with object-dtype columns, on which
    # nlargest() raises TypeError -- report and stop instead of crashing.
    if not frequency:
        print("counter: no tokens found in column '{}'; nothing to plot".format(column_text))
        return

    df_frequency = pd.DataFrame({"Word": list(frequency.keys()),
                                 "Frequency": list(frequency.values())})
    df_frequency = df_frequency.nlargest(columns = "Frequency", n = quantity)
    plt.figure(figsize=(12,8))
    ax = sns.barplot(data = df_frequency, x = "Word", y = "Frequency", color = 'blue')
    ax.set(ylabel = "Count")
    plt.xticks(rotation='vertical')
    plt.show()
        
#Most frequent words for every label that actually occurs in the data.
#The previous call filtered on target == "data1" -- that is the CSV file name,
#not a label value -- so it always selected zero rows. Looping over the labels
#present in the column keeps this cell valid whatever the label set is.
for label in data["target"].dropna().unique():
    print("Most frequent words for target = {}".format(label))
    counter(data[data["target"] == label], "text", 20)
'''
"""
Modeling
The modeling process will consist of vectorizing the corpus stored in the "text" column, then applying TF-IDF, and finally a classification machine learning algorithm. Pretty standard in text analytics and NLP.
For modeling, we have this function to plot the confusion matrix of the models"""
# Function to plot the confusion matrix (code from https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html)
import itertools
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix on the current matplotlib figure.

    Parameters
    ----------
    cm : NumPy-style 2-D array; rows are drawn on the 'True label' axis and
        columns on the 'Predicted label' axis.
    classes : sequence of tick labels, one per row/column.
    normalize : when True, each row is divided by its row sum before the
        matrix is printed, drawn and annotated.
    title : title of the plot.
    cmap : matplotlib colormap used for the cells.

    Returns
    -------
    None. The caller is responsible for ``plt.show()``.
    """
    # Normalise BEFORE drawing: previously imshow() received the raw counts and
    # only the printed matrix and annotations were normalised, so cell colours
    # and the numbers written on them disagreed.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are written as-is; fractions are rounded to two decimals
    # instead of being printed at full float precision.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

#Split the data:
X_train,X_test,y_train,y_test = train_test_split(data['text'], data.target, test_size=0.2, random_state=42)


# Vectorizing and applying TF-IDF
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


def fit_and_report(name, estimator):
    """Fit a CountVectorizer -> TfidfTransformer -> ``estimator`` pipeline and report on the test split.

    Uses the module-level X_train / X_test / y_train / y_test. Prints the model
    name and its accuracy, then draws the confusion matrix on a new figure.

    Returns
    -------
    (pipe, model, prediction, cm) : the pipeline, the fitted pipeline returned
        by ``fit``, the test predictions and the confusion matrix.
    """
    pipe = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('model', estimator)])
    # Fitting the model
    model = pipe.fit(X_train, y_train)
    # Accuracy
    prediction = model.predict(X_test)
    print(" ")
    print(name)
    print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
    print(" ")
    print(" ")
    # Take the tick labels from the fitted model instead of the hard-coded
    # ['Fake', 'Real'], and pass the same labels to confusion_matrix so the
    # rows/columns and the ticks are guaranteed to line up.
    labels = model.classes_
    cm = metrics.confusion_matrix(y_test, prediction, labels=labels)
    # One figure per model; previously all three matrices were drawn onto the
    # same figure, on top of each other.
    plt.figure()
    plot_confusion_matrix(cm, classes=labels)
    plt.show()
    return pipe, model, prediction, cm


# The three classifiers share the same pipeline and report; only the final
# estimator differs. The random forest is now seeded like the decision tree so
# its score is reproducible.
for name, estimator in [
        ("Logistic Regression", LogisticRegression()),
        ("Decision Tree", DecisionTreeClassifier(criterion= 'entropy',
                                                 max_depth = 20,
                                                 splitter='best',
                                                 random_state=42)),
        ("Random Forest", RandomForestClassifier(n_estimators=50,
                                                 criterion="entropy",
                                                 random_state=42)),
]:
    # Keep the module-level names of the original script; after the loop they
    # refer to the last model (random forest), exactly as before.
    pipe, model, prediction, cm = fit_and_report(name, estimator)
'''















