# FAKE NEWS
### Dataset Description

**_train.csv_**: A full training dataset with the following attributes:

- **id**: unique id for a news article
- **title**: the title of a news article
- **author**: author of the news article
- **text**: the text of the article; could be incomplete
- **label**: a label that marks the article as potentially unreliable
    - 1: unreliable
    - 0: reliable

**_test.csv_**: A testing dataset with all the same attributes as train.csv, but without the label.

DATASET LINK: https://www.kaggle.com/c/fake-news/data

In [15]:
# IMPORTING LIBRARIES

import pandas as pd 
import tensorflow as tf
import nltk
import re 
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from nltk.corpus import stopwords

In [2]:
# Load the labelled training set from the current working directory.
train_csv_path = "train.csv"
df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

In [4]:
# Discard every row that has at least one missing value (the pandas defaults,
# spelled out). The surviving rows keep their original index labels.
df = df.dropna(axis=0, how="any")

In [5]:
# Dependent and Independent Features
# y is the target column; X is every remaining column.

y = df['label']
X = df.drop(columns=['label'])


In [None]:
# Count the rows per label value to check the class balance.
y.value_counts()

In [None]:
# (rows, columns) of the feature frame after dropping incomplete rows.
X.shape

In [None]:
# Number of labels; should match the row count of X.
y.shape

In [11]:
# Size of the hashing space for word indices; also passed to the Embedding
# layers as their input dimension.
vocabulary_size = 5_000

In [None]:
# One-hot Representation

# Work on a copy so the feature frame X is left untouched.
messages = X.copy()

# dropna() left gaps in the index, so renumber the rows 0..n-1 *before* any
# label-based lookup. Otherwise messages['title'][i] raises KeyError for every
# dropped row label (including 0, if the first row was incomplete).
# The old labels are kept in a new 'index' column, as before.
messages.reset_index(inplace=True)

# Preview the first title. It is the last expression of the cell so the
# notebook actually displays it.
messages['title'][0]

In [None]:
# Fetch the NLTK stop-word corpus used by the preprocessing step
# (stopwords.words('english')).
nltk.download('stopwords')

In [None]:
# Dataset Preprocessing
# Turn every title into a lower-cased, stop-word-free, stemmed string.

from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Build the stop-word lookup once. The original evaluated
# stopwords.words('english') for every single word and scanned the resulting
# list; a set built up front gives constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# Anything that is not an ASCII letter is replaced by a space.
non_letter = re.compile('[^a-zA-Z]')

corpus = []
# Iterate the column in row order instead of looking rows up by label, so the
# loop does not depend on the index being exactly 0..n-1. The per-row
# print(i) progress output was dropped: it emitted one line per article.
for title in messages['title']:
    words = non_letter.sub(' ', title).lower().split()
    stemmed = [ps.stem(word) for word in words if word not in english_stopwords]
    corpus.append(' '.join(stemmed))

In [None]:
# Display the cleaned, stemmed titles.
corpus

In [21]:
# Encode each cleaned title as a list of integer word indices drawn from a
# hashing space of size vocabulary_size.
one_hot_repr = [
    one_hot(sentence, vocabulary_size)
    for sentence in corpus
]

In [None]:
# Display the integer-encoded titles (one list of word indices per title).
one_hot_repr

In [None]:
# Embedding Representation

# Bring every encoded title to the same length (sentence_length) by padding
# at the front, so the sequences can be stacked into one 2-D array.
sentence_length = 20
embedded_docs = pad_sequences(
    one_hot_repr,
    maxlen=sentence_length,
    padding='pre',
)
print(embedded_docs)

In [None]:
# Inspect the first padded sequence.
embedded_docs[0]

In [None]:
# Model Creation: single-direction LSTM classifier
# NOTE(review): in the cells shown here this model is compiled but never
# fitted; only model1 is trained.
embedding_vector_features = 40  # length of each learned word vector

model = Sequential()
# Map each word index (drawn from vocabulary_size) to a dense vector.
model.add(Embedding(vocabulary_size, embedding_vector_features, input_length=sentence_length))
model.add(LSTM(100))
# One sigmoid unit: the output is a score in (0, 1) for the binary label.
model.add(Dense(1, activation='sigmoid'))
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
    )
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()


In [None]:
# Model Creation: bidirectional LSTM classifier (this is the model that gets
# trained and evaluated below)
embedding_vector_features = 40  # length of each learned word vector

model1 = Sequential()
# Map each word index (drawn from vocabulary_size) to a dense vector.
model1.add(Embedding(vocabulary_size, embedding_vector_features, input_length=sentence_length))
# Read each title in both directions.
model1.add(Bidirectional(LSTM(100)))
# One sigmoid unit: the output is a score in (0, 1) for the binary label.
model1.add(Dense(1, activation='sigmoid'))
model1.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
    )
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model1.summary()


In [None]:
# Sanity check: the number of padded sequences must equal the number of labels.
len(embedded_docs), y.shape

In [None]:
# train-test split
import numpy as np
from sklearn.model_selection import train_test_split

# Convert features and labels to plain NumPy arrays for Keras.
X_final = np.array(embedded_docs)
y_final = np.array(y)

# train_test_split returns the splits grouped per input array:
# (X_train, X_test, y_train, y_test). The original unpacked them as
# (X_train, y_train, X_test, y_test), which bound the test features to
# y_train and the training labels to X_test.
# 30% of the rows are held out; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.3, random_state=42
)


In [None]:
# Train the bidirectional model for 10 epochs in mini-batches of 64,
# evaluating on the held-out split after every epoch.
model1.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
)

In [33]:
# Performance Metrics and Accuracy

# Sequential.predict_classes() was removed in TensorFlow 2.6. For a model
# ending in a single sigmoid unit the equivalent is to threshold the predicted
# score at 0.5, which yields the same 0/1 int32 labels.
y_pred = (model1.predict(X_test) > 0.5).astype("int32")


In [None]:
# Per-class precision, recall and F1 on the held-out split.
# Fixed the misspelled module name ("skelarn"), which raised
# ModuleNotFoundError.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))