> This notebook is trying to use [A Million News Headlines](https://www.kaggle.com/therohk/million-headlines) dataset to implement a fake news headlines detection model using machine learning approach. Where the A Million News Headlines dataset will be labeled as real news headlines. And this notebook will also use two fake news headline datasets on Kaggle from [Fake and real news](https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset) and [Getting Real about Fake News](https://www.kaggle.com/mrisdal/fake-news) . 



In [4]:
import numpy as np 
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# **Load Datasets**

In [5]:
#Read Dataset
Headlines = pd.read_csv('dataset/abcnews-date-text.csv', usecols =["headline_text"]).dropna()
Headlines1 = pd.read_csv('dataset/fake.csv', usecols =["title"]).dropna()
# Headlines2 = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv', usecols =["title"]).dropna()

In [6]:
#Removing duplicated headlines
Headlines = Headlines.drop_duplicates('headline_text')
Headlines1 = Headlines1.drop_duplicates('title')
# Headlines2 = Headlines2.drop_duplicates('title')

In [7]:
#Rename dataframe columns and Combine all datasets
Headlines1 = Headlines1.rename(columns={'title': 'headline_text'})
# Headlines2 = Headlines2.rename(columns={'title': 'headline_text'})

# **Labelling**

In [8]:
#Creating lable for datasets
#million-headlines dataset will be used as real headlines
#fake-and-real-news-dataset & fake-news dataset will be used as fake headlines
Headlines['fake'] = 0
Headlines1['fake'] = 1
# Headlines2['fake'] = 1

# **Combine Datasets**

In [23]:
#Downsize million-headlines dataset to first 50K rows
data = pd.concat([Headlines[:50000],Headlines1])
print('Training dataset contains: {} Real headlines and {} Fake headlines.'.format(50000,len(Headlines1)))
data.to_csv('Combined_headlines.csv', index=False)

Training dataset contains: 50000 Real headlines and 11698 Fake headlines.


# **Data Preprocessing**

In [10]:
import gensim
import nltk as nl
nl.download("stopwords", download_dir='./nltk_data')
nl.download("punkt", download_dir='./nltk_data')
from sklearn.feature_extraction import text


nltk_stopwords = nl.corpus.stopwords.words('english')
gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
sklearn_stopwords = text.ENGLISH_STOP_WORDS
combined_stopwords = sklearn_stopwords.union(nltk_stopwords,gensim_stopwords)

[nltk_data] Downloading package stopwords to ./nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to ./nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
print('NLTK has {} stop words'.format(len(nltk_stopwords)))
print('Gensim has {} stop words'.format(len(gensim_stopwords)))
print('Sklearn has {} stop words'.format(len(sklearn_stopwords)))
print('Combined stopwords list has {} stop words'.format(len(combined_stopwords)))

NLTK has 179 stop words
Gensim has 337 stop words
Sklearn has 318 stop words
Combined stopwords list has 390 stop words


In [12]:
from nltk.stem import PorterStemmer 
porter_stemmer = PorterStemmer() 

In [13]:
data['headline_text'] = data['headline_text'].apply(lambda x: x.lower())
data['headline_text'] = data['headline_text'].apply(lambda x: ' '.join([word for word in x.split() if word.isalpha()]))
data['headline_text'] = data['headline_text'].apply(lambda x: ' '.join([porter_stemmer.stem(word) for word in x.split()]))
data['headline_text'] = data['headline_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (combined_stopwords)]))

# **Splitting Dataset**

In [14]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(data['headline_text'], data['fake'], test_size=0.2, random_state=7)

# **Construct models with TF-IDF**

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Conv1D, MaxPooling1D, Flatten, Embedding, GlobalMaxPooling1D
# from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [16]:
tfidf_vectorizer = TfidfVectorizer(tokenizer = word_tokenize, max_features = 300)
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)
tfidf_features = tfidf_vectorizer.get_feature_names_out()



** RandomForest Classifier- Randomized Search**

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the hyperparameter distributions
param_dist = {
    'n_estimators': randint(10, 100),
    'max_depth': randint(1, 10),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': ['auto', 'sqrt', 'log2']
}

# Initialize the Random Forest model
rf = RandomForestClassifier()

# Set up RandomizedSearchCV
rand_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50, # Number of parameter settings that are sampled
    cv=5, # Number of cross-validation folds
    verbose=2, # Controls the verbosity: the higher, the more messages
    random_state=42, # For reproducibility
    n_jobs=-1 # Use all processors
)

# Fit the model to the data
rand_search.fit(tfidf_train, y_train)

# Access the best model and its hyperparameters
best_rf = rand_search.best_estimator_
print('Best hyperparameters:', rand_search.best_params_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits


90 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
17 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Oscar Yu\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Oscar Yu\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\Oscar Yu\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Oscar Yu\AppData\Local\Programs\Python\Pyth

Best hyperparameters: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 56}


** Logistic Regression using Random Sampling-Balance Training Set**

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

class_0_indices = np.where(y_train == 0)[0]
additional_class_0_samples = np.random.choice(class_0_indices, size=1000, replace=False)
balanced_train_x = np.concatenate((tfidf_train, tfidf_train[additional_class_0_samples]))
balanced_train_y = np.concatenate((y_train, y_train[additional_class_0_samples]))

# Create a logistic regression model with balanced class weights
log_clf_balanced = LogisticRegression(penalty='none', class_weight='balanced', solver='saga', random_state=42)
log_clf_balanced.fit(balanced_train_x, balanced_train_y)

# Perform cross-validation
scores = cross_val_score(log_clf_balanced, balanced_train_x, balanced_train_y, cv=5)

print("Cross-validated scores:", scores)
print("Mean cross-validated score:", scores.mean())


ValueError: zero-dimensional arrays cannot be concatenated

In [22]:
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
svc = SVC(kernel='linear')
knn = KNeighborsClassifier()
nb = MultinomialNB()

dt.fit(tfidf_train, y_train)
rf.fit(tfidf_train, y_train)
svc.fit(tfidf_train, y_train)
knn.fit(tfidf_train, y_train)
nb.fit(tfidf_train, y_train)

In [27]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
log_clf_lasso = LogisticRegression(C = 0.1, class_weight= 'balanced', penalty= 'l1', solver= 'liblinear',random_state=42)
log_clf_lasso.fit(tfidf_train,y_train)
crossvalscore(log_clf_lasso)

In [None]:
print ("Testing Acc. of Decision Tree: {} %".format(round(dt.score(tfidf_test, y_test) * 100, 2)))
print ("Testing Acc. of Random Forest: {} %".format(round(rf.score(tfidf_test, y_test) * 100, 2)))
print ("Testing Acc. of SVC: {} %".format(round(svc.score(tfidf_test, y_test) * 100, 2)))
print ("Testing Acc. of K-NN: {} %".format(round(knn.score(tfidf_test, y_test) * 100, 2)))
print ("Testing Acc. of Naive Bayesian: {} %".format(round(nb.score(tfidf_test, y_test) * 100, 2)))

Testing Acc. of Decision Tree: 87.32 %
Testing Acc. of Random Forest: 88.13 %
Testing Acc. of SVC: 87.63 %
Testing Acc. of K-NN: 80.23 %
Testing Acc. of Naive Bayesian: 88.03 %


In [None]:
tfidf_train = tfidf_train.todense()
tfidf_test = tfidf_test.todense()

neural_network = Sequential()
neural_network.add(Dense(64, input_dim=len(tfidf_features), activation='relu'))
neural_network.add(Dropout(0.1))
neural_network.add(Dense(64, activation='relu'))
neural_network.add(Dropout(0.1))
neural_network.add(Dense(1, activation='sigmoid'))
neural_network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = neural_network.fit(tfidf_train, y_train, epochs=50, batch_size=512, verbose=0)
_,test_acc = neural_network.evaluate(tfidf_test,y_test,verbose=0)
print ("Testing Acc. of DNN: {} %".format(round(test_acc * 100, 2)))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Testing Acc. of DNN: 88.2 %
