In [None]:
import requests
from bs4 import BeautifulSoup
import spacy
from collections import Counter


In [1]:
import nltk
from nltk.corpus import wordnet

# Use NLTK and the specific corpus

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
pip install flask

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [None]:
# BBC Dataset Load and cleaning

In [4]:
import pandas as pd
import re

# Load the dataset
df_bbc = pd.read_csv('bbc_news.csv')  

# Specify the name of the column want to remove
column_name_to_remove = 'guid'  

# Use the `drop()` method to remove the specified column
df_bbc = df_bbc.drop(column_name_to_remove, axis=1)

df_bbc['publisher'] = 'BBC'

# Check for missing values
missing_values = df_bbc.isnull().sum()

print(f"Number of Missing Values: {missing_values}")

# Check for NaN values in each column
for column in df_bbc.columns:
    if df_bbc[column].isna().any():
        print(f"Column '{column}' has NaN values.")
    else:
        print(f"Column '{column}' does not have any NaN values.")

# Check for duplicate rows based on all columns
duplicates = df_bbc.duplicated()

# Remove duplicates from the DataFrame
df_bbc.drop_duplicates(inplace=True)

# To display the duplicate rows (optional)
#duplicate_rows = df[duplicates]
#print("Duplicate Rows:")
#print(duplicate_rows)

# To count the number of duplicate rows
num_duplicates = duplicates.sum()
print(f"Number of Duplicate Rows: {num_duplicates}")

# Specify the column want to check for URLs
column_name = 'link' 

# Define a regular expression pattern to match URLs
url_pattern = r'https?://\S+|www\.\S+'

# Use the inverse of str.contains() to check for rows that do not contain URLs
df_bbc['does_not_contain_url'] = ~df_bbc[column_name].str.contains(url_pattern, case=False, flags=re.IGNORECASE)

# Calculate the sum of rows that do not contain URLs
sum_not_containing_url = df_bbc['does_not_contain_url'].sum()

# Display the sum
print(f"Sum of rows not containing URLs: {sum_not_containing_url}")

# Function to remove special characters
def remove_special_characters(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

# Preprocess the text column
df_bbc['title'] = df_bbc['title'].apply(remove_special_characters)
df_bbc['description'] = df_bbc['description'].apply(remove_special_characters)


# Specify the column for calculate the sum of unique values
column_name = 'link'  
column_name_two = 'description'  

# Calculate the sum of unique values in the specified column
unique_sum = df_bbc[column_name].nunique()
unique_sum_two = df_bbc[column_name_two].nunique()

# Display the sum of unique values
#print(f"Sum of unique values in '{column_name}': {unique_sum}")
#print(f"Sum of unique values in '{column_name_two}': {unique_sum_two}")

# Specify the column want to check for identical data
column_name = 'link'  

# Use the `duplicated()` method to create a Boolean Series indicating duplicate values in the column
duplicates = df_bbc[column_name].duplicated(keep=False)

# Filter the DataFrame to include only rows with identical data in the specified column
identical_data_rows = df_bbc[duplicates].sum()

# Display the rows with identical data in the specified column
#print("Rows with Identical Data in '{}' Column:".format(column_name))
#print(identical_data_rows)

df_bbc.head()


Number of Missing Values: title          0
pubDate        0
link           0
description    0
publisher      0
dtype: int64
Column 'title' does not have any NaN values.
Column 'pubDate' does not have any NaN values.
Column 'link' does not have any NaN values.
Column 'description' does not have any NaN values.
Column 'publisher' does not have any NaN values.
Number of Duplicate Rows: 0
Sum of rows not containing URLs: 0


Unnamed: 0,title,pubDate,link,description,publisher,does_not_contain_url
0,Ukraine Angry Zelensky vows to punish Russian ...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...,BBC,False
1,War in Ukraine Taking cover in a town under at...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-606418...,Jeremy Bowen was on the frontline in Irpin as ...,BBC,False
2,Ukraine war catastrophic for global food,"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941?a...,One of the worlds biggest fertiliser firms say...,BBC,False
3,Manchester Arena bombing Saffie Roussoss paren...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombings y...,BBC,False
4,Ukraine conflict Oil price soars to highest le...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...,BBC,False


In [None]:
# CNN News Dataset Load and cleaning

In [5]:
import pandas as pd
import re

# Load the dataset
df_cnn = pd.read_csv('CNN_news.csv')  

# Specify the names of the columns want to remove
columns_to_remove = ['Author', 'Index','Category', 'Section','Keywords','Second headline','Article text']  

# Use the `drop()` method to remove the specified columns
df_cnn = df_cnn.drop(columns=columns_to_remove)

# Create a dictionary to map the old column names to new column names
column_mapping = {
    'Headline': 'title',
    'Date published': 'pubDate',
    'Url': 'link',
    'Description': 'description',
}

# Rename the columns using the rename() function
df_cnn.rename(columns=column_mapping, inplace=True)

# Add a new column with the default value
df_cnn['publisher'] = 'CNN'

# Check for missing values
missing_values = df_cnn.isnull().sum()

print(f"Number of Missing Values: {missing_values}")

# Check for NaN values in each column
for column in df_cnn.columns:
    if df_cnn[column].isna().any():
        print(f"Column '{column}' has NaN values.")
    else:
        print(f"Column '{column}' does not have any NaN values.")
        
# Check for duplicate rows based on all columns
duplicates = df_cnn.duplicated()

# Remove duplicates from the DataFrame
df_cnn.drop_duplicates(inplace=True)

# To display the duplicate rows (optional)
#duplicate_rows = df[duplicates]
#print("Duplicate Rows:")
#print(duplicate_rows)

# To count the number of duplicate rows
num_duplicates = duplicates.sum()
print(f"Number of Duplicate Rows: {num_duplicates}")

# Specify the column want to check for URLs
column_name = 'link'  

# Define a regular expression pattern to match URLs
url_pattern = r'https?://\S+|www\.\S+'

# Use the inverse of str.contains() to check for rows that do not contain URLs
df_cnn['does_not_contain_url'] = ~df_cnn[column_name].str.contains(url_pattern, case=False, flags=re.IGNORECASE)

# Calculate the sum of rows that do not contain URLs
sum_not_containing_url = df_cnn['does_not_contain_url'].sum()

# Display the sum
print(f"Sum of rows not containing URLs: {sum_not_containing_url}")

# Function to remove special characters
def remove_special_characters(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

# Preprocess the text column
df_cnn['title'] = df_cnn['title'].apply(remove_special_characters)
df_cnn['description'] = df_cnn['description'].apply(remove_special_characters)

# Specify the column for which want to calculate the sum of unique values
column_name = 'link' 
column_name_two = 'title'

# Calculate the sum of unique values in the specified column
unique_sum = df_cnn[column_name].nunique()
unique_sum_two = df_cnn[column_name_two].nunique()

# Display the sum of unique values
#print(f"Sum of unique values in '{column_name}': {unique_sum}")
#print(f"Sum of unique values in '{column_name_two}': {unique_sum_two}")

# Specify the column want to check for identical data
column_name = 'title'

# Use the `duplicated()` method to create a Boolean Series indicating duplicate values in the column
duplicates = df_cnn[column_name].duplicated(keep=False)

# Remove duplicates from the DataFrame
df_cnn.drop_duplicates(inplace=True)

# Filter the DataFrame to include only rows with identical data in the specified column
identical_data_rows = df_cnn[duplicates].sum()

# Display the rows with identical data in the specified column
#print("Rows with Identical Data in '{}' Column:".format(column_name))
#print(identical_data_rows)

df_cnn.head()

Number of Missing Values: pubDate        0
link           0
title          0
description    0
publisher      0
dtype: int64
Column 'pubDate' does not have any NaN values.
Column 'link' does not have any NaN values.
Column 'title' does not have any NaN values.
Column 'description' does not have any NaN values.
Column 'publisher' does not have any NaN values.
Number of Duplicate Rows: 0
Sum of rows not containing URLs: 0


Unnamed: 0,pubDate,link,title,description,publisher,does_not_contain_url
0,2021-07-15 02:46:59,https://www.cnn.com/2021/07/14/world/tusimple-...,Theres a shortage of truckers but TuSimple thi...,The ecommerce boom has exacerbated a global tr...,CNN,False
1,2021-05-12 07:52:09,https://www.cnn.com/2021/05/12/world/ironhand-...,Bioservos robotic Ironhand could protect facto...,Working in a factory can mean doing the same t...,CNN,False
2,2021-06-16 02:51:30,https://www.cnn.com/2021/06/15/asia/swarm-robo...,This swarm of robots gets smarter the more it ...,In a Hong Kong warehouse a swarm of autonomous...,CNN,False
3,2022-03-18 14:37:21,https://www.cnn.com/2022/03/18/success/pandemi...,Two years later remote work has changed millio...,Heres a look at how the pandemic reshaped peop...,CNN,False
4,2022-03-19 11:41:08,https://www.cnn.com/2022/03/19/investing/march...,Why March is so volatile for stocks CNN,March Madness isnt just for college basketball...,CNN,False


In [None]:
# CNBC News Dataset Load and cleaning

In [6]:
import pandas as pd
import re

# Load the dataset
df_cnbc = pd.read_csv('cnbc_news.csv') 


# Specify the names of the columns want to remove
columns_to_remove = ['author', 'short_description','header_image', 'keywords','raw_description','scraped_at']

# Use the `drop()` method to remove the specified columns
df_cnbc = df_cnbc.drop(columns=columns_to_remove)

# Create a dictionary to map the old column names to new column names
column_mapping = {
    'published_at': 'pubDate',
    'url': 'link',
}

# Rename the columns using the rename() function
df_cnbc.rename(columns=column_mapping, inplace=True)

# Check for missing values
missing_values = df_cnbc.isnull().sum()

print(f"Number of Missing Values: {missing_values}")

# Check for NaN values in each column
for column in df_cnbc.columns:
    if df_cnbc[column].isna().any():
        print(f"Column '{column}' has NaN values.")
    else:
        print(f"Column '{column}' does not have any NaN values.")

# Replace NaN values with the corresponding title value
df_cnbc['description'] = df_cnbc['description'].fillna(df_cnbc['title'])

# Check for duplicate rows based on all columns
duplicates = df_cnbc.duplicated()

# Remove duplicates from the DataFrame
df_cnbc.drop_duplicates(inplace=True)

# To display the duplicate rows (optional)
#duplicate_rows = df[duplicates]
#print("Duplicate Rows:")
#print(duplicate_rows)

# To count the number of duplicate rows
num_duplicates = duplicates.sum()
print(f"Number of Duplicate Rows: {num_duplicates}")

# Specify the column want to check for URLs
column_name = 'link'

# Define a regular expression pattern to match URLs
url_pattern = r'https?://\S+|www\.\S+'

# Use the inverse of str.contains() to check for rows that do not contain URLs
df_cnbc['does_not_contain_url'] = ~df_cnbc[column_name].str.contains(url_pattern, case=False, flags=re.IGNORECASE)

# Calculate the sum of rows that do not contain URLs
sum_not_containing_url = df_cnbc['does_not_contain_url'].sum()

# Display the sum
print(f"Sum of rows not containing URLs: {sum_not_containing_url}")

# Function to remove special characters
def remove_special_characters(text):
    if isinstance(text, str):  # Check if the input is a string-like object
        return re.sub(r'[^a-zA-Z0-9\s]', '', text)
    else:
        return text


# Preprocess the text column
df_cnbc['title'] = df_cnbc['title'].apply(remove_special_characters)
df_cnbc['description'] = df_cnbc['description'].apply(remove_special_characters)

# Specify the column for which want to calculate the sum of unique values
column_name = 'link' 
column_name_two = 'title'

# Calculate the sum of unique values in the specified column
unique_sum = df_cnbc[column_name].nunique()
unique_sum_two = df_cnbc[column_name_two].nunique()

# Display the sum of unique values
print(f"Sum of unique values in '{column_name}': {unique_sum}")
print(f"Sum of unique values in '{column_name_two}': {unique_sum_two}")

# Specify the column want to check for identical data
column_name = 'link'

# Use the `duplicated()` method to create a Boolean Series indicating duplicate values in the column
duplicates = df_cnbc[column_name].duplicated(keep=False)

# Filter the DataFrame to include only rows with identical data in the specified column
identical_data_rows = df_cnbc[duplicates].sum()

# Remove duplicates from the DataFrame
df_cnbc.drop_duplicates(inplace=True)

# Display the rows with identical data in the specified column
print("Rows with Identical Data in '{}' Column:".format(column_name))
print(identical_data_rows)

df_cnbc.head()

Number of Missing Values: title           0
link            0
pubDate         0
publisher       0
description    32
dtype: int64
Column 'title' does not have any NaN values.
Column 'link' does not have any NaN values.
Column 'pubDate' does not have any NaN values.
Column 'publisher' does not have any NaN values.
Column 'description' has NaN values.
Number of Duplicate Rows: 0
Sum of rows not containing URLs: 0
Sum of unique values in 'link': 625
Sum of unique values in 'title': 625
Rows with Identical Data in 'link' Column:
title                   0.0
link                    0.0
pubDate                 0.0
publisher               0.0
description             0.0
does_not_contain_url    0.0
dtype: float64


Unnamed: 0,title,link,pubDate,publisher,description,does_not_contain_url
0,Santolis Wednesday market notes Could Septembe...,https://www.cnbc.com/2021/09/29/santolis-wedne...,2021-09-29T17:09:39+0000,CNBC,This is the daily notebook of Mike Santoli CNB...,False
1,My take on the early Brexit winners and losers,https://www.cnbc.com/2016/06/24/ian-bremmers-t...,2016-06-24T13:50:48-0400,CNBC,My take on the early Brexit winners and losers,False
2,Europe039s recovery depends on Renzi039s Italy,https://www.cnbc.com/2014/03/25/europes-recove...,2014-03-25T13:29:45-0400,CNBC,Europe039s recovery depends on Renzi039s Italy,False
3,US Moves Closer to Becoming A Major Shareholde...,https://www.cnbc.com/2009/04/22/us-moves-close...,2009-04-22T19:49:03+0000,CNBC,The US government is increasingly likely to co...,False
4,Trump Mission accomplished on perfectly execut...,https://www.cnbc.com/2018/04/14/trump-mission-...,2018-04-14T14:59:04+0000,CNBC,President Donald Trump hailed the USled interv...,False


In [7]:
import pandas as pd

# Concatenate the three DataFrames along the rows (simple appending)
merged_df = pd.concat([df_bbc, df_cnn, df_cnbc], ignore_index=True)

# Save the merged DataFrame to a new CSV file if needed
merged_df.to_csv('merged_file.csv', index=False)

merged_df.head()
# Get the total number of rows in the DataFrame
total_rows = merged_df.shape[0]

# Print the total number of rows
print("Total Rows:", total_rows)

Total Rows: 61582


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from fuzzywuzzy import fuzz
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql

import time
from flask import Flask, render_template, request, g
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
app = Flask(__name__)

@app.before_request
def before_request():
    g.request_start_time = time.time()
    g.request_time = lambda: "%.5fs" % (time.time() - g.request_start_time)

# Function to check if the input contains a link
def contains_link(text):
    # Define a regular expression pattern to match URLs
    url_pattern = r'https?://\S+|www\.\S+'
    
    # Use re.search to find a match in the input
    match = re.search(url_pattern, text)
    
    return match is not None

# Function to parse the title from Link
def parse_title(url):
    html_page = urlopen(url)
    soup = BeautifulSoup(html_page, 'html.parser')
    title = soup.title.string
    return title

# Create a set of stop words 
stop_words = set(stopwords.words('english')) 

# Define a function to remove stop words from a sentence 
def remove_stop_words(sentence): 
  # Split the sentence into individual words 
  words = sentence.split() 
  
  # Use a list comprehension to remove stop words 
  filtered_words = [word for word in words if word not in stop_words] 
  
  # Join the filtered words back into a sentence 
  return ' '.join(filtered_words)

# Function to calculate string match accuracy
def calculate_match_accuracy(reference_string, test_string):
    return fuzz.ratio(reference_string, test_string)

# Function to perform K-Means clustering on a set of texts
def cluster_texts(texts, num_clusters):
    try:
        # Create TF-IDF vectors for the texts
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(texts)
        # Perform K-Means clustering
        #kmeans = KMeans(n_clusters=num_clusters, random_state=42)
        
        # Perform Mini-Batch K-Means clustering
        kmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=42, batch_size=1000, max_no_improvement=10, reassignment_ratio=0.01)
        
        kmeans.fit(tfidf_matrix)

        # Get cluster assignments for each text
        cluster_assignments = kmeans.labels_

        return cluster_assignments

    except Exception as e:
        print(f"Error processing texts: {e}")
        return None

# Function to check user input text match accuracy against CSV file
def check_user_input_accuracy(user_input, csv_filename, num_clusters):
    try:
        # Read the CSV file into a DataFrame
        df = pd.read_csv(csv_filename)

        # Extract the text column from the CSV
        csv_texts = df['title'].tolist()
        
        # Cluster both the user input and CSV texts
        user_input_cluster_assignments = cluster_texts([user_input], num_clusters)
        csv_cluster_assignments = cluster_texts(csv_texts, num_clusters)

        # Determine the cluster of the user input text
        user_input_cluster = user_input_cluster_assignments[0]

        # Find the CSV texts that belong to the same cluster as the user input
        matching_csv_texts = [csv_texts[i] for i, cluster in enumerate(csv_cluster_assignments) if cluster == user_input_cluster]

        # Calculate accuracy by comparing the user input with each matching CSV text
        accuracies = [calculate_match_accuracy(user_input, csv_text) for csv_text in matching_csv_texts]

        # Determine the highest accuracy among the matching texts
        max_accuracy = max(accuracies)

        return max_accuracy

    except Exception as e:
        print(f"Error processing user input or CSV file: {e}")
        return None
    
# Function for inserting user fake news check details in to database
def data_insert_db(email,news_data,status):
    # database connection
    connection = pymysql.connect(host="localhost", port=3306, user="root", passwd="", database="fake_news_data")

    cursor = connection.cursor()    

    # Insert data into the table
    insert_query = "INSERT INTO news_articles_data (email, news_data, status) VALUES (%s, %s, %s)"
    cursor.execute(insert_query, (email, news_data, status))

    # Commit the changes to the database
    connection.commit()

    # Execute SQL queries
    cursor.execute("SELECT * FROM news_articles_data")

    # Fetch all rows from the result set
    result = cursor.fetchall()

    # Print the results
    for row in result:
        print(row)

    # Close the cursor and the connection when done
    cursor.close()
    connection.close()
@app.route("/")
def index():
    return render_template('index.html')
@app.route('/check_news', methods=['POST'])
def check_news():
    if request.method == 'POST':
        user_input = request.form['news_text']
        email = request.form['email']
        csv_filename = "merged_file.csv"
         # Check if the user input contains a link
        if contains_link(user_input):
            user_input = parse_title(user_input)
            print("User input contains a link.")
        else:
            print("User input is plain text.")

        filtered_sentence = remove_stop_words(user_input) 
        num_clusters = min(1, len(user_input) + 1)   # Adjust the number of clusters as needed

        accuracy = check_user_input_accuracy(user_input, csv_filename, num_clusters)
        #status = 0
        #if accuracy is not None:
          #  print(f"User Input Match Accuracy: {accuracy}%")
        
        if accuracy>75:
            status = 1
        else:
            status = 0
        data_insert_db(email,user_input,status)
    if accuracy is not None:
       # print(f"Match Accuracy in '{csv_filename}':")
        return render_template('result.html', news_text=user_input, result=accuracy)

if __name__ == "__main__":
    app.run()






 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [22/Oct/2023 14:32:24] "GET / HTTP/1.1" 200 -


User input is plain text.


127.0.0.1 - - [22/Oct/2023 14:33:17] "POST /check_news HTTP/1.1" 200 -


(1, 'c1055998@my.shu.ac.uk', 'hijn', '0', datetime.datetime(2023, 10, 10, 18, 13, 31))
(2, '10anu336@gmail.com', "Ukraine war: PM calls for 'step-by-step' move from Russian fuel - BBC News", '0', datetime.datetime(2023, 10, 12, 18, 25, 18))
(3, 'testemail@email.com', "Covid: Fourth jab for Scotland's vulnerable, and testing wind down fears in Wales - BBC News", '0', datetime.datetime(2023, 10, 12, 21, 7, 20))
(4, 'anupankriyas@gg.in', "Ukraine invasion: Volunteers 'working on autopilot' - BBC News", '0', datetime.datetime(2023, 10, 12, 21, 23, 38))
(5, 'ggg@we.in', "Ukraine invasion: Volunteers 'working on autopilot' - BBC News", '1', datetime.datetime(2023, 10, 12, 21, 41, 2))
(6, 'testemail@email.cok', 'Ukraine conflict: Petrol at fresh record as oil and gas prices soar - BBC News', '1', datetime.datetime(2023, 10, 16, 17, 19, 55))
(7, 'testemailone@email.com', 'Twitter is part of our war effort - Ukraine minister', '1', datetime.datetime(2023, 10, 22, 14, 29, 20))
(8, 'checkemail@te

In [None]:
def shutdown_server():
    func = request.environ.get('werkzeug.server.shutdown')
    if func is None:
        raise RuntimeError('Not running with the Werkzeug Server')
    func()
    
@app.get('/shutdown')
def shutdown():
    shutdown_server()
    return 'Server shutting down...'