### Installing and importing required libraries

In [None]:
!pip install mysql-connector-python
!pip install feedparser
!pip install sqlalchemy

In [None]:
import os
import re
from datetime import datetime
from datetime import date
from email.utils import parsedate_to_datetime
from urllib.parse import quote_plus

import feedparser
from bs4 import BeautifulSoup
import requests
import pandas as pd
import mysql.connector
from sqlalchemy import create_engine, Column, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

### Defining Functions

In [None]:
# Keyword lists that drive the keyword-based classifier, one list per category.
# Key order matters: classify_text picks the winner with max(), which returns
# the first category in iteration order when two categories tie.
# Some keywords are intentionally or accidentally shared between lists
# ("military" and "education" each appear in two categories).
categories = {
    "Terrorism / Protest / Political Unrest / Riot": [
        "terrorism", "bomb", "attack", "insurgent", "militant",
        "extremist", "conflict", "violence", "demonstration", "protest",
        "riot", "revolution", "armed", "security", "strike",
        "uprising", "suppression", "police", "martial law", "hostage",
        "shooting", "arrest", "resistance", "clash", "military",
        "coup", "radical", "extremism", "war", "civil unrest",
    ],
    "Positive / Uplifting": [
        "achievement", "award", "breakthrough", "celebration", "charity",
        "donation", "education", "empowerment", "environment", "festival",
        "happiness", "hero", "hope", "humanity", "innovation",
        "inspiration", "joy", "kindness", "life-saving", "love",
        "milestone", "progress", "recovery", "success", "support",
        "sustainability", "triumph", "volunteer", "well-being", "win",
    ],
    "Natural Disasters": [
        "earthquake", "flood", "tsunami", "hurricane", "cyclone",
        "tornado", "avalanche", "wildfire", "landslide", "volcano",
        "eruption", "drought", "typhoon", "disaster", "storm",
        "monsoon", "mudslide", "tremor", "blizzard", "catastrophe",
        "evacuation", "relief", "rescue", "damage", "aftershock",
        "fatalities", "rainfall", "windstorm", "natural hazard", "heatwave",
    ],
    "Others": [
        "business", "technology", "sports", "entertainment", "health",
        "education", "politics", "economy", "finance", "travel",
        "lifestyle", "fashion", "culture", "science", "history",
        "space", "agriculture", "medicine", "religion", "art",
        "music", "law", "crime", "cybersecurity", "tourism",
        "transportation", "trade", "social media", "diplomacy", "military",
    ],
}

# Function to clean and tokenize text
def preprocess_text(text):
    """Split ``text`` into lowercase tokens made of ASCII letters only.

    Every run of non-letter, non-whitespace characters is treated as a word
    boundary. Words joined only by punctuation ("flood/landslide",
    "attack,police") therefore come out as separate tokens instead of being
    fused into a single token that can never match a keyword. As a
    consequence, hyphenated and apostrophised words are split as well
    ("life-saving" -> "life", "saving").

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase tokens; empty when ``text`` contains no letters.
    """
    # Substitute a space (not the empty string) so punctuation keeps
    # separating the words on either side of it.
    text = re.sub(r'[^A-Za-z\s]+', ' ', text)
    return text.lower().split()

# Function to classify text based on keyword matches
def classify_text(text):
    """Assign ``text`` to the category whose keywords occur in it most often.

    Both the text and every keyword are tokenized with ``preprocess_text``,
    so keywords are compared in the same normalized form as the text. A
    keyword that normalizes to one token is counted once per matching token;
    a keyword that normalizes to several tokens (e.g. "martial law") is
    counted once per occurrence of that exact consecutive token run.

    Args:
        text: The text to classify.

    Returns:
        A key of the module-level ``categories`` dict. When several
        categories share the highest count, the one listed first in
        ``categories`` wins; when nothing matches at all, ``'Others'``.
    """
    tokens = preprocess_text(text)
    keyword_count = {}

    for category, keywords in categories.items():
        single_words = set()
        phrases = []
        for keyword in keywords:
            # Normalize the keyword exactly like the text; comparing the raw
            # keyword against normalized tokens is what made phrase and
            # hyphenated keywords unmatchable.
            keyword_tokens = preprocess_text(keyword)
            if len(keyword_tokens) == 1:
                single_words.add(keyword_tokens[0])
            elif keyword_tokens:
                phrases.append(keyword_tokens)

        # One hit per token that is a single-word keyword.
        matches = sum(1 for token in tokens if token in single_words)

        # One hit per position where a multi-token keyword appears verbatim.
        for phrase in phrases:
            width = len(phrase)
            matches += sum(
                1
                for start in range(len(tokens) - width + 1)
                if tokens[start:start + width] == phrase
            )

        keyword_count[category] = matches

    # max() keeps the first category on ties, i.e. dict order breaks ties.
    best_category = max(keyword_count, key=keyword_count.get)

    # If no keywords match, classify as 'Others'
    if keyword_count[best_category] == 0:
        best_category = 'Others'

    return best_category


In [None]:
def transform_img_tags(content):
    """Strip markup from ``content`` when it contains an ``<img class`` tag.

    Content without the literal substring ``<img class`` is returned
    untouched. Otherwise it is parsed as HTML and reduced to its text, with
    the text fragments joined by single spaces.
    """
    if "<img class" not in content:
        return content
    parsed = BeautifulSoup(content, "html.parser")
    return parsed.get_text(separator=' ')

In [None]:
def format_date(date_string):
    """Convert an RFC 822/2822 style timestamp to ``DD-MM-YYYY``.

    Parsing is delegated to ``email.utils.parsedate_to_datetime``, which
    accepts numeric offsets ("+0530") as well as named zones ("GMT", "UT",
    "EST", ...) and does not depend on the process locale for day and month
    names. The date is reported as written in the string; no time-zone
    conversion is applied.

    Args:
        date_string: The timestamp text, e.g.
            "Mon, 01 Jan 2024 10:00:00 GMT".

    Returns:
        The formatted date string, or ``date_string`` unchanged when it is
        not a string or cannot be parsed (e.g. the "Not Available"
        placeholder).
    """
    if not isinstance(date_string, str):
        return date_string
    try:
        date_object = parsedate_to_datetime(date_string)
    except (TypeError, ValueError):
        # Unparseable input raises ValueError on current Python versions and
        # TypeError on older ones; either way keep the original value.
        return date_string
    return date_object.strftime('%d-%m-%Y')

### Extracting Data

In [None]:
# Feeds to harvest; each entry of each feed becomes one row of `df`.
RSS_Feeds = [
    "http://rss.cnn.com/rss/cnn_topstories.rss",
    "http://qz.com/feed",
    "http://feeds.foxnews.com/foxnews/politics",
    "http://feeds.reuters.com/reuters/businessNews",
    "http://feeds.feedburner.com/NewshourWorld",
    "https://feeds.bbci.co.uk/news/world/asia/india/rss.xml",
]

today = date.today()
# Every row of this run gets the same ingestion date, so format it once.
date_added = today.strftime("%d-%m-%Y")

data = []
for RSS_Feed in RSS_Feeds:
    feed = feedparser.parse(RSS_Feed)
    for entry in feed.entries:
        # Use .get() with a placeholder for every field: attribute access on
        # an entry raises AttributeError when the feed item lacks the field,
        # which would abort the whole harvest because of one incomplete item.
        title = entry.get('title', "Not Available")
        published = entry.get('published', "Not Available")
        link = entry.get('link', "Not Available")

        if 'content' in entry:
            # Full content is HTML; keep only its text.
            raw_content = entry.content[0].value
            soup = BeautifulSoup(raw_content, "html.parser")
            content = soup.get_text().strip()
        else:
            content = entry.get('summary', "Not Available")

        # Normalize non-breaking spaces and drop line breaks.
        # NOTE(review): a former `.replace("\'s", "'s")` step was removed
        # because both literals are the same string, so it never changed
        # anything; if escaped apostrophes were meant, that needs its own fix.
        content = content.replace("\xa0", ' ').replace("\n", '')

        # Category is filled in later by the transformation step.
        data.append([title, content, published, date_added, link, RSS_Feed, ""])

df = pd.DataFrame(data, columns=['Title','Content','Published_date','Date_Added','Source_URL','Rss_Feed','Catogory'])


### Data Transformation

In [None]:
# Removing duplicate entries. The index is renumbered afterwards so it is a
# gap-free 0..n-1 range again (drop_duplicates keeps the original labels).
df = df.drop_duplicates().reset_index(drop=True)

# Formatting the published date
df['Published_date'] = df['Published_date'].apply(format_date)

# Cleaning and formatting content text
df['Content'] = df['Content'].apply(transform_img_tags)

# Categorizing news articles from title + content.
# Done column-wise instead of looping over range(len(df)): label lookups such
# as df['Title'][i] raise KeyError for labels removed by drop_duplicates and
# never reach rows whose label is >= len(df).
df['Catogory'] = (df['Title'] + ' ' + df['Content']).apply(classify_text)

# Replacing blanks
df = df.fillna("Not Available")

### Connect to Database

In [None]:
# Connection settings are read from the environment so real credentials do
# not have to be committed with the notebook. The fallbacks reproduce the
# previous local-development values, so an unconfigured local setup connects
# exactly as before.
db_user = os.environ.get('NEWS_DB_USER', 'root')
db_password = os.environ.get('NEWS_DB_PASSWORD', 'root')
db_host = os.environ.get('NEWS_DB_HOST', 'localhost')
db_name = os.environ.get('NEWS_DB_NAME', 'news_articles')

# User and password are URL-quoted so characters such as '@' or '/' in them
# cannot break the connection URL.
db_connection_str = 'mysql+mysqlconnector://{}:{}@{}/{}'.format(
    quote_plus(db_user), quote_plus(db_password), db_host, db_name
)

# Create an engine
engine = create_engine(db_connection_str)

# Try to connect to the database; the context manager returns the connection
# to the pool right away instead of leaving it open for the kernel's lifetime.
with engine.connect():
    pass

### Updating Data in Database

In [None]:

# Columns that together identify an article for de-duplication purposes
dedupe_columns = ['Title', 'Content','Published_date','Source_URL']

# Everything already stored in MySQL
existing_data_df = pd.read_sql('SELECT * FROM news_data', engine)

# Freshly harvested rows
new_data_df = df

# One tuple of key values per row, for the incoming and the stored data
incoming_keys = new_data_df[dedupe_columns].apply(tuple, axis=1)
stored_keys = existing_data_df[dedupe_columns].apply(tuple, axis=1)

# Keep only the incoming rows whose key tuple is not stored yet
already_stored = incoming_keys.isin(stored_keys)
unique_new_data = new_data_df[~already_stored]

# Append the new rows to the table (the DataFrame index is not written)
unique_new_data.to_sql('news_data', con=engine, if_exists='append', index=False)