# Necessary Steps

In [None]:
# Mount Google Drive at /content/drive so the project files stored there can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd

In [None]:
# Make the project folder on the mounted Drive the working directory.
cd /content/drive/MyDrive/MediaCloud

/content/drive/MyDrive/MediaCloud


In [None]:
# Keep the working directory as the project root; later cells join data paths onto `pwd`.
# NOTE: after this assignment a bare `pwd` shows this variable instead of running the `%pwd` magic.
pwd = os.getcwd()
pwd

'/content/drive/MyDrive/MediaCloud'

# Equal Division of News Articles by Year

In [None]:
# Show the project root stored in `pwd` (assigned in an earlier cell).
pwd

'/content/drive/MyDrive/MediaCloud'

In [None]:
# Load the article table from the project folder; later cells rely on its 'publish_date' column.
mediacloud_data = pd.read_csv(os.path.join(pwd, "Farmers_Protest/FarmersProtest2024.csv"))

In [None]:
articles = 2000  # Per-year sample size: how many articles to keep for each year

In [None]:
# Convert the 'publish_date' column to datetime so it can be compared and split by year
mediacloud_data['publish_date'] = pd.to_datetime(mediacloud_data['publish_date'])

# Extract the year from the 'publish_date' column
mediacloud_data['year'] = mediacloud_data['publish_date'].dt.year

output_folder = os.path.join(pwd, "Farmers_Protest/1.YearWise_Data")
os.makedirs(output_folder, exist_ok=True)

# Preferred date window inside each year, written as "-MM-DD" suffixes that get
# appended to the year. Both bounds parse to midnight, so articles stamped later
# on December 30, or on December 31, count as "outside" and are only used as top-up.
start_date = "-01-01"  # January 1
end_date = "-12-30"    # December 30

# Rows without a date have no year (NaN) and cannot be assigned to a yearly file.
undated_count = int(mediacloud_data['year'].isna().sum())
if undated_count:
    print(f"Skipping {undated_count} articles without a publish date")

# Loop through each unique year (in order of first appearance) and process the data
for year in mediacloud_data['year'].dropna().unique():
    # Missing dates turn the year column into floats (2020.0); cast back so the
    # date strings and the output file names read "2020".
    year = int(year)
    window_start = pd.to_datetime(f"{year}{start_date}")
    window_end = pd.to_datetime(f"{year}{end_date}")

    # Split this year's rows into those inside and outside the preferred window
    rows_for_year = mediacloud_data[mediacloud_data['year'] == year]
    in_window = (
        (rows_for_year['publish_date'] >= window_start) &
        (rows_for_year['publish_date'] <= window_end)
    )
    data_in_range = rows_for_year[in_window]

    if len(data_in_range) < articles:
        # Fewer than `articles` rows in the window: keep all of them and top up
        # with a random sample from the rest of the year (as many as are available).
        remaining_articles_needed = articles - len(data_in_range)
        data_outside_range = rows_for_year[~in_window]
        additional_articles = data_outside_range.sample(
            n=min(remaining_articles_needed, len(data_outside_range)), random_state=42
        )
        data_sampled = pd.concat([data_in_range, additional_articles])
    else:
        # Enough rows in the window: draw exactly `articles` of them
        data_sampled = data_in_range.sample(n=articles, random_state=42)

    # Save the sampled dataframe to a CSV file
    output_path = os.path.join(output_folder, f'data_{year}.csv')
    data_sampled.to_csv(output_path, index=False)

    print(f"Saved {len(data_sampled)} articles for the year {year} to {output_path}")

KeyError: 'indexed_date'



> **As of now, I have manually kept only the data for the years 2020, 2021, 2022, 2023, and 2024.**



# Parse the data for each individual year

In [None]:
!pip install newspaper3k
!pip install lxml_html_clean


Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m61.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collec

In [None]:
import os
import time
import pandas as pd
from newspaper import Article
import nltk
from requests.exceptions import HTTPError, ConnectionError
import urllib3

In [None]:
# Show the project root stored in `pwd` (assigned in an earlier cell).
pwd

'/content/drive/MyDrive/MediaCloud'

In [None]:
# Tokenizer data for NLTK; presumably what `article.nlp()` needs in the parsing cell.
# (Older NLTK releases use the 'punkt' resource instead.)
nltk.download('punkt_tab')

# NOTE(review): input is read from '1.YearWise_DataTEMP', not from the
# '1.YearWise_Data' folder written by the sampling step - presumably the
# manually curated copy; confirm.
source_folder = os.path.join(pwd, 'Farmers_Protest/1.YearWise_DataTEMP')

# Parsed CSVs are written here; create the folder if it does not exist yet
destination_folder = os.path.join(pwd, 'Farmers_Protest/2.Parsed_YearWise_DataTEMP')
os.makedirs(destination_folder, exist_ok=True)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Function to process a single CSV file
def process_csv_file(file_path, destination_folder):
    """Download and parse every article listed in one CSV and save the enriched table.

    Reads ``file_path``, fetches each row's ``url`` with newspaper's ``Article``
    and fills the ``parsed_*`` columns. A row that fails keeps whatever was
    filled in before the failure (empty strings otherwise); the error is printed
    and processing continues with the next row. The result is written to
    ``destination_folder`` as ``Parsed_<original file name>``.

    Args:
        file_path: Path of the input CSV; it must contain a 'url' column.
        destination_folder: Folder that receives the parsed CSV. It is created
            up front if missing, so a bad path fails before any downloading.

    Returns:
        None. The output is the CSV written to ``destination_folder``.
    """
    # Fail early on an unusable destination instead of after the whole scrape
    os.makedirs(destination_folder, exist_ok=True)

    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)

    # Add empty columns for parsed data (order matches the saved column order)
    for column in (
        'parsed_title',
        'parsed_authors',
        'parsed_publish_date',
        'parsed_text',
        'parsed_category',
        'parsed_summary',
        'parsed_keywords',
    ):
        df[column] = ''

    # Record the start time
    start_time = time.time()

    # Loop through each URL in the CSV
    for index, row in df.iterrows():
        url = row['url']

        try:
            # Missing URLs are read as NaN (a float), so check the type before
            # calling string methods on the value.
            if not isinstance(url, str) or not url.startswith('http'):
                raise ValueError(f"Invalid URL: {url}")

            # Download and parse the article
            article = Article(url, language="en")
            article.download()
            article.parse()

            # Store the parsed data in the respective columns
            df.at[index, 'parsed_title'] = article.title
            df.at[index, 'parsed_authors'] = ', '.join(article.authors)
            df.at[index, 'parsed_publish_date'] = article.publish_date
            df.at[index, 'parsed_text'] = article.text

            # The category is the first path segment of the URL (fourth piece
            # when splitting on '/'). It depends only on the URL, so record it
            # before the NLP step, which may fail.
            category_parts = url.split('/')
            df.at[index, 'parsed_category'] = (
                category_parts[3] if len(category_parts) > 3 else None
            )

            # Perform NLP operations (keywords and summary)
            article.nlp()
            df.at[index, 'parsed_keywords'] = ', '.join(article.keywords)
            df.at[index, 'parsed_summary'] = article.summary

        except HTTPError as http_err:
            print(f"HTTP error occurred: {http_err} for URL: {url}")
        except ConnectionError as conn_err:
            print(f"Connection error occurred: {conn_err} for URL: {url}")
        except urllib3.exceptions.MaxRetryError:
            print(f"Max retries exceeded for URL: {url}")
        except ValueError as val_err:
            print(f"Value error: {val_err}")
        except Exception as e:
            # Best effort: any other failure is reported and the row is skipped
            print(f"Failed to process URL: {url} with error: {e}")

        # Adding a small delay to avoid overwhelming the server with requests
        time.sleep(1)

    # Report how long the whole file took
    execution_time = time.time() - start_time
    print(f"Total time taken to process {file_path}: {execution_time} seconds")

    # Save next to the other parsed files as Parsed_<original file name>
    file_name = os.path.basename(file_path)
    destination_file_path = os.path.join(destination_folder, f"Parsed_{file_name}")
    df.to_csv(destination_file_path, index=False)

# Parse every CSV found in the source folder; other files are ignored
csv_names = [name for name in os.listdir(source_folder) if name.endswith('.csv')]

for file_name in csv_names:
    file_path = os.path.join(source_folder, file_name)
    print(f"Processing {file_path}...")
    process_csv_file(file_path, destination_folder)

print("All files processed successfully!")


Processing /content/drive/MyDrive/MediaCloud/Farmers_Protest/1.YearWise_DataTEMP/Farmers_protest_Final_2024.csv...
Failed to process URL: https://www.newsclick.in/bengal-intense-heat-water-crisis-killing-paddy-seeds-bankura-purulia with error: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.newsclick.in/bengal-intense-heat-water-crisis-killing-paddy-seeds-bankura-purulia on URL https://www.newsclick.in/bengal-intense-heat-water-crisis-killing-paddy-seeds-bankura-purulia
Failed to process URL: https://www.newsclick.in/farmers-protest-posters-crop-some-punjab-villages-declaring-no-entry-bjp with error: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.newsclick.in/farmers-protest-posters-crop-some-punjab-villages-declaring-no-entry-bjp on URL https://www.newsclick.in/farmers-protest-posters-crop-some-punjab-villages-declaring-no-entry-bjp
Failed to process URL: https://www.hindustantimes.com/trending/employee-denied-leaves

# Remove Articles Whose Text Could Not Be Parsed

In [None]:
# Show the project root stored in `pwd` (assigned in an earlier cell).
pwd

'/content/drive/MyDrive/MediaCloud'

In [None]:
# Folder with the parsed per-year CSVs (same path the parsing step writes to).
parsed_data_folder = os.path.join(pwd, 'Farmers_Protest/2.Parsed_YearWise_DataTEMP')

In [None]:
def clean_csv_file(file_path):
    """Remove rows without usable article text from a parsed CSV, in place.

    A row is dropped when its 'parsed_text' is missing (empty cells are read
    back as NaN) or contains only whitespace. The file at ``file_path`` is
    overwritten with the remaining rows.

    Args:
        file_path: Path of a parsed CSV that has a 'parsed_text' column.

    Returns:
        None.
    """
    parsed = pd.read_csv(file_path)

    text = parsed['parsed_text']
    # astype(str) keeps the check safe when the whole column was read as NaN floats
    has_text = text.notna() & text.astype(str).str.strip().ne('')

    parsed[has_text].to_csv(file_path, index=False)


# Clean every parsed CSV in the folder, overwriting each file in place
for file_name in os.listdir(parsed_data_folder):
    if not file_name.endswith('.csv'):
        continue  # ignore anything that is not a CSV

    file_path = os.path.join(parsed_data_folder, file_name)
    print(f"Cleaning {file_path}...")
    clean_csv_file(file_path)

print("All files cleaned successfully!")

Cleaning /content/drive/MyDrive/MediaCloud/Farmers_Protest/2.Parsed_YearWise_DataTEMP/Parsed_Farmers_protest_Final_2024.csv...
All files cleaned successfully!
