# Necessary Steps

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install "mediacloud>=4.1.0"

Collecting mediacloud>=4.1.0
  Downloading mediacloud-4.3.0-py3-none-any.whl.metadata (4.3 kB)
Downloading mediacloud-4.3.0-py3-none-any.whl (19 kB)
Installing collected packages: mediacloud
Successfully installed mediacloud-4.3.0


In [4]:
import sys
import mediacloud.api
from importlib.metadata import version
import datetime as dt
from IPython.display import JSON
import bokeh.io
import os
import csv
#from dotenv import load_dotenv

In [5]:
cd /content/drive/MyDrive/MediaCloud

/content/drive/MyDrive/MediaCloud


In [6]:
pwd = os.getcwd()
pwd

'/content/drive/MyDrive/MediaCloud'

#  **1. SETUP**

In [None]:
# Set your personal API KEY
MC_API_KEY = 'ENTER KEY HERE'
search_api = mediacloud.api.SearchApi(MC_API_KEY)
f'Using Media Cloud python client v{version("mediacloud")}'

In [None]:
# make sure your connection and API key work by asking for the total count of in 2023
results = search_api.story_count('*', dt.date(2023,11,1), dt.date(2023,12,31))
results

# **2. ATTENTION**

In [None]:
bokeh.io.reset_output()
bokeh.io.output_notebook()

In [None]:
# Set your personal API KEY
MC_API_KEY = 'ENTER KEY HERE'
search_api = mediacloud.api.SearchApi(MC_API_KEY)
f'Using Media Cloud python client v{version("mediacloud")}'

'Using Media Cloud python client v4.3.0'

#### Attention from a Single Media Source

In [None]:
# check how many stories include the phrase "climate change" in the Washington Post (media id #2)
my_query = ' "budget" AND ("finance minister" OR "union budget" OR "fiscal policy" OR "tax reforms" OR "Nirmala" OR "budget speech" OR "budget allocation" OR "fiscal deficit") '
start_date = dt.date(2023, 1, 1)
end_date = dt.date(2023, 3,1)
sources = [39784]    # Media ID of Times of India = #39784
search_api.story_count(my_query, start_date, end_date, source_ids=sources)

{'relevant': 1561, 'total': 85152}

In [None]:
# you can see this count by day as well
results = search_api.story_count_over_time(my_query, start_date, end_date, source_ids=sources)
#JSON(results)

In [None]:
print(results)

[{'date': datetime.date(2023, 1, 1), 'total_count': 1014, 'count': 4, 'ratio': 0.0039447731755424065}, {'date': datetime.date(2023, 1, 2), 'total_count': 1329, 'count': 7, 'ratio': 0.005267118133935289}, {'date': datetime.date(2023, 1, 3), 'total_count': 1484, 'count': 10, 'ratio': 0.006738544474393531}, {'date': datetime.date(2023, 1, 4), 'total_count': 1632, 'count': 6, 'ratio': 0.003676470588235294}, {'date': datetime.date(2023, 1, 5), 'total_count': 1627, 'count': 9, 'ratio': 0.005531653349723417}, {'date': datetime.date(2023, 1, 6), 'total_count': 1559, 'count': 10, 'ratio': 0.006414368184733804}, {'date': datetime.date(2023, 1, 7), 'total_count': 1279, 'count': 6, 'ratio': 0.004691164972634871}, {'date': datetime.date(2023, 1, 8), 'total_count': 1160, 'count': 3, 'ratio': 0.002586206896551724}, {'date': datetime.date(2023, 1, 9), 'total_count': 1464, 'count': 11, 'ratio': 0.007513661202185792}, {'date': datetime.date(2023, 1, 10), 'total_count': 1539, 'count': 4, 'ratio': 0.00259

#### Manually aggregate by month or year

In [None]:
from collections import defaultdict

# Aggregating by month
monthly_counts = defaultdict(int)

# Assuming 'results' is a list of dictionaries with 'date' and 'count' keys
for result in results:
    # Ensure 'result' is a dictionary with 'date' and 'count'
    if isinstance(result, dict) and 'date' in result and 'count' in result:
        # Directly use the date object, no need to convert
        date = result['date']

        # Create a key in the format 'YYYY-MM' for monthly aggregation
        month_key = date.strftime('%Y-%m')

        # Sum up the counts for each month
        monthly_counts[month_key] += result['count']

# Print or use the aggregated monthly counts
for month, count in monthly_counts.items():
    print(f"{month}: {count}")


2023-01: 437
2023-02: 1103
2023-03: 21


#### Plot count

In [None]:
# and you can chart attention over time with some simple notebook work (using Bokeh here)
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
df = pd.DataFrame.from_dict(results)
df['date']= pd.to_datetime(df['date'])
source = ColumnDataSource(df)
p = figure(x_axis_type="datetime", width=900, height=250)
p.line(x='date', y='count', line_width=2, source=source)  # your could use `ratio` instead of `count` to see normalized attention
show(p)

### Normalizing within a Source

In [None]:
results = search_api.story_count(my_query, start_date, end_date, collection_ids=sources)
source_ratio = results['relevant'] / results['total']
'{:.2%} of the TOI stories were about "Annual Budget" between the start_data to end_data'.format(source_ratio)

'0.12% of the TOI stories were about "Annual Budget" between the start_data to end_data'

## Research Within a Country - using collections

In [None]:
# check in our collection of country-level India National media sources
my_query = ' "budget" AND ("finance minister" OR "union budget" OR "fiscal policy" OR "tax reforms" OR "Nirmala" OR "budget speech" OR "budget allocation" OR "fiscal deficit") '
start_date = dt.date(2023, 1, 1)
end_date = dt.date(2023, 3,1)
India_NATIONAL_COLLECTION = 34412118
results = search_api.story_count(my_query, start_date, end_date, collection_ids=[India_NATIONAL_COLLECTION])
india_country_ratio = results['relevant'] / results['total']
'{:.2%} of stories from national-level India media sources mentioneded "budget"'.format(india_country_ratio)

'1.58% of stories from national-level India media sources mentioneded "budget"'

In [None]:
results

{'relevant': 14339, 'total': 910248}

In [None]:
# now we can compare this to the source-level coverage
coverage_ratio = 1 / (source_ratio / india_country_ratio)
'"budget" received {:.2} times less coverage in TOI than you might expect based on other India national papers'.format(coverage_ratio)

In [None]:
# or compare to another country (India in this case)
my_query = ' "budget" AND ("finance minister" OR "union budget" OR "fiscal policy" OR "tax reforms" OR "Nirmala" OR "budget speech" OR "budget allocation" OR "fiscal deficit") '
start_date = dt.date(2023, 1, 1)
end_date = dt.date(2023, 3,1)
US_NATIONAL_COLLECTION = 34412234
results = search_api.story_count(my_query, start_date, end_date, collection_ids=[US_NATIONAL_COLLECTION])
us_country_ratio = results['relevant'] / results['total']
'{:.2%} of stories from national-level US media sources mentioned "budget"'.format(us_country_ratio)

'0.06% of stories from national-level Indian media sources in 2019 mentioned "climate change"'

In [None]:
coverage_ratio =  1 / (india_country_ratio / us_country_ratio)
'at the national level "climate change" is covered {:.2} times less in India than the US'.format(coverage_ratio)

At the national level, "budget" is covered 0.03 times more in India than in the US.


## Listing Stories

In [None]:
#my_query = ' "budget" AND ("finance minister" OR "union budget" OR "fiscal policy" OR "tax reforms" OR "Nirmala" OR "budget speech" OR "budget allocation" OR "fiscal deficit") '
my_query = ' "farmers" AND ("protest" OR "agitation" OR "demonstration" OR "farm laws" OR "MSP" OR "police crackdown" OR "march to Delhi") '

start_date = dt.date(2020, 1, 1)
end_date = dt.date(2025, 3,31)
India_NATIONAL_COLLECTION = 34412118

In [None]:
# grab the most recent stories about this issue
stories, _ = search_api.story_list(my_query, start_date, end_date, collection_ids=[India_NATIONAL_COLLECTION])
stories[:3]

RuntimeError: API Server Error 403. Params: {'start': '2020-01-01', 'end': '2025-03-31', 'q': ' "farmers" AND ("protest" OR "agitation" OR "demonstration" OR "farm laws" OR "MSP" OR "police crackdown" OR "march to Delhi") ', 'platform': 'onlinenews-mediacloud', 'cs': ('34412118',)}

In [None]:
len(stories)

1000

# Download

In [None]:
#my_query = ' "budget" AND ("finance minister" OR "union budget" OR "fiscal policy" OR "tax reforms" OR "Nirmala" OR "budget speech" OR "budget allocation" OR "fiscal deficit") '
my_query = ' "farmers" AND ("protest" OR "agitation" OR "farm laws" OR "MSP" OR "march to Delhi") '

start_date = dt.date(2022, 6, 1)
end_date = dt.date(2022, 9,30)
India_NATIONAL_COLLECTION = 34412118

In [None]:
import time

all_stories = []
more_stories = True
pagination_token = None

try:
    while more_stories:
        page, pagination_token = search_api.story_list(
            my_query, start_date, end_date,
            collection_ids=[India_NATIONAL_COLLECTION],
            pagination_token=pagination_token
        )
        all_stories += page
        more_stories = pagination_token is not None
        print(f"Fetched {len(page)} stories, Pagination Token: {pagination_token}")

        # Adding a small delay to prevent rate limiting
        time.sleep(1)

except RuntimeError as e:
    print(f"Error occurred: {e}")
except Exception as ex:
    print(f"Unexpected error: {ex}")

print(f"Total stories fetched: {len(all_stories)}")


Fetched 1000 stories, Pagination Token: MTcxNTA0NjY3OTYwOQEzNzE4NjMy
Fetched 1000 stories, Pagination Token: MTcxNDc4MjkxNjY4OAEzMjYzNjM5
Error occurred: API Server Error 403. Params: {'start': '2022-06-01', 'end': '2022-09-30', 'q': ' "farmers" AND ("protest" OR "agitation" OR "farm laws" OR "MSP" OR "march to Delhi") ', 'platform': 'onlinenews-mediacloud', 'cs': ('34412118',), 'pagination_token': 'MTcxNDc4MjkxNjY4OAEzMjYzNjM5'}
Total stories fetched: 2000


## Writing a CSV of Story Data

In [None]:
pwd

'/content/drive/MyDrive/MediaCloud'

In [None]:
fieldnames = ['id', 'publish_date', 'title', 'url', 'language', 'media_name', 'media_url', 'indexed_date']
with open('Farmers_Protest/Farmers_Protest_Indian_National_2022B.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')
    writer.writeheader()
    for s in all_stories:
        writer.writerow(s)

## Top Media Sources

In [None]:
# List media producing the most stories matching the search
results = search_api.sources(my_query, start_date, end_date, collection_ids=sources)
print(results)

[{'source': 'indiatimes.com', 'count': 14955}, {'source': 'financialexpress.com', 'count': 5429}, {'source': 'business-standard.com', 'count': 5261}, {'source': 'thehindu.com', 'count': 4242}, {'source': 'news18.com', 'count': 3599}, {'source': 'indianexpress.com', 'count': 3254}, {'source': 'newindianexpress.com', 'count': 3170}, {'source': 'thehindubusinessline.com', 'count': 2732}, {'source': 'livemint.com', 'count': 2298}, {'source': 'india.com', 'count': 2149}, {'source': 'freepressjournal.in', 'count': 1935}, {'source': 'hindustantimes.com', 'count': 1774}, {'source': 'ndtv.com', 'count': 1664}, {'source': 'moneycontrol.com', 'count': 1589}, {'source': 'tribuneindia.com', 'count': 1311}, {'source': 'businessworld.in', 'count': 1280}, {'source': 'firstpost.com', 'count': 1008}, {'source': 'rediff.com', 'count': 984}, {'source': 'indiasnews.net', 'count': 938}, {'source': 'sify.com', 'count': 653}, {'source': 'oneindia.com', 'count': 616}, {'source': 'swarajyamag.com', 'count': 597

# **3. LANGUAGE**

In [None]:
bokeh.io.reset_output()
bokeh.io.output_notebook()

In [None]:
# Set your personal API KEY
MC_API_KEY = 'ENTER KEY HERE'
search_api = mediacloud.api.SearchApi(MC_API_KEY)
f'Using Media Cloud python client v{version("mediacloud")}'

'Using Media Cloud python client v4.1.4'

## Simple Word Counts

Stemming: Words are stemmed by Elasticsearch before being counted. The term returned is the most used version of the stem in the sample.

In [None]:
# check how many stories include the phrase "climate change" in the Washington Post (media id #2)
my_query = '"climate change"' # note the double quotes used to indicate use of the whole phrase
start_date = dt.date(2022, 11, 1)
end_date = dt.date(2023, 12,1)
sources = [2]
results = search_api.words(my_query, start_date, end_date, source_ids=sources)
#JSON(results)

In [None]:
print(results)

[{'term': 'climate', 'count': 395, 'ratio': 0.395}, {'term': 'change', 'count': 99, 'ratio': 0.099}, {'term': 'biden', 'count': 67, 'ratio': 0.067}, {'term': 'new', 'count': 62, 'ratio': 0.062}, {'term': 'heat', 'count': 54, 'ratio': 0.054}, {'term': 'u.s', 'count': 44, 'ratio': 0.044}, {'term': 'world', 'count': 12, 'ratio': 0.012}, {'term': 'china', 'count': 10, 'ratio': 0.01}, {'term': 'gop', 'count': 10, 'ratio': 0.01}, {'term': 'house', 'count': 10, 'ratio': 0.01}]


## Languages Used

In [None]:
# See top languages used in articles
INDIA_NATIONAL = 34412118
results = search_api.languages('*', start_date, end_date, collection_ids=[INDIA_NATIONAL])
#JSON(results)

In [None]:
print(results)

[{'language': 'hi', 'value': 2817514, 'ratio': 0.47510960770559035}, {'language': 'en', 'value': 2680508, 'ratio': 0.4520066641484999}, {'language': 'ta', 'value': 286194, 'ratio': 0.04826010414418303}, {'language': 'mr', 'value': 47240, 'ratio': 0.007965950787826461}, {'language': 'ml', 'value': 29468, 'ratio': 0.0049691074897474635}, {'language': 'kn', 'value': 18845, 'ratio': 0.0031777803259227283}, {'language': 'te', 'value': 15472, 'ratio': 0.002609000647528599}, {'language': 'or', 'value': 10610, 'ratio': 0.0017891350097129289}, {'language': 'gu', 'value': 7102, 'ratio': 0.0011975906540038852}, {'language': 'pa', 'value': 6391, 'ratio': 0.0010776966868120009}]


In [None]:
# Retrieve latest stories in Hindi
page, _ = search_api.story_list('* and language:hi', start_date, end_date, collection_ids=[INDIA_NATIONAL])
page[:3]

[{'id': '54e361bc026775b5c4ed0106abee41d15e72a99793bedb5ef7ecda0df8e9bbc4',
  'media_name': 'amarujala.com',
  'media_url': 'amarujala.com',
  'title': 'आज के दिन भारत के पूर्व राष्ट्रपति ज्ञानी जैल सिंह का जन्म हुआ था, सुनिए 5 मई का इतिहास',
  'publish_date': datetime.date(2023, 5, 5),
  'url': 'https://www.amarujala.com/specials/5th-may-history-in-hindi?utm_source=rssfeed&utm_medium=Referral&utm_campaign=rssfeed',
  'language': 'hi',
  'indexed_date': datetime.datetime(2024, 8, 3, 17, 43, 58, 477319)},
 {'id': 'f190767f39b7aa0d8e9fc7d52853c3bb15323c8829e9ce544723f50d00e3d340',
  'media_name': 'indiatimes.com',
  'media_url': 'indiatimes.com',
  'title': 'भारत Vs साउथ अफ्रीका स्कोरकार्ड',
  'publish_date': datetime.date(2022, 11, 1),
  'url': 'https://navbharattimes.indiatimes.com/sports/cricket/live-score/sa-vs-ind/1-11-2022/scoreboard/matchid-sain01112022205362.cms',
  'language': 'hi',
  'indexed_date': datetime.datetime(2024, 8, 2, 2, 52, 2, 674871)},
 {'id': '8dd1c6518fc628a0a915

# **4. DIRECTORY**

In [None]:
bokeh.io.reset_output()
bokeh.io.output_notebook()

In [None]:
# Set your personal API KEY
MC_API_KEY = 'ENTER KEY HERE'
directory_api = mediacloud.api.DirectoryApi(MC_API_KEY)
f'Using Media Cloud python client v{version("mediacloud")}'

'Using Media Cloud python client v4.1.4'

## Searching for Media Sources

In [None]:
# try to find a media source based on its URL
matching_sources = directory_api.source_list(name='hindustantimes')
#JSON(matching_sources['results'][0])

<IPython.core.display.JSON object>

In [None]:
matching_sources['results'][0]

{'id': 20258,
 'name': 'hindustantimes.com',
 'url_search_string': None,
 'label': 'hindustantimes.com',
 'homepage': 'http://www.hindustantimes.com',
 'notes': None,
 'platform': 'online_news',
 'stories_per_week': 4097,
 'first_story': None,
 'created_at': '2022-12-23T17:43:28.547804Z',
 'modified_at': '2024-08-31T13:46:06.821126Z',
 'pub_country': None,
 'pub_state': None,
 'primary_language': None,
 'media_type': None,
 'collection_count': 16}

## Media Source Feeds

In [None]:
# learn about the first result from above, which is our canonical one for the Hindustan Times
matching_sources = directory_api.source_list(name='hindustantimes')
hindustan_times = matching_sources['results'][0]

# list all the feeds associated with this media source
hindistan_times_feeds = directory_api.feed_list(hindustan_times['id'])
hindistan_times_feeds

{'count': 118,
 'next': 'https://search.mediacloud.org/api/sources/feeds/?limit=100&offset=100&source_id=20258',
 'previous': None,
 'results': [{'id': 422363,
   'url': 'http://www.hindustantimes.com/rss/topnews/rssfeed.xml',
   'admin_rss_enabled': True,
   'source': 20258,
   'name': 'HindustanTimes- Top news',
   'created_at': '2023-02-09T22:55:40.496987Z',
   'modified_at': '2023-02-09T22:55:40.496987Z'},
  {'id': 422364,
   'url': 'http://www.hindustantimes.com/rss/india/rssfeed.xml',
   'admin_rss_enabled': True,
   'source': 20258,
   'name': 'HindustanTimes- India',
   'created_at': '2023-02-09T22:55:40.496987Z',
   'modified_at': '2023-02-09T22:55:40.496987Z'},
  {'id': 422368,
   'url': 'http://www.hindustantimes.com/rss/cities/bhopal/rssfeed.xml',
   'admin_rss_enabled': True,
   'source': 20258,
   'name': 'Hindustan Times - bhopal',
   'created_at': '2023-02-09T22:55:40.496987Z',
   'modified_at': '2023-02-09T22:55:40.496987Z'},
  {'id': 422369,
   'url': 'http://www.hind

## Collections

In [None]:
# search for a collection by name
nigerian_collections = directory_api.collection_list(name='nigeria')
[c['name'] for c in nigerian_collections['results']]

['Nigeria - National',
 'nigeria abyz 20160725',
 'Nigeria',
 'nigeria hausa 20161007',
 'nigeria mitpolisci 20160321',
 'nigeria additional 20160812',
 'gates nigeria 20160203',
 'nigeria egghead 20141017',
 'nigeria']

In [None]:
# page through a list of all the sources in the "Nigeria - National" collection
NIGERIA_NATIONAL = 38376341
sources = []
limit = 100
offset = 0
while True:
    response = directory_api.source_list(collection_id=NIGERIA_NATIONAL, limit=limit, offset=offset)
    sources += response['results']
    if response['next'] is None:
        break
    offset += limit
f"Found {len(sources)} media sources in Nigeria National collection geographic collections"

'Found 436 media sources in Nigeria National collection geographic collections'

In [None]:
print(sources)

[{'id': 18029, 'name': 'vanguardngr.com', 'url_search_string': None, 'label': 'Vanguard', 'homepage': 'http://www.vanguardngr.com/', 'notes': None, 'platform': 'online_news', 'stories_per_week': 1064, 'first_story': None, 'created_at': '2022-12-23T17:43:28.547804Z', 'modified_at': '2024-08-31T13:45:47.063328Z', 'pub_country': None, 'pub_state': None, 'primary_language': 'en', 'media_type': None, 'collection_count': 12}, {'id': 18027, 'name': 'sunnewsonline.com', 'url_search_string': None, 'label': 'The Sun', 'homepage': 'http://www.sunnewsonline.com/', 'notes': None, 'platform': 'online_news', 'stories_per_week': 614, 'first_story': None, 'created_at': '2022-12-23T17:43:28.547804Z', 'modified_at': '2024-06-12T20:34:40.777523Z', 'pub_country': 'USA', 'pub_state': 'US-PA', 'primary_language': 'en', 'media_type': 'print_native', 'collection_count': 11}, {'id': 18024, 'name': 'saharareporters.com', 'url_search_string': None, 'label': 'Sahara Reporters', 'homepage': 'http://www.saharareport

# Download for every month

In [8]:
pwd

'/content/drive/MyDrive/MediaCloud'

In [9]:
import datetime as dt
import time
import csv
import os

India_NATIONAL_COLLECTION = 34412118
my_query = ' "farmers" AND ("protest" OR "agitation" OR "farm laws" OR "MSP" OR "march to Delhi") '

# Create output directory if not exists
output_dir = "Farmers_Protest"
os.makedirs(output_dir, exist_ok=True)

# Define date range
start_date = dt.date(2024, 7, 1)
end_date = dt.date(2024, 8, 31)

# Function to handle API request with exponential backoff
def fetch_stories(month_start, month_end):
    all_stories = []
    more_stories = True
    pagination_token = None
    count = 0
    retries = 0

    while more_stories and count < 1000:
        try:
            page, pagination_token = search_api.story_list(
                my_query, month_start, month_end,
                collection_ids=[India_NATIONAL_COLLECTION],
                pagination_token=pagination_token
            )
            all_stories += page
            count += len(page)
            more_stories = pagination_token is not None and count < 1000
            print(f"Fetched {len(page)} stories for {month_start.strftime('%Y-%m')}, Total: {count}")

            # Adding a small delay to prevent rate limiting
            time.sleep(1)
            retries = 0  # Reset retries on success
        except RuntimeError as e:
            print(f"Error occurred for {month_start.strftime('%Y-%m')}: {e}")
            break
        except Exception as ex:
            if "403" in str(ex) and retries < 5:
                wait_time = 2 ** retries  # Exponential backoff
                print(f"403 Error: Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
                retries += 1
            else:
                print(f"Unexpected error for {month_start.strftime('%Y-%m')}: {ex}")
                break
    return all_stories

# Iterate through each month in the date range
current_date = start_date
while current_date <= end_date:
    month_start = current_date.replace(day=1)
    next_month = (month_start + dt.timedelta(days=32)).replace(day=1)
    month_end = next_month - dt.timedelta(days=1)

    # Fetch stories with improved error handling
    all_stories = fetch_stories(month_start, month_end)

    # Save to CSV
    if all_stories:
        filename = os.path.join(output_dir, f"Farmers_Protest_{month_start.strftime('%Y_%m')}.csv")
        fieldnames = ['id', 'publish_date', 'title', 'url', 'language', 'media_name', 'media_url', 'indexed_date']

        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')
            writer.writeheader()
            for s in all_stories:
                writer.writerow(s)
        print(f"Saved {len(all_stories)} stories to {filename}")

    # Move to the next month
    current_date = next_month


Fetched 1000 stories for 2024-07, Total: 1000
Saved 1000 stories to Farmers_Protest/Farmers_Protest_2024_07.csv
Fetched 858 stories for 2024-08, Total: 858
Saved 858 stories to Farmers_Protest/Farmers_Protest_2024_08.csv


# Merge

In [7]:
pwd

'/content/drive/MyDrive/MediaCloud'

In [8]:
import os
import pandas as pd

# Define the path to the folder containing the CSV files
folder_path = 'Farmers_Protest/temp/'  # Replace with your folder path

# List all files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Initialize an empty list to hold dataframes
dfs = []

# Iterate over each CSV file, read it, and append the dataframe to the list
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    dfs.append(df)

# Concatenate all dataframes into a single dataframe
merged_df = pd.concat(dfs, ignore_index=True)

# Filter the dataframe to keep only rows where 'language' is 'en'
merged_df = merged_df[merged_df['language'] == 'en']

# Save the merged dataframe to a new CSV file
merged_df.to_csv('Farmers_Protest/Farmers_protest_Final_2024.csv', index=False)

print("CSV files have been successfully merged into 'Farmers_protest_Final.csv'")

CSV files have been successfully merged into 'Farmers_protest_Final.csv'
