# Necessary Steps

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# notebook can read and write files stored under MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/MediaCloud

/content/drive/MyDrive/MediaCloud


In [3]:
import os
# Capture the current working directory and show it as the cell output, as a
# check that the directory change in the previous cell took effect.
pwd = os.getcwd()
pwd

'/content/drive/MyDrive/MediaCloud'

# Correct Clusters

### Change Years

In [18]:
import pandas as pd
import os
import glob


In [19]:
# Input folder (CSV files to merge) and output folder (merged CSV files) for a
# single year. Both are relative paths, so they resolve against the current
# working directory; the year is hard-coded as the last path component.
# NOTE(review): the saved output of the merge cell reports ".../2022" while
# these paths say 2024 — the year appears to be edited by hand between runs;
# confirm which years have actually been processed.
input_folder = 'Farmers_Protest/7.Separate_Clusters/2024'
output_folder = 'Farmers_Protest/8.UPDATED_Separate_Clusters/2024'

In [20]:

# Merge the CSV files in `input_folder` that share a name once their leading
# numbering/punctuation prefix is removed, then write one CSV per merged name
# into `output_folder`.

# Sort the matches: glob returns paths in arbitrary order, and the row order of
# each merged file follows the order in which its source files are read.
# (The sort is lexicographic, so "10._X.csv" is read before "2._X.csv".)
csv_files = sorted(glob.glob(os.path.join(input_folder, '*.csv')))

# cleaned filename -> list of DataFrames read from every file that maps to it
frames_by_name = {}

for file in csv_files:
    original_filename = os.path.basename(file)

    # Remove the leading run of digits, dots, dashes and underscores
    # (e.g. "1._", "-_"). Only the stem is stripped so that a name made up
    # entirely of such characters (e.g. "1.csv") keeps its stem and its
    # ".csv" extension instead of collapsing to "csv".
    stem, extension = os.path.splitext(original_filename)
    cleaned_stem = stem.lstrip("0123456789.-_")
    cleaned_filename = (cleaned_stem or stem) + extension

    # Collect the frames and concatenate once per name afterwards, rather than
    # re-copying the accumulated rows on every iteration.
    frames_by_name.setdefault(cleaned_filename, []).append(pd.read_csv(file))

# One combined DataFrame per cleaned filename, with a fresh 0..n-1 index
combined_data = {
    cleaned_filename: pd.concat(frames, ignore_index=True)
    for cleaned_filename, frames in frames_by_name.items()
}

# Ensure the output directory exists
os.makedirs(output_folder, exist_ok=True)

# Save the combined DataFrames as CSV files (without the index column)
for cleaned_filename, df in combined_data.items():
    output_path = os.path.join(output_folder, cleaned_filename)
    df.to_csv(output_path, index=False)

print(f"Processed files saved in: {output_folder}")

Processed files saved in: Farmers_Protest/8.UPDATED_Separate_Clusters/2022


# Check Common Topics

In [None]:
# Parent folder (relative path) whose immediate subfolders are scanned for
# CSV files in the next cell.
parent_folder = 'Farmers_Protest/7.Separate_Clusters/'

In [None]:
# Collect every directory that sits directly under the parent folder
subfolders = []
for entry in os.listdir(parent_folder):
    candidate = os.path.join(parent_folder, entry)
    if os.path.isdir(candidate):
        subfolders.append(candidate)

# Map each year (the subfolder's own name) to the set of CSV basenames in it
csv_files_by_year = {
    os.path.basename(folder): {
        os.path.basename(path)
        for path in glob.glob(os.path.join(folder, '*.csv'))
    }
    for folder in subfolders
}

# Filenames present in every year. With no subfolders the result is an empty
# set, since set.intersection needs at least one set to operate on.
common_csvs = set()
if csv_files_by_year:
    common_csvs = set.intersection(*csv_files_by_year.values())

# Report the count followed by the names in alphabetical order
print(f"Number of common CSV files across all years: {len(common_csvs)}")
print("Common CSV filenames:")
for filename in sorted(common_csvs):
    print(filename)

Number of common CSV files across all years: 6
Common CSV filenames:
Defense.csv
Economic_Policy.csv
Economic_Recovery.csv
Healthcare.csv
Political.csv
Tax_Policy.csv


# Common CSV files across all year folders except 2019

In [None]:
import os
import glob

# Parent folder (relative path) whose immediate subfolders are scanned for
# CSV files in the next cell.
# NOTE(review): this points at 'Output/...' whereas the earlier all-years
# check uses 'Farmers_Protest/7.Separate_Clusters/' — confirm which dataset
# this comparison is meant to cover.
parent_folder = 'Output/7.Separate_Clusters/'


In [None]:
# Collect every directory directly under the parent folder, leaving out 2019
subfolders = []
for entry in os.listdir(parent_folder):
    candidate = os.path.join(parent_folder, entry)
    if os.path.isdir(candidate) and entry != '2019':
        subfolders.append(candidate)

# Map each remaining year (the subfolder's own name) to its CSV basenames
csv_files_by_year = {
    os.path.basename(folder): {
        os.path.basename(path)
        for path in glob.glob(os.path.join(folder, '*.csv'))
    }
    for folder in subfolders
}

# Filenames present in every remaining year. With no subfolders the result is
# an empty set, since set.intersection needs at least one set to operate on.
common_csvs = set()
if csv_files_by_year:
    common_csvs = set.intersection(*csv_files_by_year.values())

# Report the count followed by the names in alphabetical order
print(f"Number of common CSV files across all years (except 2019): {len(common_csvs)}")
print("Common CSV filenames:")
for filename in sorted(common_csvs):
    print(filename)

Number of common CSV files across all years (except 2019): 6
Common CSV filenames:
Defense.csv
Economic_Policy.csv
Economic_Recovery.csv
Healthcare.csv
Political.csv
Tax_Policy.csv


# Common CSV files across all year folders except 2019 and 2020

In [None]:
import os
import glob

# Parent folder (relative path) whose immediate subfolders are scanned for
# CSV files in the next cell.
# NOTE(review): this points at 'Output/...' whereas the earlier all-years
# check uses 'Farmers_Protest/7.Separate_Clusters/' — confirm which dataset
# this comparison is meant to cover.
parent_folder = 'Output/7.Separate_Clusters/'

In [None]:
# Collect every directory directly under the parent folder, leaving out the
# 2019 and 2020 folders
subfolders = []
for entry in os.listdir(parent_folder):
    candidate = os.path.join(parent_folder, entry)
    if os.path.isdir(candidate) and entry not in ['2019', '2020']:
        subfolders.append(candidate)

# Map each remaining year (the subfolder's own name) to its CSV basenames
csv_files_by_year = {
    os.path.basename(folder): {
        os.path.basename(path)
        for path in glob.glob(os.path.join(folder, '*.csv'))
    }
    for folder in subfolders
}

# Filenames present in every remaining year. With no subfolders the result is
# an empty set, since set.intersection needs at least one set to operate on.
common_csvs = set()
if csv_files_by_year:
    common_csvs = set.intersection(*csv_files_by_year.values())

# Report the count followed by the names in alphabetical order
print(f"Number of common CSV files across all years (except 2019 and 2020): {len(common_csvs)}")
print("Common CSV filenames:")
for filename in sorted(common_csvs):
    print(filename)

Number of common CSV files across all years (except 2019 and 2020): 8
Common CSV filenames:
Capital_Markets.csv
Defense.csv
Economic_Policy.csv
Economic_Recovery.csv
Healthcare.csv
Infrastructure.csv
Political.csv
Tax_Policy.csv
