In [25]:
import pandas as pd
import glob
import os
import re

In [26]:
# --- Run configuration -------------------------------------------------------
# School whose newspaper dump is processed in this run. Change this value to
# switch schools; supported values are:
#   McGill, Georgetown, York, CMU, USC, AU, LIU
SCHOOL = "AU"

# Keywords to look for; there is one grouped CSV per (school, keyword) pair.
keywords = [
    "China",
    "India",
    "Israel",
    "Palestine",
]

# Input locations: the per-keyword CSVs and the raw .txt dumps for SCHOOL.
grouped_data_path = "grouped_data/csv"
data_path = f"journal_data/txt/{SCHOOL}"

# Output location: one CSV per school, named with the lower-cased school code.
output_path = f"bias_processing/data/1/{SCHOOL.lower()}_dataset.csv"

In [27]:
# Build the per-school keyword dataset.
#
# For every keyword, read the grouped CSV for (SCHOOL, keyword), look up the
# .txt dump whose file name starts with each listed date, split that dump into
# articles, keep the articles that mention the keyword, strip site boilerplate,
# and write all kept articles to `output_path` as one CSV.

# Per-school parsing rules for the .txt dumps.
#   "separator": literal text passed to str.split() to cut a dump into articles.
#   "cleanup":   regex patterns, applied in order with re.DOTALL, whose matches
#                are deleted from every kept article (footers, navigation, ...).
# Patterns that contain regex escapes (\s, \.) are raw strings so Python does
# not warn about invalid string escape sequences; the pattern text is unchanged.
SCHOOL_RULES = {
    "McGill": {
        "separator": "The McGill Daily\nThe McGill Daily",
        "cleanup": ["Radio\nDownload file.*Powered by WordPress"],
    },
    "Georgetown": {
        "separator": "Your email address will not be published. Required fields are marked *\n",
        "cleanup": ["Comment.*Creative\n\n\n"],
    },
    "York": {
        "separator": "York University Community Newspaper\n",
        "cleanup": [r"\s*York University Community Newspaper"],
    },
    "CMU": {
        "separator": "Carnegie Mellon's Student Newspaper Since 1906.\n",
        "cleanup": ["Archives.*Contact Us\n\n\n", "\n\n  .*The Tartan"],
    },
    "USC": {
        "separator": "Extra en Español\n",
        "cleanup": ["This site .* allow them:"],
    },
    "AU": {
        "separator": "Would you like to support our work? Donate here to The Eagle Innovation Fund.\n",
        "cleanup": [r"(© .*State News\.)|(You can .*eagleonline\.com\.)"],
    },
    "LIU": {
        # NOTE(review): unlike the Georgetown separator, this one has a "." after
        # the "*". If the LIU dumps do not contain that period the split never
        # fires and each file is treated as a single article -- confirm against
        # the data before changing it.
        "separator": "Your email address will not be published. Required fields are marked *.\n",
        "cleanup": ["(Your.* LIU Post)|(Official Newspaper of LIU Post)"],
    },
}

OUTPUT_COLUMNS = ["date", "school", "keyword", "article"]


def extract_articles(content, keyword, school):
    """Return the cleaned articles from one dump that mention `keyword`.

    Parameters
    ----------
    content : str
        Full text of one .txt dump.
    keyword : str
        Keyword to look for; matched as a case-insensitive substring.
    school : str
        Key into SCHOOL_RULES selecting the separator and cleanup patterns.

    Returns
    -------
    list of str
        One stripped string per kept article. For a school without an entry in
        SCHOOL_RULES the whole stripped content is returned as a single
        article, without any keyword filtering.
    """
    rules = SCHOOL_RULES.get(school)
    if rules is None:
        # No rules for this school: keep the file as-is, one row per file.
        return [content.strip()]

    needle = keyword.lower()
    kept = []
    for article in content.split(rules["separator"]):
        if needle not in article.lower():
            continue
        # Remove boilerplate only from articles that are actually kept.
        for pattern in rules["cleanup"]:
            article = re.sub(pattern, "", article, flags=re.DOTALL)
        kept.append(article.strip())
    return kept


# Collect plain dict records and build the DataFrame once at the end; calling
# pd.concat inside the loop re-copies every previous row on each iteration.
records = []

for keyword in keywords:
    # Dates on which this keyword was found for SCHOOL.
    keyword_dates = pd.read_csv(f"{grouped_data_path}/{SCHOOL}_{keyword}.csv")

    for date in keyword_dates["date"]:
        # File names use "_" where the CSV dates use "-".
        date_formatted = date.replace("-", "_")

        # glob returns matches in arbitrary order; sort so that the file picked
        # when several share the same date prefix is the same on every run.
        txt_files = sorted(glob.glob(f"{data_path}/{date_formatted}*.txt"))
        if not txt_files:
            continue

        # Undecodable bytes are dropped rather than aborting the whole run.
        with open(txt_files[0], "r", encoding='utf-8', errors='ignore') as f:
            content = f.read()

        for article in extract_articles(content, keyword, SCHOOL):
            records.append({
                "date": date,
                "school": SCHOOL,
                "keyword": keyword,
                "article": article,
            })

# Passing the columns explicitly keeps the expected schema even with no records.
output_df = pd.DataFrame(records, columns=OUTPUT_COLUMNS)

# Add a line space between each article
output_df["article"] = output_df["article"] + "\n"

# Save the output DataFrame into a CSV file, creating the target folder first
# so a fresh checkout does not fail at the very last step.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
output_df.to_csv(output_path, index=False)
print(output_df)

           date school    keyword  \
0    2009-02-12     AU      China   
1    2009-08-13     AU      China   
2    2009-08-13     AU      China   
3    2009-08-13     AU      China   
4    2009-08-13     AU      China   
..          ...    ...        ...   
121  2014-12-11     AU  Palestine   
122  2015-11-12     AU  Palestine   
123  2020-11-13     AU  Palestine   
124  2022-02-11     AU  Palestine   
125  2022-11-10     AU  Palestine   

                                               article  
0    A firefighter died after a 40-story Beijing lu...  
1    To think of D.C. is to think of following drea...  
2    Although swiping your card at TDR and sitting ...  
3    For newcomers to AU and to D.C., this list is ...  
4    There are only so many times that you can visi...  
..                                                 ...  
121  AU terminated its contract with the apparel co...  
122  *Updated: 5:09 p.m. Nov. 14. Corrections and c...  
123  Palestinian social justice event flye