In [None]:
'''
This analysis is performed using Dataset_10k.csv, provided by Rutgers Bloustein MPI for competition use.
Below are column names and their corresponding significance.
1. title: The headline of the news article.
2. link: The URL linking to the full news article.
3. date: Publication date of the article.
4. source: The news outlet or platform where the article was published.
5. country: The country where the news source is based.
6. language: The language in which the article is written.
7. translated_title: The title translated into English (if the original is in a different language).
8. number_of_characters_title: The total character count in the title.
9. number_of_words_title: The total word count in the title.
10. day_of_week: The day of the week on which the article was published.
11. month: The month of publication.
12. year: The year of publication.
13. quarter: The quarter of the year in which the article was published.
14. is_weekend: A boolean indicating if the article was published on a weekend.
15. is_holiday: A boolean indicating if the article was published on a public holiday.
16. source_type: The type of source (e.g., newspaper, online portal, blog).
17. final_redirected_URL: The final URL after any redirections.
18. domain_of_URL: The domain name of the website.
19. subdomain_of_URL: The subdomain portion of the website URL, if any.
20. URL_depth: The depth of the URL measured in the number of slashes.
21. top_level_domain: The top-level domain (e.g., .com, .org) of the website.
22. url_length: The length of the URL measured in characters.
23. author: The name of the author of the article.
'''
import pandas as pd
import numpy as np

# Read the competition dataset described in the header above into one DataFrame.
data = pd.read_csv("Dataset_10k.csv")

# Author's framing of the analysis (kept verbatim as a no-op string note).
'''
To preface, all this data is from the US and published in 2023. 
We are filtering by different values such as languages and days of the week to observe some patterns. 
'''
# Build one subset of `data` per language code by comparing the "language"
# column against that code. The targets on the left line up positionally with
# the codes on the right, so keep both sequences in the same order.
(
    english_data,
    korean_data,
    japanese_data,
    spanish_data,
    french_data,
    indonesian_data,
    german_data,
    vietnamese_data,
    portuguese_data,
    italian_data,
    dutch_data,
    polish_data,
) = (
    data[data["language"] == language_code]
    for language_code in (
        'en', 'ko', 'ja', 'es', 'fr', 'id',
        'de', 'vi', 'pt', 'it', 'nl', 'pl',
    )
)

In [None]:
# Pull out the three categorical columns of interest ...
languages, day_of_week, source_type = (
    data[column_name]
    for column_name in ("language", "day_of_week", "source_type")
)
# ... then tally how many rows carry each distinct value in every column.
language_counts, day_counts, source_counts = (
    column_values.value_counts()
    for column_values in (languages, day_of_week, source_type)
)
# To inspect a tally, print it, e.g. print(language_counts).

In [None]:
from collections import Counter
import re
'''Currently working with ONLY English data for efficiency.'''
# Word-frequency table for the English article titles.
# Result: `words_df` with columns 'Word' and 'Frequency', sorted by descending
# frequency, with a small list of common filler words removed.

# Drop missing titles *before* casting to str: astype(str) would otherwise turn
# NaN into the literal text "nan", which would then be counted as a word.
titles = english_data["title"].dropna().astype(str)
all_titles_text = ' '.join(titles)

# Apostrophes (straight and curly) are deleted so contractions stay one token
# ("don't" -> "dont"). Every other non-letter, non-whitespace character is
# replaced by a space rather than deleted, so words joined by hyphens, slashes
# or dashes are split apart instead of being glued together
# ("well-known" -> "well known", not "wellknown").
text_without_apostrophes = re.sub(r"['\u2019]", '', all_titles_text)
clean_text = re.sub(r'[^a-zA-Z\s]', ' ', text_without_apostrophes).lower()

# split() with no argument collapses runs of whitespace, so the extra spaces
# introduced above never produce empty tokens.
words = clean_text.split()
word_frequencies = Counter(words)

words_df = pd.DataFrame(list(word_frequencies.items()), columns=['Word','Frequency'])
words_df = words_df.sort_values(by='Frequency', ascending=False)

# Filler words to exclude; compared against the already lower-cased tokens.
words_to_remove = ['the','and','a','to','of','in','for','is','with','on']
words_df = words_df[~words_df['Word'].isin(words_to_remove)]
print(words_df)