In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
#from matplotlib.ticker import FuncFormatter

In [None]:
# Read the article table; the CSV's first column is used as the row index.
articles_csv = 'news_articles.csv'
df = pd.read_csv(articles_csv, index_col=0)

# Preview the first rows.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# Columns that hold whole-number values and should be stored as integers.
columns_to_convert = [
    'top_article',
    'engagement_reaction_count',
    'engagement_comment_count',
    'engagement_share_count',
    'engagement_comment_plugin_count',
]

# Drop rows whose source_id is the literal string '1', then cast the count
# columns to pandas' nullable 'Int64' dtype (capital I), which keeps missing
# values as <NA> instead of forcing the column to float.
# NOTE(review): '1' is presumably a malformed/placeholder source id — confirm
# against the raw CSV.
# Filtering and casting in one chain yields a fresh frame, so no column is
# assigned onto a filtered slice (which raises SettingWithCopyWarning on pandas
# versions without copy-on-write).
df = (
    df.loc[df['source_id'] != '1']
      .astype({column: 'Int64' for column in columns_to_convert})
)

In [None]:
# Display the frame after the source filter and integer conversion.
df

In [None]:
# Write the distinct source ids, in order of first appearance, to sources.json.
unique_source_ids = pd.Series(df['source_id'].unique())
unique_source_ids.to_json('sources.json')

In [None]:
# Number of articles per distinct 'author' value.
df['author'].value_counts()

In [None]:
# Lookup table: one political-leaning label per outlet name.
# "NA" is a literal string label here (not a missing value); the colour mapping
# used for the Plotly chart has a matching "NA" key.
outlet_data = pd.DataFrame({
    "source_name": [
        "Reuters", "The Irish Times", "Al Jazeera English", "BBC News", "ABC News",
        "The New York Times", "CNN", "CBS News", "Newsweek", "Business Insider",
        "The Wall Street Journal", "ESPN"
    ],
    "leaning": [
        "Center", "NA", "Left-Center", "Center", "Left-Center",
        "Left-Center", "Left", "Left-Center", "Center", "Center",
        "Center", "Left-Center"
    ]
})

# Attach the leaning to every article.
# - Any 'leaning' column left over from a previous run of this cell is dropped
#   first; otherwise a re-run would produce 'leaning_x' / 'leaning_y' and break
#   every later cell that reads df['leaning'].
# - how='inner' (the merge default, spelled out): articles whose source_name is
#   not in outlet_data are dropped.
# - validate='many_to_one' raises MergeError if an outlet is ever listed twice
#   in outlet_data, which would otherwise silently duplicate article rows.
df = (
    df.drop(columns=['leaning'], errors='ignore')
      .merge(outlet_data, on='source_name', how='inner', validate='many_to_one')
)
df.head()

In [None]:
# Bar chart of the number of articles per outlet, each bar labelled with its count.
source_totals = df['source_name'].value_counts()

count_fig, ax = plt.subplots(figsize=(10, 6))
source_totals.plot(kind='bar', ax=ax)

# Title and axis labels.
ax.set_title("Counts of Each Source Name")
ax.set_xlabel("Source Name")
ax.set_ylabel("Count")

# Write each bar's height just above its top edge, centred horizontally.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.annotate(
        str(bar_height),
        (bar_centre, bar_height),
        ha='center',
        va='bottom',
        fontsize=10,
    )

plt.show()

In [None]:

# Bar colour for each political-leaning label.
color_dict = {
    "Left": "blue",
    "Left-Center": "orange",
    "Center": "gray",
    "NA": "black",
}

# Store each article's colour (looked up from its leaning) on the main frame.
df['color'] = df['leaning'].map(color_dict)

# Article count per outlet as a two-column frame: source_name, count.
source_counts = (
    df['source_name']
    .value_counts()
    .reset_index()
    .set_axis(['source_name', 'count'], axis=1)
)
print(source_counts.head())

# One row per distinct (outlet, leaning, colour) combination, joined onto the counts.
outlet_attributes = df[['source_name', 'leaning', 'color']].drop_duplicates()
source_counts = source_counts.merge(outlet_attributes, on='source_name')

# Interactive bar chart: bars are coloured by leaning through color_dict and
# annotated with their count.
bar_options = dict(
    x='source_name',
    y='count',
    text='count',
    color='leaning',
    color_discrete_map=color_dict,
    labels={'count': 'Count', 'source_name': 'Source Name'},
    title="Counts of Each Source by Leaning",
)
fig = px.bar(source_counts, **bar_options)

# Rotate the outlet names so they do not overlap.
fig.update_layout(xaxis_tickangle=70)
fig.show()


In [None]:
# Parse 'published_at' into datetimes. Values that cannot be parsed become NaT
# (errors='coerce') and those rows are then dropped.
df = (
    df.assign(published_at=pd.to_datetime(df['published_at'], errors='coerce'))
      .dropna(subset=['published_at'])
)
print(df.dtypes)

# Number of articles per calendar date, in chronological order.
articles_per_day = df['published_at'].dt.date.value_counts().sort_index()

# Line chart of the daily article counts.
timeline_fig, timeline_ax = plt.subplots(figsize=(12, 6))
articles_per_day.plot(kind='line', marker='o', ax=timeline_ax)

timeline_ax.set_title("Timeline of Articles Published")
timeline_ax.set_xlabel("Date")
timeline_ax.set_ylabel("Number of Articles Published")

# Slanted date labels and horizontal grid lines only.
plt.xticks(rotation=45)
timeline_ax.grid(axis='y')
plt.show()


In [None]:
def aggregate_engagement(aggregation_function='sum', data=None):
    """Plot per-outlet aggregates of the numeric columns as a grouped bar chart.

    Rows are grouped by a "<source_name> (<leaning>)" label. Every numeric
    column except 'top_article' (dropped before grouping) is reduced with
    ``aggregation_function`` and drawn as one bar per column and outlet.

    Parameters
    ----------
    aggregation_function : str or callable, default 'sum'
        Either the name of a pandas groupby reduction that accepts
        ``numeric_only`` ('sum', 'mean', 'median', ...), or a callable that is
        applied to each numeric column of each group.
    data : pandas.DataFrame, optional
        Frame with 'source_name', 'leaning', 'top_article' and the numeric
        columns to aggregate. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    KeyError
        If 'top_article', 'source_name' or 'leaning' is missing from the frame.
    """
    frame = df if data is None else data

    # 'top_article' is numeric but is not an engagement metric, so it is removed
    # before aggregating. drop() returns a new frame; the input is not modified.
    df_inside = frame.drop(['top_article'], axis=1)
    df_inside['source_name_and_leaning'] = (
        df_inside['source_name'] + ' (' + df_inside['leaning'] + ')'
    )

    grouped = df_inside.groupby('source_name_and_leaning')
    if isinstance(aggregation_function, str):
        # Named reductions take numeric_only directly.
        aggregated_engagement = grouped.agg(aggregation_function, numeric_only=True)
        function_label = aggregation_function
    else:
        # An arbitrary callable generally does not accept a numeric_only
        # keyword (the builtin sum does not), so the numeric columns are
        # selected up front instead of forwarding the keyword.
        numeric_columns = df_inside.select_dtypes(include='number').columns.tolist()
        aggregated_engagement = grouped[numeric_columns].agg(aggregation_function)
        # Use the callable's name rather than its repr in the title.
        function_label = getattr(
            aggregation_function, '__name__', str(aggregation_function)
        )

    # DataFrame.plot opens its own figure unless it is handed an Axes, so the
    # figure is created once here and passed in. A separate plt.figure() call
    # would be left behind as an empty figure.
    fig, ax = plt.subplots(figsize=(14, 8))
    aggregated_engagement.plot(kind='bar', stacked=False, ax=ax)

    ax.set_title(f"{function_label.upper()} Engagement Metrics by Source Name")
    ax.set_xlabel("Source Name")
    ax.set_ylabel("Aggregated Engagement Counts")

    # Show plain numbers on the y-axis instead of scientific notation.
    ax.ticklabel_format(style='plain', axis='y')

    # Slanted outlet labels, dashed horizontal grid, no clipped labels.
    ax.tick_params(axis='x', labelrotation=45)
    ax.grid(axis='y', linestyle='--')
    fig.tight_layout()
    plt.show()

In [None]:
# Per-outlet totals of the numeric engagement columns.
aggregate_engagement(aggregation_function='sum')

In [None]:
# Per-outlet means of the numeric engagement columns.
aggregate_engagement(aggregation_function='mean')

In [None]:
# Per-outlet medians of the numeric engagement columns.
aggregate_engagement(aggregation_function='median')