In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the four per-section article exports; the paths are relative to the
# notebook's working directory.
politics_news, society_news, world_news, uk_news = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Datasets/TGC/Politics/TGC_Politics.csv",
        "../Datasets/TGC/Society/TGC_Society.csv",
        "../Datasets/TGC/World/TGC_World.csv",
        "../Datasets/TGC/UK/TGC_UK.csv",
    )
)

In [3]:
# Stack the four section frames into one dataset. ignore_index=True assigns a
# fresh 0..n-1 row index; without it each source file's own row labels are
# kept, so the same label would refer to rows from several sections.
combined_dataset = pd.concat(
    [politics_news, society_news, world_news, uk_news],
    ignore_index=True,
)

In [4]:
# Drop every article whose author is recorded as the literal string "Unknown".
combined_dataset = combined_dataset.loc[
    combined_dataset['Author'] != "Unknown"
]

In [5]:
# Keep only articles whose Text_Length lies in the closed range [500, 5000].
combined_dataset = combined_dataset[
    combined_dataset['Text_Length'].between(500, 5000)
]

In [6]:
# Largest and smallest Text_Length within each section.
(
    combined_dataset
    .groupby('Section')['Text_Length']
    .agg(['max', 'min'])
)

Unnamed: 0_level_0,max,min
Section,Unnamed: 1_level_1,Unnamed: 2_level_1
Politics,4996,792
Society,5000,694
UK news,4995,606
World news,4999,652


### Author Summary

In [7]:
# Number of distinct 'Content' values (i.e. distinct articles) per author.
author_summary = (
    combined_dataset
    .groupby("Author")['Content']
    .nunique()
)

In [8]:
# Move the group keys out of the index into a regular column so the summary
# can be filtered on its 'Content' column.
# NOTE(review): this rebinds the same name, so re-running the cell on its own
# would apply reset_index a second time.
author_summary = author_summary.reset_index()

In [9]:
def article_count(size, df):
    """Count the rows of ``df`` whose ``'Content'`` value is at least ``size``.

    Parameters
    ----------
    size : number
        Inclusive lower bound applied to the ``'Content'`` column.
    df : pandas.DataFrame
        Frame with a ``'Content'`` column that can be compared with ``size``.

    Returns
    -------
    int
        Number of rows satisfying ``df['Content'] >= size``.
    """
    meets_minimum = df['Content'] >= size
    return int(meets_minimum.sum())

In [10]:
# For each threshold, count the authors whose distinct-article count reaches
# it. article_count() compares with ">=", so the labels say ">=" too (they
# previously read ">" although authors with exactly `size` articles are
# included in the count).
author_article_counts = {
    f"Number of authors with articles >= {size}": article_count(size, author_summary)
    for size in [5, 10, 15, 20, 25, 50, 100]
}

In [11]:
# Display the per-threshold author counts computed above.
author_article_counts

{'Number of authors with articles > 5': 147,
 'Number of authors with articles > 10': 100,
 'Number of authors with articles > 15': 82,
 'Number of authors with articles > 20': 65,
 'Number of authors with articles > 25': 55,
 'Number of authors with articles > 50': 32,
 'Number of authors with articles > 100': 4}

### Section Level Summary

In [12]:
# Number of distinct authors publishing in each section.
(
    combined_dataset
    .groupby("Section")["Author"]
    .nunique()
)

Section
Politics      125
Society       179
UK news       151
World news    226
Name: Author, dtype: int64

In [13]:
# Number of distinct 'Content' values for every (Section, Author) pair.
section_summary = (
    combined_dataset
    .groupby(["Section", "Author"])['Content']
    .nunique()
)

In [14]:
# Move the group keys out of the index into regular columns so the summary
# can be filtered and re-grouped on them.
# NOTE(review): this rebinds the same name, so re-running the cell on its own
# would apply reset_index a second time.
section_summary = section_summary.reset_index()

In [15]:
# For each threshold, count the rows of section_summary whose article count
# reaches it. Each row is one (Section, Author) pair, so an author who reaches
# the threshold in two sections is counted twice. article_count() compares
# with ">=", so the labels say ">=" too (they previously read ">").
section_article_counts = {
    f"Number of authors with articles >= {size}": article_count(size, section_summary)
    for size in [5, 10, 15, 20, 25, 50, 100]
}

In [16]:
# Display the per-threshold counts computed above.
section_article_counts

{'Number of authors with articles > 5': 231,
 'Number of authors with articles > 10': 128,
 'Number of authors with articles > 15': 86,
 'Number of authors with articles > 20': 63,
 'Number of authors with articles > 25': 47,
 'Number of authors with articles > 50': 15,
 'Number of authors with articles > 100': 1}

In [17]:
# Display the summary frame (columns: Section, Author, Content).
section_summary

Unnamed: 0,Section,Author,Content
0,Politics,Aamna Mohdin,3
1,Politics,Aaron Walawalkar,2
2,Politics,Alex Hern,6
3,Politics,Alexandra Topping,6
4,Politics,Alison Rourke,2
...,...,...,...
676,World news,Warren Murray,1
677,World news,Weronika Strzyżyńska,2
678,World news,Will Dean,2
679,World news,William Costa,1


In [18]:
def count_authors(group):
    """Count the rows of ``group`` whose article count reaches each threshold.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame (typically one groupby group) with a numeric ``'Content'``
        column holding an article count per row.

    Returns
    -------
    pandas.Series
        Indexed by ``'>5'``, ``'>10'``, ``'>15'``, ``'>20'``, ``'>25'``,
        ``'>50'`` and ``'>100'``. Each value is the number of rows with
        ``Content >= threshold``; the comparison is inclusive even though the
        labels read ">".
    """
    # Compare the article counts themselves. The earlier version compared
    # group['Content'].count() -- the number of rows in the group -- with the
    # thresholds, so a one-row group always produced zeros no matter how many
    # articles that row represented.
    article_counts = group['Content']
    return pd.Series({
        f'>{minimum}': (article_counts >= minimum).sum()
        for minimum in (5, 10, 15, 20, 25, 50, 100)
    })

In [19]:
# Evaluate the threshold flags once per (Section, Author) pair, then bring the
# group keys back as ordinary columns.
ac = section_summary.groupby(['Section', 'Author']).apply(count_authors).reset_index()

# Pivot the table to get the desired format: one row per section, summing the
# threshold columns. 'Author' is dropped first because pivot_table would
# otherwise apply the 'sum' aggregation to that string column as well.
result = ac.drop(columns='Author').pivot_table(index='Section', aggfunc='sum')


In [20]:
# Minimum article counts to report on
thresholds = [5, 10, 15, 20, 25, 50, 100]


def count_authors_exceeding_thresholds(sub_df):
    """Count the rows of ``sub_df`` that reach each value in ``thresholds``.

    Parameters
    ----------
    sub_df : pandas.DataFrame
        Frame with a numeric ``'Content'`` column (one article count per row).

    Returns
    -------
    pandas.Series
        One entry per threshold, labelled
        ``'Authors_with_content_over_<threshold>'``, holding the number of
        rows with ``Content >= threshold`` (the comparison is inclusive).
    """
    article_counts = sub_df['Content']
    return pd.Series({
        f'Authors_with_content_over_{minimum}': (article_counts >= minimum).sum()
        for minimum in thresholds
    })

# One row per section, one column per threshold
result = (
    section_summary
    .groupby('Section')
    .apply(count_authors_exceeding_thresholds)
)


In [21]:
# Display the per-section threshold table.
result

Unnamed: 0_level_0,Authors_with_content_over_5,Authors_with_content_over_10,Authors_with_content_over_15,Authors_with_content_over_20,Authors_with_content_over_25,Authors_with_content_over_50,Authors_with_content_over_100
Section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Politics,52,26,19,15,12,4,0
Society,52,29,20,13,10,4,0
UK news,60,39,27,22,19,6,1
World news,67,34,20,13,6,1,0


In [22]:
# Total Text_Length across all retained articles. Series.sum() is vectorized,
# whereas the builtin sum() walks the Series one element at a time in Python.
combined_dataset['Text_Length'].sum()

16825114

In [23]:
# Average Text_Length per retained article.
combined_dataset['Text_Length'].mean()

3528.0171943803734

In [24]:
# (rows, columns) of the filtered dataset.
combined_dataset.shape

(4769, 8)

In [25]:
# Number of distinct sections each author has published in.
author_sections = (
    combined_dataset
    .groupby("Author")["Section"]
    .nunique()
)

In [26]:
# Move the group keys out of the index into a regular column so the result
# can be filtered on its 'Section' column.
# NOTE(review): this rebinds the same name, so re-running the cell on its own
# would apply reset_index a second time.
author_sections = author_sections.reset_index()

In [27]:
# Authors who appear in more than two sections (strictly greater than 2).
author_sections.loc[author_sections['Section'] > 2]

Unnamed: 0,Author,Section
0,Aamna Mohdin,4
1,Aaron Walawalkar,4
6,Alex Hern,4
7,Alexandra Topping,4
12,Amelia Gentleman,3
...,...,...
337,Toby Helm,3
341,Tom Wall,4
344,Vanessa Thorpe,3
347,Vikram Dodd,3


In [29]:
# Persist the filtered, combined dataset. The row index is written too,
# because to_csv is called without index=False.
combined_dataset.to_csv(path_or_buf="../Datasets/TGC/combined_data.csv")

In [30]:
# (rows, columns) of the dataset that was just written to disk.
combined_dataset.shape

(4769, 8)