<a href="https://colab.research.google.com/github/ayenko/sifted/blob/main/Editorial_Content_Analysis_Sep_23.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Load Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import numpy as np

In [None]:
file_path = '/content/editorial_content_sep_23.csv'

In [None]:
# Read the CSV file into a pandas DataFrame
try:
    df = pd.read_csv(file_path, encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv(file_path, encoding='latin-1')
    except UnicodeDecodeError:
        df = pd.read_csv(file_path, encoding='cp1252')

In [None]:
df.head()

Unnamed: 0,Page path + query string,Page title,Sifted sector,Sifted category,Article publish date,Article Publish Date New,APD YEAR,APD MONTH,Geography,Country,sifted_membership_level,Views
0,/articles/xlinks-morocco-uk-electricity-cable,The ‘crazy’ £20bn subsea cable to bring Morocc...,Climate Tech,Analysis,2023-08-03,03/08/2023,2023.0,8.0,UK,United Kingdom,Anonymous,25782.0
1,/articles/pitch-deck-mistral,See the pitch memo that raised €105m for four-...,Deeptech,Analysis,2023-06-21,21/06/2023,2023.0,6.0,United States,United States,Anonymous,18343.0
2,/articles/tech-broetry-is-out-of-control,Tech 'broetry' is out of control and needs to ...,Startup Life,Opinion,2023-05-08,08/05/2023,2023.0,5.0,UK,United Kingdom,Anonymous,18159.0
3,/articles/pitch-deck-mistral,See the pitch memo that raised €105m for four-...,Deeptech,Analysis,2023-06-21,21/06/2023,2023.0,6.0,UK,United Kingdom,Anonymous,15969.0
4,/articles/electricity-from-air-tesla-cascatachuva,CascataChuva is building on Nikola Tesla's dre...,Climate Tech,Analysis,2023-08-10,10/08/2023,2023.0,8.0,UK,United Kingdom,Anonymous,12878.0


In [None]:
# Convert the 'Article Publish Date New' column to datetime
df['Article Publish Date New'] = pd.to_datetime(df['Article Publish Date New'], format='%d/%m/%Y')

# Define the date range
start_date = pd.to_datetime('2022-09-01', format='%Y-%m-%d')
end_date = pd.to_datetime('2023-08-31', format='%Y-%m-%d')

# Filter the DataFrame based on the date range and Views
filtered_df = df[(df['Article Publish Date New'] >= start_date) &
                 (df['Article Publish Date New'] <= end_date)
                 & (df['Views'] >= 20)]

# Group by 'Sifted category' and count unique articles
category_counts = filtered_df.groupby('Sifted category')['Page path + query string'].nunique()

# Display the result
print(category_counts)

Sifted category
Analysis        669
How To           49
Interview        57
News            395
Opinion          88
Places            8
Podcast          31
Sifted Talks     13
Name: Page path + query string, dtype: int64


In [None]:
# Handle missing values in 'Sifted sector' column
df['Sifted sector'].fillna('Unknown', inplace=True)

In [None]:
# Define a function to map Sifted sector to Sifted sector group
def map_sector_to_group(sector):
    if 'Advertising' in sector or sector in ['Creative', 'Ecommerce', 'Entertainment', 'Food', 'Media', 'Sustainability']:
        return 'Consumer'
    elif sector in ['Innovation Thought Leaders', 'Innovation How To', 'Innovation Case Studies']:
        return 'Corporate Innovation'
    elif sector in ['Agritech', 'Artificial Intelligence', 'Biotech', 'Blockchain', 'Cybersecurity', 'Data', 'Drones', 'Energy', 'Hardware', 'Industry 4.0', 'Internet of Things', 'Quantum',
                    'Robotics', 'Software & SaaS', 'Telecoms' 'VR/AR'] or 'Software' in sector:
        return 'Deeptech'
    elif sector in ['B2B Fintech', 'Consumer Fintech', 'Cryptocurrency', 'Digital Banks', 'Insurance', 'Payments']:
        return 'Fintech'
    elif sector in ['Digital Health', 'Femtech', 'Mental Health', 'Pharma', 'Care', 'Coronavirus', 'Medtech']:
        return 'Healthtech'
    elif sector in ['Logistics', 'Micromobility', 'Transport', 'Travel', 'Cities']:
        return 'Mobility'
    elif 'Academic'in sector or 'Charity' in sector or sector in ['Edtech', 'Government'] or 'Policy' in sector or 'Universities' in sector:
        return 'Public & Academic'
    elif 'Accelerators' in sector or 'Communities' in sector or 'Diversity' in sector or 'Events' in sector or \
            'Funding' in sector or 'Future of Work' in sector or 'Hiring' in sector or 'Wellbeing' in sector or sector in ['Product', 'Worksapces', 'How To']:
        return 'Startup Life'
    elif sector in ['Angel Investment', 'VC', 'Venture Capital', 'Impact Investment']:
        return 'Venture Capital'
    elif sector in ['Sustainability', 'Climate Tech', 'Greentech']:
        return 'Services'
    elif 'Communications' in sector or sector in ['Finance', 'Consulting', 'Sales']:
        return 'Climate Tech'
    else:
        return sector

In [None]:
# Apply the mapping function to create the new column
df['Sifted sector group'] = df['Sifted sector'].apply(map_sector_to_group)

In [None]:
# Display the DataFrame with the new column
print(df[['Sifted sector', 'Sifted sector group']])

          Sifted sector Sifted sector group
0          Climate Tech            Services
1              Deeptech            Deeptech
2          Startup Life        Startup Life
3              Deeptech            Deeptech
4          Climate Tech            Services
...                 ...                 ...
102694         Deeptech            Deeptech
102695     Startup Life        Startup Life
102696  Venture Capital     Venture Capital
102697          Fintech             Fintech
102698          Unknown             Unknown

[102699 rows x 2 columns]


In [None]:
# Filter the DataFrame based on the date range and Views
filtered_df = df[(df['Article Publish Date New'] >= start_date) &
                 (df['Article Publish Date New'] <= end_date) &
                 (df['Views'] > 20)]

In [None]:
# Group by 'Sifted sector group' and count unique articles
sector_group_counts = filtered_df.groupby('Sifted sector group')['Page path + query string'].nunique()

# Display the result
print(sector_group_counts)

Sifted sector group
Climate Tech           1
Consumer              74
Deeptech             193
Fintech              133
Healthtech            83
Mobility              23
Public & Academic     32
SVB News               3
Services             146
Startup Life         286
Unknown               12
VR/AR                  1
Venture Capital      332
Name: Page path + query string, dtype: int64
