<a href="https://colab.research.google.com/github/anaborne/RavenPack-Data-Aggregation/blob/main/updated_ravenpack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# 1. Read the raw sentiment export
df = pd.read_excel('RP_Sentiment_Data.xlsx', engine='openpyxl')

# 2. Identify the first column. It is selected by position here; the
#    aggregation cell further down refers to it by name ('rpa_date_utc').
first_col = df.columns[0]

# 3. Parse it as datetimes; values that cannot be parsed become NaT
parsed_dates = pd.to_datetime(df[first_col], errors='coerce')

# 4. Surface rows without a usable date instead of losing them silently:
#    groupby excludes missing keys by default, so these rows will not
#    contribute to any (company, year) summary.
n_undated = int(parsed_dates.isna().sum())
if n_undated:
    print(
        f"Warning: {n_undated} of {len(df)} rows have a missing or unparseable "
        f"date in '{first_col}'; their year is left as <NA>."
    )

# 5. Replace the dates with their calendar year. The nullable 'Int64' dtype
#    keeps years as integers (2000, not 2000.0) even when some rows are NaT.
df[first_col] = parsed_dates.dt.year.astype('Int64')

df.head(10)

Unnamed: 0,rpa_date_utc,rp_entity_id,entity_name,country_code,event_sentiment_score,composite_sentiment_score
0,2000,0E1492,Glencore PLC,CH,0.39,0.04
1,2000,0E1492,Glencore PLC,CH,,0.02
2,2000,0E1492,Glencore PLC,CH,,0.1
3,2000,0E1492,Glencore PLC,CH,0.39,0.04
4,2000,0E1492,Glencore PLC,CH,0.22,0.04
5,2000,0E1492,Glencore PLC,CH,0.39,0.04
6,2000,0E1492,Glencore PLC,CH,0.22,0.04
7,2000,0E1492,Glencore PLC,CH,,0.0
8,2000,0E1492,Glencore PLC,CH,,0.0
9,2000,0E1492,Glencore PLC,CH,,0.04


In [2]:
# Bucket the rows by company and year; every aggregate below is computed per bucket.
group_keys = ['entity_name', 'rpa_date_utc']
grouped = df.groupby(group_keys)

# Output columns, in order: sign tallies for the event score, sign tallies for
# the composite score, then the mean of each score.
# NOTE(review): with float columns a missing score (NaN) fails all three
# comparisons, so it is not counted in any tally — confirm the column dtypes.
aggregations = {
    'event_pos_count': pd.NamedAgg(column='event_sentiment_score', aggfunc=lambda s: s.gt(0).sum()),
    'event_neg_count': pd.NamedAgg(column='event_sentiment_score', aggfunc=lambda s: s.lt(0).sum()),
    'event_zero_count': pd.NamedAgg(column='event_sentiment_score', aggfunc=lambda s: s.eq(0).sum()),
    'comp_pos_count': pd.NamedAgg(column='composite_sentiment_score', aggfunc=lambda s: s.gt(0).sum()),
    'comp_neg_count': pd.NamedAgg(column='composite_sentiment_score', aggfunc=lambda s: s.lt(0).sum()),
    'comp_zero_count': pd.NamedAgg(column='composite_sentiment_score', aggfunc=lambda s: s.eq(0).sum()),
    'event_avg': pd.NamedAgg(column='event_sentiment_score', aggfunc='mean'),
    'comp_avg': pd.NamedAgg(column='composite_sentiment_score', aggfunc='mean'),
}

# Aggregate, then move the (company, year) group index back into ordinary columns.
summary = grouped.agg(**aggregations).reset_index()

# Order rows by company, then year, and renumber them from zero.
final_df = summary.sort_values(by=group_keys).reset_index(drop=True)
# └─ order the rows first by company name, then by year, and reset row numbering


In [3]:
# Load the lookup table that carries each entity's exporter group.
df_map = pd.read_excel('rp_id_matched.xlsx', engine='openpyxl')

# Keep only the join key and the group label, and collapse exact repeats:
# a repeated (entity_name, exportergroup) pair would otherwise duplicate
# every matching (company, year) row in the merge below.
exporter_lookup = (
    df_map[['entity_name', 'exportergroup']]
    .drop_duplicates()
    .rename(columns={'exportergroup': 'exporter_group'})
)

# Entities that still appear more than once are listed under several
# different groups. They are kept (as before) but reported, because each of
# their summary rows will be emitted once per group.
ambiguous_entities = exporter_lookup.loc[
    exporter_lookup['entity_name'].duplicated(keep=False), 'entity_name'
].unique()
if len(ambiguous_entities):
    print(
        f"Warning: {len(ambiguous_entities)} entities map to more than one "
        "exporter group; their rows will appear once per group."
    )

# Drop any 'exporter_group' column left by a previous run of this cell so
# that re-running it does not produce a second, duplicate column.
final_df = (
    final_df
    .drop(columns='exporter_group', errors='ignore')
    .merge(
        exporter_lookup,
        on='entity_name',  # match rows by entity_name
        how='left',        # keep all sentiment rows, even if no match
    )
)
final_df.to_excel('final_RP_data.xlsx', index=False)