In [1]:
!pip install spacy pandas pyarrow
import pandas as pd
import spacy
import re
from collections import Counter



In [2]:
# Fetch the small English pipeline that spacy.load("en_core_web_sm") needs later on.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m128.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
# Colab-only step: prompts for a file upload and saves it to the working directory.
# NOTE(review): `uploaded` is not referenced again in the visible cells; the file is
# read back from disk by name instead.
from google.colab import files
uploaded = files.upload()

Saving malawi.parquet to malawi.parquet


In [5]:
# Load the uploaded news articles into a DataFrame and preview the first rows.
df_malawi = pd.read_parquet("malawi.parquet")
df_malawi.head()

Unnamed: 0,news_id,date,text
0,1,2019-01-11T10:32:21,Heavy rains which fell for about three hours i...
1,2,2019-03-10T09:48:43,The Department of Disaster Management Affairs ...
2,3,2019-03-12T17:18:22,UN Secretary-General António Guterres has exte...
3,4,2019-03-13T03:08:25,President Peter Mutharika on Tuesday assured M...
4,5,2019-03-15T01:43:40,"From smiles of rains that crops are to grow, s..."


In [6]:
# List the available columns; the NER cells read the `text` column.
df_malawi.columns

Index(['news_id', 'date', 'text'], dtype='object')

In [7]:
# Load the small English spaCy pipeline once; the NER cells reuse this `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [8]:
# Exploratory sample: only the first N_SAMPLE_ARTICLES non-null articles are processed.
N_SAMPLE_ARTICLES = 100
texts = df_malawi["text"].dropna().head(N_SAMPLE_ARTICLES)

# Only the `ner` component is needed for doc.ents. Disabling just tagger/parser would
# still run attribute_ruler and lemmatizer (which rely on the disabled tagger's POS
# tags), so skip every non-NER component. In en_core_web_sm the `ner` component has
# its own internal tok2vec layer, so the shared `tok2vec` can be skipped as well.
non_ner_components = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]

# Flat list with one entry per detected entity, holding that entity's label (ORG, GPE, ...).
entity_labels = [
    ent.label_
    for doc in nlp.pipe(texts, disable=non_ner_components)
    for ent in doc.ents
]




In [9]:
# Tally how often each entity label occurs, then rank labels from most to least frequent.
# (`Counter` is already imported in the notebook's first cell.)
label_counts = Counter(entity_labels)

df_entities = (
    pd.DataFrame(list(label_counts.items()), columns=["Entity_Label", "Count"])
    .sort_values(by="Count", ascending=False)
    .reset_index(drop=True)
)
df_entities


Unnamed: 0,Entity_Label,Count
0,ORG,776
1,PERSON,770
2,CARDINAL,456
3,GPE,342
4,DATE,296
5,NORP,131
6,LOC,66
7,QUANTITY,39
8,TIME,38
9,MONEY,30


In [10]:
# Show the 10 most frequent entity labels (df_entities is sorted by Count, descending).
df_entities.head(10)

Unnamed: 0,Entity_Label,Count
0,ORG,776
1,PERSON,770
2,CARDINAL,456
3,GPE,342
4,DATE,296
5,NORP,131
6,LOC,66
7,QUANTITY,39
8,TIME,38
9,MONEY,30


In [11]:
# Collect the text of every entity labelled PERSON across the sampled articles.
# Only the `ner` component is needed for doc.ents, so every other component is skipped:
# disabling just tagger/parser would still run attribute_ruler and lemmatizer, and in
# en_core_web_sm `ner` has its own internal tok2vec layer, so `tok2vec` can go too.
person_names = [
    ent.text.lower()  # lowercase so casing variants of the same name are counted together
    for doc in nlp.pipe(
        texts,
        disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"],
    )
    for ent in doc.ents
    if ent.label_ == "PERSON"
]




In [12]:
# Count mentions per (lowercased) PERSON string and rank them, most-mentioned first.
vip_counts = Counter(person_names)

df_vips = (
    pd.DataFrame(list(vip_counts.items()), columns=["Person", "Mentions"])
    .sort_values(by="Mentions", ascending=False)
    .reset_index(drop=True)
)

df_vips.head(10)


Unnamed: 0,Person,Mentions
0,malawi,139
1,sharing,59
2,chikwawa,43
3,dausi,19
4,phalombe,16
5,nicholas dausi,16
6,zomba,13
7,mulanje,11
8,mia,11
9,peter mutharika,9


### Q2 Report Commentary: VIP Mentions and NER Accuracy

Using spaCy’s `en_core_web_sm` model, we extracted all entities labeled as `PERSON` from the first 100 articles of the Malawi flood news dataset and grouped them by lowercase text. The top-mentioned entities include **"malawi" (139 mentions)**, **"chikwawa"**, and **"phalombe"** — all of which are actually **geographical locations**, not individuals — and the second-ranked entry, **"sharing" (59 mentions)**, is not a name at all. This highlights a key limitation of the small spaCy model: it often misclassifies **place names and local terms** as people, especially in low-resource, non-Western contexts. True VIPs such as **"nicholas dausi"**, **"mia"**, and **"peter mutharika"** were correctly identified but appeared alongside several false positives. Note also that surname-only mentions are counted separately from full names (e.g. **"dausi"**, 19 mentions, vs. **"nicholas dausi"**, 16 mentions), so per-person totals are understated. To improve accuracy, one could use a more robust model like `en_core_web_trf`, incorporate region-specific gazetteers, or fine-tune the NER system on annotated local news data.
