### Analyzing the language in news articles

How are accidents involving pedestrians and cyclists described?

In [1]:
import pandas as pd
import spacy

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
nlp = spacy.load('en_core_web_lg')

In [3]:
def is_passive(sent) -> bool:
    '''
    TODO this is just something I put together in a very adhoc way and
    is probably not great, although sort of seems to do OK. Should improve.
    '''
    doc = nlp(sent)
    
    tags = [token.dep_ for token in doc]
    
    contains_agent = 'agent' in tags
    contains_nsubjpass = 'nsubjpass' in tags
    contains_aux_verb = 'auxpass' in tags 
    
    return contains_agent or contains_nsubjpass or contains_aux_verb

In [4]:
is_passive("A pedestrian was struck")

True

In [5]:
is_passive("A car struck a pedestrian")

False

This data was scraped from https://www.newsbreak.com/. @TODO George can fill in details.

In [6]:
all_df = pd.read_csv("articles-all.csv")
all_df.head()

Unnamed: 0.1,Unnamed: 0,Date,City,State,Title,Short Description,Long Description,Involves cyclist or pedestrian,URL
0,0,2022-01-12,Cleves,OH,2 injured in wrong-way crash on I-275 in Kento...,"KENTON COUNTY, Ky. — Two people were injured i...","KENTON COUNTY, Ky. — Two people were injured i...",False,https://www.newsbreak.com/news/2483276062044/2...
1,1,2022-01-11,Cleves,OH,"Pedestrian hit, killed in West Chester on Satu...",A spokesperson for West Chester Township said ...,"Read the latest Cincinnati, Ohio news and weat...",True,https://www.newsbreak.com/news/2483036184238/p...
2,2,2022-01-11,Cleves,OH,Officials working to determine cause of fiery ...,"KENTON COUNTY, Ky. (WXIX) - The fiery explosio...","KENTON COUNTY, Ky. (WXIX) - The fiery explosio...",False,https://www.newsbreak.com/news/2483089711632/o...
3,3,2022-01-12,Cleves,OH,Injuries reported after crash on I-275 in Nort...,"FT. WRIGHT, Ky. (WKRC) - Injuries were reporte...","FT. WRIGHT, Ky. (WKRC) - Injuries were reporte...",False,https://www.newsbreak.com/news/2483307274479/i...
4,4,2022-01-11,Cleves,OH,Covington residents urged to shelter in place ...,"KENTON COUNTY, Ky. (WXIX) - A shelter-in-place...","KENTON COUNTY, Ky. (WXIX) - A shelter-in-place...",False,https://www.newsbreak.com/news/2482198518315/c...


In [7]:
all_df.shape

(26921, 9)

In [8]:
all_df = all_df.rename(columns={'Involves cyclist or pedestrian': 'cyclist_or_ped'})

Assemble a column with predictions regarding passive/active voice for each headline.

In [9]:
is_passive_column = []
for title in all_df["Title"].values:
    is_passive_column.append(int(is_passive(title)))
all_df["passive"] = is_passive_column

In [10]:
log_reg = smf.logit("passive ~ cyclist_or_ped", data=all_df).fit()

Optimization terminated successfully.
         Current function value: 0.237945
         Iterations 7


In [11]:
print(log_reg.summary())

                           Logit Regression Results                           
Dep. Variable:                passive   No. Observations:                26921
Model:                          Logit   Df Residuals:                    26919
Method:                           MLE   Df Model:                            1
Date:                Fri, 08 Jul 2022   Pseudo R-squ.:                 0.02550
Time:                        09:47:02   Log-Likelihood:                -6405.7
converged:                       True   LL-Null:                       -6573.3
Covariance Type:            nonrobust   LLR p-value:                 6.993e-75
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -2.7591      0.026   -105.169      0.000      -2.810      -2.708
cyclist_or_ped[T.True]     1.6333      0.078     20.808      0.000       1.479       1.787


In [22]:
def get_subjs(sent) -> list:
    doc = nlp(sent)
    subj_list = []
    
    for token in doc:
        if token.dep_ == "nsubj":
            subj_list.append(str(token))
            
    return subj_list

In [24]:
sent = "The driver hit a pedestrian"
get_subjs(sent)

['driver']

In [29]:
vru_df = all_df[all_df["cyclist_or_ped"]==True]
vru_df.shape

(988, 10)

In [36]:
all_subjects = []
for title in all_df["Title"].values:
    all_subjects.append(", ".join(get_subjs(title)).lower())
all_df["subjects"] = all_subjects

In [37]:
all_subjects

['',
 '',
 '',
 'injuries',
 'residents',
 '2',
 'pedestrian',
 '',
 'fire',
 'police, driver',
 '',
 'shelter',
 'investigators, doors',
 'flames',
 'crash',
 'fire',
 'plane',
 'shelter',
 'teen',
 'teen',
 'fire',
 'teens',
 '',
 '',
 '1, vehicle',
 'collision',
 '',
 '',
 '',
 'toddler, sister, mom',
 '',
 'responders',
 'truck',
 'truck',
 'vehicles',
 'fire',
 'road, car',
 'fire',
 'road',
 '',
 'fire',
 'responders',
 'house, neighbor',
 '',
 '',
 '',
 'fatality',
 'crash',
 '',
 '',
 'crews',
 'car',
 'bus, injuries',
 '',
 'community',
 'lanes',
 'fire',
 'police, person',
 '',
 'shop',
 '',
 '',
 'crews',
 'fire',
 'crash',
 'crash',
 'person',
 'cocktail, queens, firefighters',
 'cruiser',
 'victims, youngest',
 '',
 'cruiser',
 'fire',
 'crash, fdny',
 'victims',
 '',
 'victims',
 'inhalation',
 'old, officials',
 '',
 'families, these',
 'examiner, victims',
 'examiner',
 '',
 '',
 'car',
 'man, 2',
 '',
 '',
 'crash',
 'accident, others',
 'crews',
 'people',
 '',
 '',
 

In [43]:
driver, car = [], []
for subjs in all_df[all_df["cyclist_or_ped"]==True]["subjects"].values:
    if "driver" in subjs.split(","):
        driver.append(subjs)
    elif "car" in subjs.split(","):
        car.append(subjs)
        

In [44]:
len(driver)

13

In [45]:
len(car)

9

In [46]:
car

['car',
 'car, woman',
 'car',
 'car',
 'car',
 'car, bicyclist, spd',
 'car',
 'car',
 'car']

In [42]:
car

['car',
 'car, woman',
 'car',
 'car',
 'car',
 'car, bicyclist, spd',
 'pedestrian, car',
 'car',
 'car',
 'carthage, motorcyclist',
 'car']