# HW07: Parsing

Remember that this homework is graded on completion. **You can skip one section of this homework.**

In [None]:
import pandas as pd
import nltk
# Load the training CSV; its three columns are renamed right below.
df = pd.read_csv('train.csv')

df.columns = ["label", "title", "lead"]
label_map = {1:"world", 2:"sport", 3:"business", 4:"sci/tech"}
def replace_label(x):
    """Return the topic name that `label_map` assigns to the numeric label `x`.

    Raises:
        KeyError: if `x` is not a key of `label_map`.
    """
    return label_map[x]
df["label"] = df["label"].apply(replace_label)
df["text"] = df["title"] + " " + df["lead"]
# Only use (at most) 10K datapoints. The fixed random_state makes the sample,
# and therefore every ranking printed further down, identical after each
# "Restart & Run All"; min() avoids a ValueError when the file has fewer rows.
df = df.sample(n=min(10000, len(df)), random_state=42)
df.head()

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

#TODO preprocess the corpus using spacy or load the pre-processed corpus
def _content_lemmas(doc):
    """Return the lower-cased lemmas of the tokens in `doc` that are neither punctuation nor stop words."""
    return [wd.lemma_.lower() for wd in doc if not wd.is_punct and not wd.is_stop]

def preprocess(text):
    """Parse `text` with the loaded spaCy pipeline and return its content lemmas.

    Args:
        text: raw string to analyse.

    Returns:
        list of lower-cased lemmas, skipping punctuation and stop-word tokens.
    """
    return _content_lemmas(nlp(text))

# Run the spaCy pipeline once per document and derive both columns from the same
# parse; calling nlp() separately for 'tokens' and 'processed' parses every text twice.
docs = df['text'].apply(nlp)
df['tokens'] = docs.apply(_content_lemmas)
df['processed'] = docs

In [None]:
# Show the first rows of df, which now also has the 'tokens' and 'processed' columns.
df.head()

### Information Extraction

In [None]:
def extract_subject_verb_pairs(sent):
    """Collect (subject, head) lemma pairs from a parsed sentence or document.

    Args:
        sent: iterable of tokens exposing `dep_`, `lemma_` and `head`
            (a parsed spaCy Doc in this notebook).

    Returns:
        list of (subject_lemma, head_lemma) tuples, both lower-cased, one per
        token whose dependency label is "nsubj", in token order.
    """
    found = []
    for token in sent:
        if token.dep_ != "nsubj":
            continue
        # The syntactic head of a nominal subject is the word it is the subject of.
        found.append((token.lemma_.lower(), token.head.lemma_.lower()))
    return found

##TODO extract the subject-verbs pairs and print the result for the first document
# Attach the (subject, verb) lemma pairs of every parsed document.
df['sbj_vb_pair'] = df['processed'].apply(extract_subject_verb_pairs)

from collections import Counter

# Tally each pair over the whole corpus.
counter = Counter(pair for doc_pairs in df['sbj_vb_pair'] for pair in doc_pairs)

# print the result for the first document
print(f"The result for the first document is {df['sbj_vb_pair'].iloc[0]}")
print(" ")

##TODO create a list ranking the most common pairs and print the first 10 items
print("The most common pairs are: ")
for pair, counts in counter.most_common(10):
    print(pair, counts)

In [None]:
##TODO do the same for verbs-object pairs ('dobj')
def extract_verb_object_pairs(sent):
    """Collect (head, direct object) lemma pairs from a parsed sentence or document.

    Args:
        sent: iterable of tokens exposing `dep_`, `lemma_` and `head`
            (a parsed spaCy Doc in this notebook).

    Returns:
        list of (head_lemma, object_lemma) tuples, both lower-cased, one per
        token whose dependency label is "dobj", in token order.
    """
    return [
        (token.head.lemma_.lower(), token.lemma_.lower())
        for token in sent
        if token.dep_ == "dobj"
    ]

# Store the (verb, object) lemma pairs found in each parsed document.
df['vb_obj_pair'] = df['processed'].apply(extract_verb_object_pairs)

##TODO create a list ranking the most common pairs and print the first 10 items
# Count how often each pair occurs over all documents.
counter = Counter(pair for doc_pairs in df['vb_obj_pair'] for pair in doc_pairs)

print("The most common pairs are: ")
for vb_obj_pair, counts in counter.most_common(10):
    print(vb_obj_pair, counts)

In [None]:
##TODO do the same for adjectives-nouns pairs ('amod')
##TODO create a list ranking the most common pairs and print the first 10 items

def extract_adj_noun_pairs(sent):
    """Collect (modifier, head) lemma pairs from a parsed sentence or document.

    Args:
        sent: iterable of tokens exposing `dep_`, `lemma_` and `head`
            (a parsed spaCy Doc in this notebook).

    Returns:
        list of (modifier_lemma, head_lemma) tuples, both lower-cased, one per
        token whose dependency label is "amod", in token order.
    """
    modifiers = (tok for tok in sent if tok.dep_ == "amod")
    return [(adj.lemma_.lower(), adj.head.lemma_.lower()) for adj in modifiers]

# Store the (adjective, noun) lemma pairs found in each parsed document.
df['adj_noun_pair'] = df['processed'].apply(extract_adj_noun_pairs)

##TODO create a list ranking the most common pairs and print the first 10 items

# Count how often each pair occurs over all documents.
counter = Counter(pair for doc_pairs in df['adj_noun_pair'] for pair in doc_pairs)

print("The most common pairs are: ")
for adj_noun_pair, counts in counter.most_common(10):
    print(adj_noun_pair, counts)

### Exploring cross label dependencies

In [None]:
##TODO extract all the subject-verbs and verbs-object pairs for the verb "win"
def search_verbs(x, verb="win", verb_index=None):
    """Return the pairs in `x` whose verb is `verb`.

    Args:
        x: iterable of 2-tuples of lemmas, e.g. one cell of the 'sbj_vb_pair'
            or 'vb_obj_pair' column.
        verb: verb lemma to look for (default "win").
        verb_index: position of the verb inside each pair (1 for
            (subject, verb), 0 for (verb, object)). None keeps the previous
            behaviour of matching `verb` at either position.

    Returns:
        list of the matching pairs, in their original order.
    """
    if verb_index is None:
        return [pair for pair in x if verb in pair]
    return [pair for pair in x if pair[verb_index] == verb]

# The verb is the second element of a (subject, verb) pair and the first element
# of a (verb, object) pair. Matching on that slot only keeps pairs such as
# ("earn", "win"), where "win" is the object noun, out of the results.
df['sbj_vb_win'] = df['sbj_vb_pair'].apply(lambda pairs: search_verbs(pairs, verb_index=1))
df['vb_obj_win'] = df['vb_obj_pair'].apply(lambda pairs: search_verbs(pairs, verb_index=0))

In [None]:
# Distinct subject-verb "win" pairs per document, skipping documents that have none.
[list(set(win_pairs)) for win_pairs in df['sbj_vb_win'] if len(win_pairs) > 0]

In [None]:
# Distinct verb-object "win" pairs per document, skipping documents that have none.
[list(set(win_pairs)) for win_pairs in df['vb_obj_win'] if len(win_pairs) > 0]

In [None]:
def most_common_vb_obj(df, label):
    """Rank the verb-object pairs of all documents carrying `label`.

    Args:
        df: DataFrame with a 'label' column and a 'vb_obj_pair' column whose
            cells are iterables of (verb, object) tuples.
        label: label value selecting the rows to count.

    Returns:
        pandas.DataFrame with columns 'pair' and 'count', one row per distinct
        pair, most frequent first; empty (but with both columns) when no row
        matches `label`.
    """
    counter = Counter()
    for pairs in df.loc[df['label'] == label, 'vb_obj_pair']:
        counter.update(pairs)

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame for every appended row.
    records = [{'pair': pair, 'count': count} for pair, count in counter.most_common()]
    return pd.DataFrame(records, columns=['pair', 'count'])

In [None]:
def most_common_sbj_vb(df, label):
    """Rank the subject-verb pairs of all documents carrying `label`.

    Args:
        df: DataFrame with a 'label' column and a 'sbj_vb_pair' column whose
            cells are iterables of (subject, verb) tuples.
        label: label value selecting the rows to count.

    Returns:
        pandas.DataFrame with columns 'pair' and 'count', one row per distinct
        pair, most frequent first; empty (but with both columns) when no row
        matches `label`.
    """
    counter = Counter()
    for pairs in df.loc[df['label'] == label, 'sbj_vb_pair']:
        counter.update(pairs)

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame for every appended row.
    records = [{'pair': pair, 'count': count} for pair, count in counter.most_common()]
    return pd.DataFrame(records, columns=['pair', 'count'])

In [None]:
##TODO for each label create a list ranking the most common subject-verbs pairs and one for the most common verbs-object pairs
# Build one ranking table per label, first for verb-object pairs and then for
# subject-verb pairs; the labels are processed in the same order both times.
_labels = ("sport", "business", "sci/tech", "world")

sport_vb_obj, business_vb_obj, sci_tech_vb_obj, world_vb_obj = (
    most_common_vb_obj(df, topic) for topic in _labels
)

sport_sbj_vb, business_sbj_vb, sci_tech_sbj_vb, world_sbj_vb = (
    most_common_sbj_vb(df, topic) for topic in _labels
)

In [None]:
##TODO print the 10 most common pairs for each of the two lists for the labels "sport" and "business"
# Print each heading followed by the ten highest-ranked rows of its table.
for heading, ranking in (
    ("Most common verbs-object pairs for the label sport: ", sport_vb_obj),
    ("Most common verbs-object pairs for the label business: ", business_vb_obj),
    ("Most common subject-verbs pairs for the label sport: ", sport_sbj_vb),
    ("Most common subject-verbs pairs for the label business: ", business_sbj_vb),
):
    print(heading)
    print(ranking.head(10))