In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('sample keywords with syntax - title.csv')

# read 'list string' as list for the following columns
import ast


def _parse_list_cell(cell):
    """Turn a stringified list read from the CSV back into a Python object.

    String cells are parsed with ``ast.literal_eval``. Non-string cells (e.g.
    the NaN pandas uses for a missing value) become an empty list rather than
    an empty string, so they can be concatenated with the parsed lists later.
    """
    return ast.literal_eval(cell) if isinstance(cell, str) else []


for column in ['pos', 'lemma', 'label']:
    df[column] = df[column].apply(_parse_list_cell)

df.sample(3)

Unnamed: 0,query,title,pos,lemma,label
1890,auto insurance discounts for seniors,People Also Ask,"[NOUN, ADV, VERB]","[person, Also, Ask]","[NSUBJ, ADVMOD, ROOT]"
7141,crest 3d coupons,Crest Coupons for Sep 2019 - $15.00 Off - Coup...,"[VERB, NOUN, ADP, NOUN, NUM, PUNCT, NUM, NOUN,...","[Crest, coupon, for, Sep, 2019, -, $15.00, Off...","[NN, ROOT, PREP, POBJ, NUM, P, NUM, NN, P, APPOS]"
4125,can you buy car insurance online,People Also Ask,"[NOUN, ADV, VERB]","[person, Also, Ask]","[NSUBJ, ADVMOD, ROOT]"


In [2]:
# drop <people also ask> and <video pack> records
is_serp_feature = df['title'].isin(['People Also Ask', 'Video Pack'])

# drop title = nan: keep only rows whose title is an actual string
has_text_title = df['title'].apply(lambda x: isinstance(x, str)).astype(bool)

# Filter once with a combined boolean mask instead of adding a temporary
# 'type' column to a filtered slice (which triggers SettingWithCopyWarning).
df = df.loc[~is_serp_feature & has_text_title].reset_index(drop=True)

## Aggregate the syntax variables at query level in a new dataframe

In [3]:
import itertools

items = ['pos', 'lemma', 'label']


def _concat_lists(lists):
    """Flatten an iterable of lists into one list, preserving order."""
    return list(itertools.chain.from_iterable(lists))


# One row per distinct query, in order of first appearance (sort=False).
# Each column holds the concatenation of that query's per-title lists.
# A single grouped pass replaces the per-query scan of the whole frame and the
# quadratic sum(lists, []) concatenation.
# Note: groupby skips rows whose query is missing (NaN) by default.
titles_by_query = df.groupby('query', sort=False)

df_new = pd.DataFrame(
    {item: titles_by_query[item].agg(_concat_lists) for item in items}
).reset_index()

df_new.sample(3)

Unnamed: 0,query,pos,lemma,label
957,how to get rid of horrible breath,"[NUM, NOUN, PRT, VERB, VERB, ADP, NOUN, NOUN, ...","[9, Ways, to, Get, Rid, of, Bad, Breath, -, We...","[NUM, ROOT, AUX, VMOD, DEP, PREP, NN, NN, P, P..."
1615,what constitutes leadership,"[VERB, PRON, VERB, DET, NOUN, NOUN, ADV, VERB,...","[Learn, What, constitute, A, Great, Leader, Th...","[ROOT, NSUBJ, CCOMP, DET, NN, NSUBJ, ADVMOD, C..."
626,crest pro health clinical mouthwash reviews,"[VERB, ADJ, ADJ, VERB, NOUN, NOUN, NOUN, VERB,...","[Crest, Pro-Health, Clinical, Rinse, Mouthwash...","[ROOT, AMOD, AMOD, NN, NN, NN, DOBJ, ROOT, AMO..."


### Make a lookup table of the unique (pos, lemma, label) combinations for reference

In [4]:
# Flatten every per-query list column end to end, giving one row per token.
# chain.from_iterable is linear, whereas sum(lists, []) re-copies the growing
# accumulator on every addition.
title_syntax_dictionary = pd.DataFrame(
    {item: list(itertools.chain.from_iterable(df_new[item])) for item in items}
)

# keep each distinct (pos, lemma, label) combination once
title_syntax_dictionary = title_syntax_dictionary.drop_duplicates().reset_index(drop=True)
# title_syntax_dictionary.to_csv('title syntax dictionary.csv', index=False)
title_syntax_dictionary.sample(3)

Unnamed: 0,pos,lemma,label
479,NOUN,Simple,NN
9416,NOUN,Office,CONJ
4839,NOUN,Overview,NN


In [5]:
# Frequency table of `label` values (rows) against `pos` values (columns)
pd.crosstab(
    index=title_syntax_dictionary['label'],
    columns=title_syntax_dictionary['pos'],
)

pos,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRON,PRT,PUNCT,VERB,X
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ACOMP,59,0,0,0,0,1,0,0,0,0,1,0
ADVCL,1,0,0,0,0,1,0,0,0,0,35,0
ADVMOD,8,15,117,1,0,1,0,0,5,0,0,1
AMOD,431,0,6,0,0,45,0,0,1,0,60,0
APPOS,9,0,0,0,4,667,24,1,1,0,1,31
ATTR,4,0,0,0,2,56,2,5,0,0,0,1
AUX,0,2,0,0,0,1,0,0,3,0,24,0
AUXPASS,0,0,0,0,0,0,0,0,0,0,3,0
CC,0,0,0,11,0,0,0,0,0,0,0,1
CCOMP,0,0,0,0,0,1,0,0,0,0,49,0


## Create Part of Speech Features

In [6]:
def adj_adv_share(pos_tags):
    """Return the percentage of tags in ``pos_tags`` that are 'ADJ' or 'ADV'.

    The result is on a 0-100 scale, rounded to two decimals. An empty tag
    sequence has no defined share, so NaN is returned instead of raising
    ZeroDivisionError.
    """
    if len(pos_tags) == 0:
        return float('nan')
    hits = pos_tags.count('ADJ') + pos_tags.count('ADV')
    return round(100 * hits / len(pos_tags), 2)


# Share (in percent) of adj & adv tags among all POS tags of each query
df_new['adj adv'] = df_new['pos'].apply(adj_adv_share)

df_new.sample(3)

Unnamed: 0,query,pos,lemma,label,adj adv
1226,orajel price,"[NOUN, NOUN, NOUN, NOUN, NOUN, NOUN, NOUN, PUN...","[Orajel, |, Walgreens, Orajel, Maximum, Streng...","[NN, NN, ROOT, NN, NN, NN, ROOT, P, PREP, POBJ...",6.15
1649,what happens if you cancel car insurance,"[VERB, PRON, VERB, NOUN, NOUN, NOUN, PUNCT, PU...","[Can, I, Cancel, Car, Insurance, Anytime, ?, -...","[AUX, NSUBJ, ROOT, NN, NN, DOBJ, P, P, DET, RO...",4.88
1748,where can you buy spry gum,"[NOUN, NOUN, PUNCT, NOUN, NOUN, NOUN, NOUN, NO...","[Spry, Gum, -, Walmart.com, Spry, Fresh, Natur...","[NN, NN, P, ROOT, ROOT, NN, NN, DEP, DEP, P, A...",3.23


In [7]:
# TODO: add more syntax features (only 'adj adv' is implemented so far)

In [8]:
# Keep only the query key and the engineered feature, then write them out
model_columns = ['query', 'adj adv']
df_new = df_new.loc[:, model_columns]
df_new.to_csv('syntax for model.csv', index=False)