# Exploring Quora data

We want to investigate whether forum conversations can be used to extract structured data for XA.

In [22]:
import pandas as pd
import numpy as np
import spacy
import en_core_web_sm
import textacy
import nltk

from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem.porter import *

# Load the en_core_web_sm spaCy pipeline once; extractEntities below reuses this shared `nlp` object.
nlp = en_core_web_sm.load()

In [23]:
# Read the Quora CSV (path is relative to the notebook's working directory).
# The extraction cell further down reads its "question1" column.
rows = pd.read_csv("data/quora.csv")

In [24]:
# Keep only the first 1000 rows (file order, not a random sample) --
# presumably to keep the per-row spaCy parsing quick while exploring.
data = rows.head(1000)

In [25]:
# Porter stemmer instance (PorterStemmer comes from the `nltk.stem.porter` wildcard import).
# NOTE(review): no cell visible here uses `stemmer` -- confirm it is still needed.
stemmer = PorterStemmer()

def recognizePOS(parsed, POS):
    """Return the ``orth_`` text of every token in ``parsed`` tagged as ``POS``.

    Parameters
    ----------
    parsed : iterable
        Tokens exposing ``pos_`` and ``orth_`` attributes (e.g. a spaCy Doc).
    POS : str
        Part-of-speech label to keep, compared for equality against ``pos_``.

    Returns
    -------
    list
        Matching token texts in their original order; empty if none match.
    """
    return [token.orth_ for token in parsed if token.pos_ == POS]


def extractEntities(text):
    """Extract noun chunks, POS word lists and subject/verb/object triples from a text.

    The text is lower-cased and parsed with the shared ``nlp`` pipeline. SVO
    triples are taken from that full parse. Noun chunks and the NOUN / VERB /
    ADJ word lists are taken from a second parse of the lemmatised text with
    stop-word lemmas removed.

    Parameters
    ----------
    text : str
        Text to analyse. ``None`` and float NaN (how pandas represents an
        empty CSV cell) are treated as missing and yield an all-empty row
        instead of raising ``AttributeError``; any other non-string value is
        converted with ``str()``.

    Returns
    -------
    pd.Series
        Six positional entries (index 0-5):
        0 lower-cased text, 1 noun-chunk strings, 2 nouns, 3 verbs,
        4 adjectives, 5 subject/verb/object triples.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        is_missing = text is None or (isinstance(text, float) and text != text)
        if is_missing:
            # Same 6-slot shape as a normal result so DataFrame.apply still
            # assembles a rectangular frame.
            return pd.Series(data=["", [], [], [], [], []])
        text = str(text)

    text = text.lower()
    parsed = nlp(text)

    # Collect the triples right away (from the full parse) rather than holding
    # a possibly lazy iterator until the return statement.
    svos = list(textacy.extract.subject_verb_object_triples(parsed))

    # Keep a lemma only when its vocabulary entry is not flagged as a stop word.
    kept_lemmas = [
        token.lemma_ for token in parsed if not nlp.vocab[token.lemma_].is_stop
    ]

    # Second parse: noun chunks and POS lists come from the filtered lemmas,
    # kept under its own name so it cannot be confused with the full parse.
    filtered = nlp(" ".join(kept_lemmas))

    return pd.Series(data=[text,
                           [chunk.text for chunk in filtered.noun_chunks],
                           recognizePOS(filtered, "NOUN"),
                           recognizePOS(filtered, "VERB"),
                           recognizePOS(filtered, "ADJ"),
                           svos,
                           ]
                     )




In [26]:
# Run extractEntities on each row's "question1" text. With axis=1 the function is
# called once per row, and the Series it returns becomes one row of nRows
# (columns are the positions 0-5 at this point).
nRows = data.apply(lambda row : extractEntities(row["question1"]),axis=1)

In [27]:
# Replace the positional column labels 0-5 with readable names, then preview the result.
nRows = nRows.rename(
    columns=dict(
        enumerate(["text", "noun chunks", "nouns", "verbs", "adjs", "Sub/Verb/Obj"])
    )
)
nRows.head(20)

Unnamed: 0,text,noun chunks,nouns,verbs,adjs,Sub/Verb/Obj
0,what is the step by step guide to invest in sh...,"[step step guide, share market india]","[step, step, guide, share, market, india]",[invest],[],[]
1,what is the story of kohinoor (koh-i-noor) dia...,[koh - - noor ) diamond],"[story, koh, diamond]",[kohinoor],[noor],[]
2,how can i increase the speed of my internet co...,"[speed -PRON- internet connection, vpn]","[speed, internet, connection, use, vpn]",[increase],[],"[((i), (increase), (speed))]"
3,why am i mentally very lonely? how can i solve...,[],[],[solve],[lonely],"[((i), (solve), (it))]"
4,"which one dissolve in water quikly sugar, salt...","[water quikly sugar, salt , methane carbon di ...","[water, sugar, salt, methane, carbon, oxide]",[dissolve],[],[]
5,astrology: i am a capricorn sun cap moon and c...,[capricorn sun cap],"[astrology, sun, cap, moon, cap, rise]",[],[capricorn],"[((i), (am), (cap, moon)), ((i), (am), (cap))]"
6,should i buy tiago?,[],[],[buy],[],"[((i), (buy), (tiago))]"
7,how can i be a good geologist?,[good geologist],[geologist],[],[good],"[((i), (be), (geologist))]"
8,when do you use シ instead of し?,"[シ, し]",[シ],[use],[],"[((you), (use), (シ)), ((you), (use), (し))]"
9,motorola (company): can i hack my charter moto...,"[motorola, company]","[motorola, company, -PRON-, charter, motorolla]",[hack],[],"[((i), (hack), (dcx3400))]"


In [32]:
term = "internet"

# Count how often `term` occurs in each row's noun list, on a copy so nRows stays untouched.
fRows = nRows.assign(found=nRows["nouns"].map(lambda nouns: nouns.count(term)))

# Labels of the rows where the term never occurs; drop them to keep only the matches.
indexNames = fRows.index[fRows["found"] == 0]
fRows = fRows.drop(index=indexNames)
fRows


Unnamed: 0,text,noun chunks,nouns,verbs,adjs,Sub/Verb/Obj,found
2,how can i increase the speed of my internet co...,"[speed -PRON- internet connection, vpn]","[speed, internet, connection, use, vpn]",[increase],[],"[((i), (increase), (speed))]",1
78,how can i make money through the internet?,[money internet],"[money, internet]",[],[],"[((i), (make), (money))]",1
470,my new xbox one s can't connect to internet du...,"[new xbox, internet setup, solution]","[xbox, internet, setup, solution]",[connect],[new],[],1
720,hypothetical scenario: our actual credit card ...,"[hypothetical scenario, actual credit card sys...","[scenario, credit, card, system, banking, curr...","[exist, implement]","[hypothetical, actual, digital]","[((you), (implement), (currency))]",1
730,is there a different price over the internet t...,"[different price internet person, book room me...","[price, internet, person, book, room, mexico, ...",[try],[different],[],1
