# Structured data from Forums

We want to investigate whether forum conversations can be used to extract structured data for XA.

In [None]:
import pandas as pd
import numpy as np
import spacy
import en_core_web_sm
import textacy
import nltk

from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem.porter import *

# Load the small English spaCy pipeline once; extractEntities uses this
# shared `nlp` object for both parsing and stop-word lookups.
nlp = en_core_web_sm.load()

In [None]:
# Read the question data from a CSV path relative to the working directory.
# A later cell reads the "question1" column from it.
rows = pd.read_csv("data/quora.csv")

In [None]:
# Work on the first 1000 rows only.
# NOTE(review): presumably a sample to keep the spaCy pass fast - confirm.
data = rows.head(1000)

In [None]:
# Porter stemmer instance from nltk.
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
stemmer = PorterStemmer()

def recognizePOS(parsed, POS):
    """Collect the verbatim text of every token carrying a given POS tag.

    Args:
        parsed: Iterable of tokens exposing ``pos_`` and ``orth_`` attributes
            (extractEntities passes a spaCy ``Doc``).
        POS: Part-of-speech label to match against ``token.pos_``,
            e.g. ``"NOUN"``.

    Returns:
        A list of the ``orth_`` strings of the matching tokens, in the order
        they appear in ``parsed``.
    """
    return [tok.orth_ for tok in parsed if tok.pos_ == POS]


def extractEntities(text):
    """Derive noun chunks, nouns, verbs, adjectives and SVO triples from text.

    The text is lower-cased and parsed once to obtain lemmas and the
    subject-verb-object triples. Lemmas whose vocabulary entry is not a stop
    word are joined back into a string and parsed a second time; the noun
    chunks and the part-of-speech word lists come from that second parse.

    Args:
        text: The string to analyse.

    Returns:
        A ``pd.Series`` with six positional entries: the lower-cased text,
        the noun-chunk texts, the nouns, the verbs, the adjectives, and the
        list of subject-verb-object triples.
    """
    lowered = text.lower()
    parsed = nlp(lowered)

    # Lemmatise every token, then keep only the lemmas that are not stop words.
    lemmas = [tok.lemma_ for tok in parsed]
    content_lemmas = [lemma for lemma in lemmas if not nlp.vocab[lemma].is_stop]

    # Triples are taken from the full (unfiltered) parse; the result is only
    # turned into a list when the Series is assembled below.
    triples = textacy.extract.subject_verb_object_triples(parsed)

    # Second parse over the stop-word-free lemma string.
    reduced = nlp(" ".join(content_lemmas))
    chunk_texts = [chunk.text for chunk in reduced.noun_chunks]

    return pd.Series(
        data=[
            lowered,
            chunk_texts,
            recognizePOS(reduced, "NOUN"),
            recognizePOS(reduced, "VERB"),
            recognizePOS(reduced, "ADJ"),
            list(triples),
        ]
    )




In [None]:
# Run extractEntities on the "question1" value of every row (axis=1 passes
# one row at a time); the Series it returns per row become the columns of nRows.
nRows = data.apply(lambda row : extractEntities(row["question1"]),axis=1)

In [None]:
# Replace the positional column labels 0..5 produced by extractEntities with
# readable names, listed here in positional order.
column_labels = ["text", "noun chunks", "nouns", "verbs", "adjs", "Sub/Verb/Obj"]
nRows = nRows.rename(columns=dict(enumerate(column_labels)))
nRows.head(60)

In [None]:
# Noun to search for in the extracted "nouns" lists.
term = "speed"

# Copy of nRows with an extra "found" column holding how many times the term
# occurs among each row's nouns.
fRows = nRows.assign(found=nRows["nouns"].apply(lambda nouns: nouns.count(term)))

# Drop every row in which the term does not occur at all.
indexNames = fRows.loc[fRows["found"] == 0].index
fRows.drop(indexNames, inplace=True)
fRows
