In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import RegexpTokenizer

from html.parser import HTMLParser

In [2]:
## Load the four CareerVillage tables that hold the free-text fields.
## The files are read in the order: professionals, answers, students, questions.
prof, answ, stud, ques = map(
    pd.read_csv,
    (
        "../professionals.csv",
        "../answers.csv",
        "../students.csv",
        "../questions.csv",
    ),
)

In [3]:
# Professionals' headlines: clean them by dropping the trailing "at <employer>" part
def clean_names(headline):
    """Drop the trailing "at <employer>" part of a professional's headline.

    Parameters
    ----------
    headline : str or any
        Raw headline text. Non-string values (for example NaN for a missing
        headline) are passed through untouched.

    Returns
    -------
    str or any
        For strings: the headline with everything from the first standalone
        word "at" onwards removed, including the whitespace that preceded it.
        For anything else: the input object itself.
    """
    if not isinstance(headline, str):
        return headline
    # \b stops "at" from matching inside a word ("Great Teacher" used to be
    # truncated to "Gre"); the leading \s* removes the blank that would
    # otherwise be left dangling at the end of the cleaned headline.
    return re.sub(r'\s*\bat\s.*$', '', headline)

# Overwrite the headline column with its cleaned version, then show two random rows
prof["professionals_headline"] = prof["professionals_headline"].apply(clean_names)
prof.sample(n=2)

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined
665,d3d09b7c216b465086d43c779d2087cd,"Boston, Massachusetts",Marketing and Advertising,Account Manager,2013-12-05 14:57:34 UTC+0000
12006,545868bb34a3466d9acdc01a89907a90,"Washington, Washington",Management Consulting,Management Consulting Analyst,2017-06-26 19:59:33 UTC+0000


In [4]:
## Function to get rid of HTML tags from text
class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text content."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no second reset is
        # needed. convert_charrefs=True (the default) makes entities such as
        # "&amp;" reach handle_data already decoded.
        super().__init__(convert_charrefs=True)
        self.fed = []  # text fragments, in document order

    def handle_data(self, d):
        """Record one run of text found between tags."""
        self.fed.append(str(d))

    def get_data(self):
        """Return everything collected so far joined into one string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return ``html`` with all tags removed and character references decoded.

    Parameters
    ----------
    html : str
        Markup (or plain text) to clean.

    Returns
    -------
    str
        The concatenated text content of ``html``.
    """
    s = MLStripper()
    s.feed(html)
    # feed() holds back trailing text that could still turn out to be an
    # unfinished tag or entity (e.g. the whole of "I work in R&D");
    # close() forces that buffered remainder through handle_data.
    s.close()
    return s.get_data()

In [5]:
# Clean up these bodies of text

# Tag parts of speech and remove stop words
# More interested in knowing which general area these questions are asking about so let's grab nouns
# Built once at cell level; filterText reuses it instead of reloading the
# stop-word corpus for every row.
stop_words = set(stopwords.words("english"))

# Penn Treebank tags for singular/plural common and proper nouns.
_NOUN_TAGS = frozenset(("NN", "NNS", "NNP", "NNPS"))


def filterText(text):
    """Reduce a body of (possibly HTML) text to its non-stop-word nouns.

    The text is stringified, lower-cased, stripped of HTML tags, tokenized
    and POS-tagged; only tokens tagged as nouns that are not English stop
    words are kept.

    Parameters
    ----------
    text : str or any
        Raw text. ``None`` and float NaN are treated as missing; any other
        non-string value is converted with ``str()``.

    Returns
    -------
    str
        The kept tokens joined by single spaces (no trailing space), or an
        empty string when the input is missing or contains no such tokens.
    """
    # Missing values would otherwise be stringified to the literal word
    # "nan", which then survives the noun filter as if it were real content.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # NOTE: lower-casing happens before tagging, so the tagger never sees
    # the original capitalisation.
    tokens = nltk.word_tokenize(strip_tags(str(text).lower()))
    tagged = nltk.pos_tag(tokens)

    return " ".join(
        word
        for word, tag in tagged
        if tag in _NOUN_TAGS and word not in stop_words
    )
# Show three random raw questions, then replace every body with its noun-only form
print(ques.sample(n=3))
ques["questions_body"] = ques["questions_body"].apply(filterText)


                           questions_id               questions_author_id  \
8843   c58917388bb449b3ae3a3e5d10a88a79  b9969e10475c4915961edbcd3ef3c68e   
23652  3795aeef19214b8abf46f305837ef322  328ea047f8ed4ca8864a4b1c09272587   
11431  ad24f99be877470d9003029569e8a58d  18007164ed2e4ec08ae910f53689cafe   

               questions_date_added  \
8843   2017-05-30 20:29:04 UTC+0000   
23652  2015-06-26 22:39:13 UTC+0000   
11431  2017-09-01 16:10:40 UTC+0000   

                                         questions_title  \
8843                   Are there majors for metaphysics?   
23652  What jobs can I get from the basketball in NBA...   
11431  What is the best school for future physical th...   

                                          questions_body  
8843   Interested in metaphysics #biology #physics #p...  
23652  Because I love to play basketball. Almost no b...  
11431  I am thinking about being a PTA and I want a s...  


In [21]:
# Question titles get the same noun-only treatment as the bodies
ques["questions_title"] = ques["questions_title"].apply(filterText)


In [22]:
## Spot-check two random rows of the cleaned questions.
## Sweet! It works. Now we can apply some other tools to this cleaned text
print(ques.sample(n=2))

                           questions_id               questions_author_id  \
13     fecd4c7f68144042abca8672b6114a36  3acd97cf60da4b23950fb316644da839   
19343  da257843f8594642892d0093f255125b  350cf806f9be45d48a5869c0702b45eb   

               questions_date_added questions_title  \
13     2018-01-15 22:03:20 UTC+0000         lawyer    
19343  2016-05-18 00:14:32 UTC+0000        college    

                                   questions_body  
13     process lawyer lsat law school job lawyer   
19343         college fall cost question finance   


Let's merge these data sets together, following the approach used in `reformat_careervillage_data.ipynb`.

In [24]:
# Professionals joined to the answers they authored (inner join), reduced to
# the identifying columns plus the id of the answered question.
keep = ['professionals_id', 'professionals_headline','professionals_industry', 'answers_question_id']
prof_merged = prof.merge(
    answ,
    how='inner',
    left_on='professionals_id',
    right_on='answers_author_id',
)[keep]

# Students joined to the questions they authored, reduced to the question text.
keep2 = ['questions_id', 'questions_title', 'questions_body']
stud_merged = stud.merge(
    ques,
    left_on='students_id',
    right_on='questions_author_id',
)[keep2]

# One row per (professional answer, question) pair.
final_merged = prof_merged.merge(
    stud_merged,
    left_on='answers_question_id',
    right_on='questions_id',
)


In [27]:
# Interesting: some professionals whose headlines I had never heard of are
# answering questions in fields that I would have thought were unrelated to theirs.
final_merged.sample(n=5)

Unnamed: 0,professionals_id,professionals_headline,professionals_industry,answers_question_id,questions_id,questions_title,questions_body
17937,ef2c0f7cacf348f1b083f5d5cd7803f9,Civil Engineer,Civil Engineering,29777d7e3fd344a1a9300eb0fa233cc0,29777d7e3fd344a1a9300eb0fa233cc0,engineering work,engineering work time civil-engineering
4863,c0f7489f842f4ebfa548af41402d9571,Farmer & Food Advocate,Farming,db41de0055104c85997e541fb9917e61,db41de0055104c85997e541fb9917e61,college career,chemical engineering fall chemical engineer ty...
38681,81da0aca08c745058e67265aabb6d860,Sr. Specialist - Digital Assistant Program Man...,Telecommunications,9f48e738b0944c6993edf49414cd5946,9f48e738b0944c6993edf49414cd5946,children countries,lot documentaries children world bangladesh th...
20802,3bf31b002c3b4635a658835035b1d8ff,Talent Management and Strategic Recruiting,Human Resources,036130ae9d304cd5a6f475c2e83ecb11,036130ae9d304cd5a6f475c2e83ecb11,advice business scratch,optometrist practice day studies sciences busi...
12793,531aa7579154497382b2cdf76a44ecf8,Trainer/Career Coach,Professional Training,5a49d1772c1d489ba1aed7762c406218,5a49d1772c1d489ba1aed7762c406218,school summer job tech industry,summer job computer technology internships hum...


Let's save this final data frame! 

In [28]:
# Persist the merged frame to the working directory; the row index is written as the first column.
final_merged.to_csv(path_or_buf='final_merged_data.csv')