In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import RegexpTokenizer

from html.parser import HTMLParser

In [2]:
## Load the four raw tables that carry the free-text fields
prof, answ, stud, ques = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../professionals.csv",
        "../answers.csv",
        "../students.csv",
        "../questions.csv",
    )
)

In [3]:
# Professional's headlines - clean them!!!
def clean_names(headline):
    """Remove the trailing employer clause (" at <company>") from a headline.

    Everything from the first standalone word ``at`` up to the end of the
    string is dropped, together with the whitespace in front of it, so
    ``"Software Engineer at Acme"`` becomes ``"Software Engineer"``.

    Parameters
    ----------
    headline : str or any
        The raw headline. Non-string values (e.g. NaN for a missing
        headline) are passed through untouched.

    Returns
    -------
    str or any
        The cleaned headline, or the original object when it is not a string.
    """
    if isinstance(headline, str):
        # \b makes "at" match only as a whole word, so words that merely end
        # in "at" ("Format", "Great") are no longer truncated; the leading
        # \s* also removes the space that used to be left dangling.
        return re.sub(r'\s*\bat\s.*$', '', headline)
    return headline

# Overwrite the headline column with its cleaned version, then spot-check two random rows
prof["professionals_headline"] = prof["professionals_headline"].apply(clean_names)
prof.sample(2)

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined
16472,84ec7c40c83f4204a73d24a10e9f23fa,Dallas/Fort Worth Area,Marketing and Advertising,"National Tax Pursuit Team Coach, PwC",2018-02-23 16:44:02 UTC+0000
4816,53039ff902a147349062dd46473fc462,"Houston, Texas",Information Technology and Services,Systems Engineer Manager,2016-03-21 19:52:30 UTC+0000


In [4]:
## Function to get rid of HTML tags from text
class MLStripper(HTMLParser):
    """HTML parser that throws the markup away and keeps only the text nodes."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset is needed.
        super().__init__()
        self.fed = []  # text fragments, in document order

    def handle_data(self, d):
        """Record one run of text found between tags."""
        self.fed.append(str(d))

    def get_data(self):
        """Return every collected fragment concatenated, with no separator."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return ``html`` with all tags removed and character references decoded.

    Parameters
    ----------
    html : str
        Markup (or plain text) to strip.

    Returns
    -------
    str
        The text content only.
    """
    s = MLStripper()
    s.feed(html)
    # feed() may hold back trailing text that contains a bare "&" (it waits to
    # see whether a character reference is still incoming); close() flushes
    # that buffer so input such as "... at AT&T" is not silently truncated.
    s.close()
    return s.get_data()

In [5]:
# Clean up these bodies of text

# Tag parts of speech and remove stop words
# More interested in knowing which general area these questions are asking about so let's grab nouns
# Build the stop-word set once at cell level; filterText reuses it instead of
# rebuilding it for every row.
stop_words = set(stopwords.words("english"))

# Part-of-speech tags that filterText treats as nouns.
_NOUN_TAGS = frozenset({"NN", "NNS", "NNP", "NNPS"})


def filterText(text):
    """Reduce a piece of (possibly HTML) text to its non-stop-word nouns.

    The text is lower-cased, stripped of HTML tags, tokenized and
    part-of-speech tagged with nltk; only tokens tagged as nouns that are
    not English stop words are kept.

    Parameters
    ----------
    text : str or scalar
        Raw text. Missing values (None / NaN) yield an empty string instead
        of being stringified to the bogus keyword "nan". Other non-string
        scalars are converted with ``str()``.

    Returns
    -------
    str
        The kept nouns in their original order, separated by single spaces
        (no trailing space).
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)

    # strip html!
    tokens = nltk.word_tokenize(strip_tags(text.lower()))
    tagged = nltk.pos_tag(tokens)

    return " ".join(
        word
        for word, tag in tagged
        if tag in _NOUN_TAGS and word not in stop_words
    )
# Peek at three random raw questions, then replace each body with its filtered text
print(ques.sample(3))
ques["questions_body"] = ques["questions_body"].apply(filterText)


                           questions_id               questions_author_id  \
21343  f48bd954513c493585918206a3404671  2fc1a5fb58bd4591a146b66068bead45   
12772  e363d394cf3e4021890c041c5f204c44  79f8d41fe9c349b3b8e12349af2421a3   
13889  d826126a211a458594eb17261cde905d  fa3e9b3e183848799c59cae73e444204   

               questions_date_added  \
21343  2018-01-18 06:49:41 UTC+0000   
12772  2016-07-19 13:35:08 UTC+0000   
13889  2018-04-18 16:13:10 UTC+0000   

                                         questions_title  \
21343  How can a young entrepreneur be successful in ...   
12772            How do you write a decent cover letter?   
13889             How do I begin to network with people?   

                                          questions_body  
21343  My whole life I have always loved food. Not on...  
12772  I need to build my resume and make it complete...  
13889  I want to have people that I can branch out wi...  


In [6]:
# Question titles get the same filtering as the bodies
ques["questions_title"] = ques["questions_title"].apply(filterText)


In [7]:
# Spot-check two random rows to confirm that titles and bodies were filtered.
print(ques.sample(2))
## Sweet! It works. Now we can apply some other tools to this cleaned text

                           questions_id               questions_author_id  \
12403  082412ea67844ebfa8baa357c39a4b6f  69e7e41d827a40aa880cb8f8fefc019d   
15755  cfcd952709b34f3f80811ca5e5886dbe  55fa00fbea4e47ebb9791e2ec7d0cfec   

               questions_date_added  \
12403  2018-03-19 12:10:27 UTC+0000   
15755  2015-03-07 22:11:40 UTC+0000   

                                         questions_title  \
12403                                   jobs humanities    
15755  certificates pursue title look computer suppor...   

                                          questions_body  
12403  humanities resources linguistics anthropology ...  
15755  computer engineering decision people computer ...  


Let's merge these data sets together, following the approach used in `reformat_careervillage_data.ipynb`.

In [8]:
# Professionals joined to the answers they authored (inner join), trimmed to the columns we need
keep = ['professionals_id', 'professionals_headline','professionals_industry', 'answers_question_id']
prof_merged = prof.merge(
    answ,
    how='inner',
    left_on='professionals_id',
    right_on='answers_author_id',
)[keep]

# Students joined to the questions they authored, trimmed to the question text columns
keep2 = ['questions_id', 'questions_title', 'questions_body']
stud_merged = stud.merge(
    ques,
    left_on='students_id',
    right_on='questions_author_id',
)[keep2]

# One row per (answering professional, answered question) pair
final_merged = prof_merged.merge(
    stud_merged,
    left_on='answers_question_id',
    right_on='questions_id',
)


In [12]:
# Keep a copy with every incomplete row removed, then preview five random merged rows.
# NOTE(review): the preview samples final_merged, not the NaN-free `final`.
final = final_merged.dropna()
final_merged.sample(5)
# Observation: some professionals with headlines I had never heard of are
# answering questions in fields that I would have thought were unrelated to theirs.

Unnamed: 0,professionals_id,professionals_headline,professionals_industry,answers_question_id,questions_id,questions_title,questions_body
6741,bc46e3699d92477ba8c7aa723e54a151,Principal Artist,Entertainment,7dfef2e977204580aac6aa7ceb2113b9,7dfef2e977204580aac6aa7ceb2113b9,fields job,people jobs everyone longs weekends profession...
45999,1833e8f9b5e34a84ad9998508712854b,NYU Graduate,Education,476bc5ed3a7640368c149a7093bfa10a,476bc5ed3a7640368c149a7093bfa10a,kids parents issues,masters program ms counseling health concern s...
4835,f635e282b31045819ac998a120fae0e1,Software Engineering Intern,Internet,fd51fc00cdb24cc882889fb6a58dd8d0,fd51fc00cdb24cc882889fb6a58dd8d0,piece career advice,question situation advie career advice share s...
38662,ff2723d4ec9c4a96b4a2df2bbca3db98,Retired Aerospace Engineer and Rocket Scientist,Defense and Space,44995e27fcf546da8eedf5fac6cf8f16,44995e27fcf546da8eedf5fac6cf8f16,degree,advantages disadvantages weather career advanc...
23907,403002609afa44e7ba625ed914593ac4,"Airline, Cargo & Logistics Engineering",Airlines/Aviation,6c45af55a3d14b76b15107ce0580bdc8,6c45af55a3d14b76b15107ce0580bdc8,air traffic street traffic,question careervillage administrator behalf st...


Let's save this final data frame! 

In [13]:
# Save the NaN-free frame built with dropna(); previously `final` was computed
# but never used and the un-cleaned final_merged was written instead.
# NOTE(review): confirm that dropping incomplete rows is intended for the export.
final.to_csv('final_merged_data.csv', index=False)