In [47]:
import gensim, logging
import re
import csv, sys
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from __future__ import division, unicode_literals
import math
from textblob import TextBlob as tb
import shutil
import operator
import json
import gensim, logging
import pandas as pd
from textblob import Word
from textblob.wordnet import VERB
from textblob.taggers import NLTKTagger
import os
from nltk.corpus import wordnet

from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
import datetime


In [2]:
#Load all the joined data and setup environment
allData = pd.read_csv("webmd-all-join.csv")
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", '?', '-','!', ':', ';', '(', ')', '[', ']', '{', '}']) # remove it if you need punctuation 
tokenizer = RegexpTokenizer(r'\w+')
nltk_tagger = NLTKTagger()

In [52]:
wordLimit = 5
word2VecLimit = 5
modelMinCount = 23
modelDimensions = 300
modelName = "MODEL_" + str(modelMinCount) + "_" + str(modelDimensions)
path = "C:\\Users\\Aarav\\"
verbose = True

In [4]:
#Tf-IDF calculation fucntions
def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)


In [5]:
#Convert treebank tag to WordNet tag
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [6]:
#Return POS tags of the sentence words
def POStags(sentence):
    blob = tb(sentence, pos_tagger=nltk_tagger)
    return blob.pos_tags

In [7]:
# Return a sentence with each word converted to its root form
def lemmatize(sentence):
    sen = []
    for token in POStags(sentence):
        word = Word(token[0])
        try:
            word = word.lemmatize(get_wordnet_pos(token[1]))
        except:
            pass
        sen.append(word)
    return sen      

In [8]:
def nameAndMeaning(word):
    print Word(word).definitions, "\n"
    print model.most_similar(positive = [word])

In [9]:
#Get K most similar words for a input word using word2vec model trained on the answer data
def modelWords(word, topk):
    words =  model.most_similar(positive = [word], topn=topk)
    wordKeys = []
    for i in range(len(words)):
        wordKeys.append(words[i][0])
    return wordKeys

In [10]:
#Return WordNet meaning of words
def getWordNetMeanings(key):
    return Word(key).definitions

In [11]:
#Return a sorted list  
def getTopics():
    df = allData[['topicname', 'answercontent']]
    topics = df.topicname.unique()
    topics.sort()
    return topics


In [12]:
#Check if a quantity is NaN
def isNaN(num):
    return num != num

In [13]:
def trainModel(modelName, minCount, dimensions):
    sentences = []
    f = open(path+"dataAnswers.txt")
    reader = csv.reader(f)  
    for row in reader:  
        data = ""
        for item in row:
            data = data + item
        #print data
        raw = data.lower()
        tokens = tokenizer.tokenize(raw)
        en_stop = get_stop_words('en')
        stopped_tokens = [i for i in tokens if not i in en_stop]
        
        raw = " ".join(stopped_tokens)
        raw = lemmatize(raw)
        #raw = " ".join(raw)
        sentences.append(raw)
        
    model = gensim.models.Word2Vec(sentences, min_count=minCount, size=dimensions)
    model.save(modelName)

In [14]:
#tf-idf over all topics 
def globalTfIdf(verbose, wordLimit):
    bloblist = []
    topics = getTopics()
    df = allData[['topicname', 'answercontent']]
    
    for topic in topics:
        if  isNaN(topic) is False:
            d = df.loc[df['topicname'] == topic]
            nonEmpty = d['answercontent'] != ' '
            d = d[nonEmpty]
            corpus =  d['answercontent'].tolist()

            total = ""
            raw = ""
            
            for sentence in corpus:
                
                raw = sentence.lower()
                tokens = lemmatize(raw)
                en_stop = get_stop_words('en')
                stopped_tokens = [i for i in tokens if not i in en_stop]
                raw = " ".join(stopped_tokens)
                total = total + raw
                
            bloblist.append(tb(total))

    globalDict = []
    for i, blob in enumerate(bloblist):
        #print("Top words in document {}".format(i + 1))
        if verbose:
            print "Top words in document ", topics[i]
        scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        wordsData = {}
        dict = {}
        for word, score in sorted_words[:wordLimit]:
            #print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
            wordsData[word] = round(score,5)
        dict["topic"] = topics[i]
        dict["words"] = wordsData

        globalDict.append(dict)

    try:
        os.remove('tf_idf.json')
    except OSError:
        pass
    with open('tf_idf.json', 'w') as outfile:
        json.dump(globalDict, outfile)

In [15]:
def word2VecTopK(topK):
    with open('tf_idf.json') as infile:
        array = json.load(infile)

    dict = []
    for i in range(len(array)):

        data = {}
        wordlist = []
        wordNetMeaningList = []
        wordNetMeaningDict = {}
        for key, value in array[i]['words'].iteritems():
            word2vecWords = []
            try:
                wordNetMeaningDict[key] = getWordNetMeanings(key)
                word2vecWords = modelWords(key, topK)
            except:
                pass

            for word in word2vecWords:
                if(word not in wordlist):
                    wordlist.append(word)

        wordNetMeaningList.append(wordNetMeaningDict)       
        data['topic'] = array[i]['topic']
        data['word2VecList'] = wordlist
        data['wordNetMeanings'] = wordNetMeaningList
        dict.append(data)

    try:
        os.remove('word2vecAndTfidf.json')
    except OSError:
        pass
    with open('word2vecAndTfidf.json', 'w') as outfile:
        json.dump(dict, outfile)

In [16]:
#Topics for each member to create member graph edges by interesection 
def generateMemberList():
    globalList = []
    information = allData[['questiontopicid','membername']]
    members = allData['membername'].unique().tolist()
    for member in members:
        if isNaN(member) is False:
            data = {}
            d = information.loc[information['membername'] == member]
            topicListForUser = []
            for question in d['questiontopicid'].unique().tolist():
                if isNaN(question) is False:
                    categories = question.split(',')
                    for category in categories:
                        topic =  category.split('-questions')[0].strip()
                        if topic not in topicListForUser:
                            topicListForUser.append(topic)
            member =  member.split(",")[0]
            data['member'] = member
            data['topics'] = topicListForUser
            if topicListForUser is []:
                print member
            globalList.append(data)
    try:
        os.remove('topicListForUser.json')
    except OSError:
        pass
    with open('topicListForUser.json', 'w') as outfile:
        json.dump(globalList, outfile)

In [17]:
#get helpfulvotes of members in each topic
def generateTopicMemberList():
    globalDict = {}
    df = allData[['topicname', 'membername','memberhelpfulvotes']]
    
    topics = getTopics()
    
    for topic in topics:
        if isNaN(topic) is False:
            data = {}
            d = df.loc[df['topicname'] == topic]
            members = d['membername'].unique().tolist() 
            memberList = []
            memberVotes = []
            for member in members:
                if isNaN(member) is False:
                    if member not in memberList:
                        d = df.loc[df['membername'] == member]
                        memberVotes.append(d['memberhelpfulvotes'].unique().tolist()[0])
                        member =  member.split(",")[0]
                        memberList.append(member)
            data['members'] = memberList
            data['memberhelpfulvotes'] = memberVotes
            globalDict[topic] = data
    
        try:
            os.remove('TopicMemberList.json')
        except OSError:
            pass
        with open('TopicMemberList.json', 'w') as outfile:
            json.dump(globalDict, outfile)

In [18]:
#Get location information for each  user
def generateMemberLocation():
    memLocation = pd.read_csv("member_location.csv")
    members = memLocation.membername.unique().tolist()
    globalData = []
    for member in members:
        if isNaN(member) is False:
            data = {}
            d = memLocation.loc[memLocation['membername']==member]
            data['member'] = member.split(",")[0]
            data['location'] = d.location.unique().tolist()[0]
            globalData.append(data)

        try:
            os.remove('memberLocation.json')
        except OSError:
            pass
        with open('memberLocation.json', 'w') as outfile:
            json.dump(globalData, outfile)


In [19]:
#Restrict to 10 members per topics
def sortTop10TopicMemberList():
    inputdata = json.loads(open("TopicMemberList.json").read())
    globalDict = {}
    for topic, values in inputdata.iteritems():
        data = {}
        memberhelpfulvotes = []
        members = []
        sortedMembers = []
        for keyNext, value in values.iteritems():
            if keyNext == 'memberhelpfulvotes':
                memberhelpfulvotes = value
            if keyNext == 'members':
                members = value
        sortedVotes =  sorted(memberhelpfulvotes, reverse = True)
        for item in sortedVotes:
            sortedMembers.append(members[memberhelpfulvotes.index(item)])

        data['members'] = sortedMembers[ :10]
        data['memberhelpfulvotes'] = sortedVotes[:10]
        globalDict[topic] = data

    try:
        os.remove('TopicMemberListTop10.json')
    except OSError:
        pass
    with open('TopicMemberListTop10.json', 'w') as outfile:
        json.dump(globalDict, outfile)

In [20]:
#Get state mapping for each topic by getting counts of members state info
def getStateForTopic():
    
    inputMemberLocation = json.loads(open("memberLocation.json").read())
    inputTopics = json.loads(open("TopicMemberListTop10.json").read())
    
    globalTopicData = {}
    for topic, values in inputTopics.iteritems():
        for keyNext, value in values.iteritems():
            data = {}
            memberhelpfulvotes = []
            members = []
            if keyNext == 'memberhelpfulvotes':
                memberhelpfulvotes = value
            if keyNext == 'members':
                members = value 

            for item in inputMemberLocation:
                state = item['location']
                if item['member'] in members:
                    if state in data:
                        data[state] = data[state] +1
                    else:
                        data[state] = 1
            globalTopicData[topic] = data

    try:
        os.remove('stateForTopic.json')
    except OSError:
        pass
    with open('stateForTopic.json', 'w') as outfile:
        json.dump(globalTopicData, outfile)

In [21]:
#Get fraction of answers made by experts for each topic 
def getExpertAnswers():
    df = allData[['topicname', 'answercontent', 'membertype']]
    topics = df.topicname.unique()
    globalData = {}

    for topic in getTopics():
        if isNaN(topic) is False:
            data = {}
            d = df.loc[df['topicname'] == topic]
            nonEmpty = d['answercontent'] != ' '
            d = d[nonEmpty]
            data['totalAnswers'] = len(d)
            dm = d.loc[d['membertype'] == 'Expert']
            data['expertAnswers'] = len(dm)
            globalData[topic] = data
    try:
        os.remove('expertAnswers.json')
    except OSError:
        pass
    with open('expertAnswers.json', 'w') as outfile:
        json.dump(globalData, outfile)
            

In [50]:
def getDatesYear():
    nodes = []

    for topic in getTopics():
        months = {}
        globalData = {}

        if isNaN(topic) is False:
            d = allData.loc[allData['topicname'] == topic]
            nonEmpty = d['answerpostdate'] != ' '
            d = d[nonEmpty]
            corpus =  d['answerpostdate'].tolist()
            for date in corpus:
                date_object = datetime.datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                if date_object.month in months:
                    months[date_object.month] =  months[date_object.month] + 1
                else:
                     months[date_object.month] = 1 
            globalData[topic] = months
            nodes.append(globalData)
    try:
        os.remove('monthData.json')
    except OSError:
        pass
    with open('monthData.json', 'w') as outfile:
        json.dump(nodes, outfile)


In [48]:
def getDatesHour():
    nodes = []

    for topic in getTopics():
        hours = {}
        globalData = {}

        if isNaN(topic) is False:
            d = allData.loc[allData['topicname'] == topic]
            nonEmpty = d['answerpostdate'] != ' '
            d = d[nonEmpty]
            corpus =  d['answerpostdate'].tolist()

            for date in corpus:
                date_object = datetime.datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                if date_object.hour in hours:
                    hours[date_object.hour] =  hours[date_object.hour] + 1
                else:
                     hours[date_object.hour] = 1 
            globalData[topic] = hours
            nodes.append(globalData)
    try:
        os.remove('hourData.json')
    except OSError:
        pass
    with open('hourData.json', 'w') as outfile:
        json.dump(nodes, outfile)

In [22]:
#Calculate TF-IDF
globalTfIdf(verbose, wordLimit)

Top words in document  nan
Top words in document  AIDS
Top words in document  ANA Test
Top words in document  Abdominal Obesity
Top words in document  Abdominal Pain
Top words in document  Ablation
Top words in document  Acetaminophen
Top words in document  Acne
Top words in document  Actinic Keratosis
Top words in document  Acupuncture
Top words in document  Addiction
Top words in document  Adrenal Gland
Top words in document  Adult Acne
Top words in document  Affordable Care Act
Top words in document  Aging
Top words in document  Alcohol Abuse
Top words in document  Alcoholic Beverage
Top words in document  Alcoholism
Top words in document  Allergic Conjunctivitis
Top words in document  Allergic Reaction
Top words in document  Allergy
Top words in document  Almond
Top words in document  Aloe
Top words in document  Alopecia
Top words in document  Alzheimer's Disease
Top words in document  Amino Acids
Top words in document  Ammonia
Top words in document  Amoxicillin
Top words in docume

In [None]:
#Train Word2vec model
trainModel(modelName, modelMinCount, modelDimensions)
model = gensim.models.Word2Vec.load(modelName)

In [27]:
word2VecTopK(word2VecLimit)

In [28]:
generateMemberList()

In [29]:
generateTopicMemberList()

In [30]:
generateMemberLocation()

In [31]:
sortTop10TopicMemberList()

In [32]:
getStateForTopic()

In [33]:
getExpertAnswers()

In [49]:
getDatesHour()

In [51]:
getDatesYear()