# Topic Modelling for Health Care M&A Deal Ratonale's

In [1]:
import re
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from stop_words import safe_get_stop_words
from stop_words import get_stop_words
from textblob import TextBlob

In [3]:
health_care = pd.read_excel("D:\\ISB\\Capstone Project\\Data\\5-HealthCare_Deals\\5-Healthcare_20171016.xls",sheetname='MM_DealUnformattedReport')
for i in range(0,len(health_care)):
    try:
        health_care['Deal Description'][i] = health_care['Deal Description'][i].encode('utf8')
    except:
        print i

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
stop_words1 = set(stopwords.words('english'))
stop_words1 = " ".join(i.encode('utf-8') for i in stop_words1).split()
stop_words2 = ['nan','llc','health','care','acquisition','expand','form','chbc','brands'
               ,'healthcare','usd','based','company','business','one','inc','usbased','providing'
               ,'hospital','us','based','us','press','release','source','link','corporation','filed','november'
               ,'transaction','undisclosed', 'consideration' , 'medical', 'center', 'headquartered']
stop_words3 = list(get_stop_words('en'))
stop_words3 = " ".join(i.encode('utf-8') for i in stop_words3).split()
stop_words4 = list(safe_get_stop_words('unsupported language'))
stop_words5 = str(health_care[['Target Company','Bidder Company']]) #,'Sellers','Target Ticker','Exchange:Ticker','Deal Responses','Target Company Industry']])
stop_words5 = list(pd.unique(stop_words5.split()))
stop_words = list(pd.unique(stop_words1+stop_words2+stop_words3+stop_words4+stop_words5))

stemming1 = WordNetLemmatizer()    #stemming2 = PorterStemmer()

In [6]:
## Primary Research
unigram = []
bigram = []

for i in range(0,len(health_care)):
    to_lowercase = health_care['Deal Description'][i].lower()
    puncuation_removed = re.sub(r'[^\w\s]','',to_lowercase)
    newline_removed = re.sub('\n',' ',puncuation_removed)
    stopwords_removed = " ".join(j for j in newline_removed.split() if j not in stop_words)
    stemming_words1 = " ".join(stemming1.lemmatize(j) for j in stopwords_removed.split())
    numbers_removed = " ".join(re.sub("\d+","",i) for i in stemming_words1.split())
    letters_removed = " ".join(j for j in numbers_removed.split() if len(j) > 3)
    trimmed = letters_removed.strip()
    unigram.append(trimmed)
    
    a = ""
    for j in range(0,(len(trimmed.split())-1)):
        empty = trimmed.split()[j]+"_"+trimmed.split()[j+1]
        a = a+" "+empty
    a = a.strip()
    bigram.append(a)
    
data = [str(bigram[i])+" "+str(unigram[i]) for i in range(len(bigram))]

n_features = 1000

# tf or count matrix
# for topic modelling LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=8, max_features=n_features, stop_words='english')
tf = tf_vectorizer.fit_transform(data)
tf_feature_names = tf_vectorizer.get_feature_names()

# tf-idf matrix
# for topic modelling NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=8, max_features=n_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(data)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

n_topics = 8 #input("No of topics:")

nmf = NMF(n_components=n_topics,random_state=1,alpha=.1,l1_ratio=.5,init='nndsvd')
nmf = nmf.fit(tfidf)

lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
lda = lda.fit(tf)

# print "The Shape of NMF Matrix:",nmf.components_.shape
# print "The Shape of LDA Matrix:",lda.components_.shape

def display_topics(model, feature_names, no_top_words):
    if str(model)[0:3] == "NMF":
        print "NMF MODEL"
    elif str(model)[0:3] == "Lat":
        print "LDA MODEL"
        
    for topic_idx, topic in enumerate(model.components_):
        print "Topic %d:" % (topic_idx+1)
        print " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
    print "="*100
    
no_top_words = 15

print "Topics For Health Care Data"
print "="*100

display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tfidf_feature_names, no_top_words)

Topics For Health Care Data
NMF MODEL
Topic 1:
service provider home pharmacy management solution acquired group patient engaged service_acquired holding clinical provides emergency
Topic 2:
share offer price termination exhibit closing agreement expected shareholder condition director date board cash prior
Topic 3:
venture capital partner firm equity private_equity private equity_firm venture_capital capital_firm capital_partner firm_acquired partner_private investor management
Topic 4:
living senior_living senior capital_senior living_corp corp assisted living_community community assisted_living capital corp_listed unit home independent
Topic 5:
pharmaceutical product manufacturing development device engaged technology pharma market research manufacturer laboratory acquired manufacture industry
Topic 6:
hospital community regional community_system operator center surgical surgery facility tenet physician rehabilitation general acute university
Topic 7:
physical physical_therapy thera

In [7]:
topic_output_lda = np.column_stack((data,lda.transform(tf)))
topic_output_nmf = np.column_stack((data,nmf.transform(tfidf)))
df1 = pd.DataFrame(topic_output_lda,columns=['Deal Rationale','Topic1','Topic2','Topic3','Topic4','Topic5','Topic6','Topic7','Topic8'])
df2 = pd.DataFrame(topic_output_nmf,columns=['Deal Rationale','Topic1','Topic2','Topic3','Topic4','Topic5','Topic6','Topic7','Topic8'])
df1["LDA Topic"] = df1[df1.columns[1:9]].idxmax(axis=1)
df2["NMF Topic"] = df2[df2.columns[1:9]].idxmax(axis=1)

health_care["NMF TopicID"] = df2[df2.columns[1:9]].idxmax(axis=1)
health_care["LDA TopicID"] = df1[df1.columns[1:9]].idxmax(axis=1)

health_care.to_csv("D:\\ISB\\Capstone Project\\Data\\5-HealthCare_Deals\\HealthCare - Topics.csv",index=False,encoding='utf-8')

## Summary:

Topic modelling on health care acquisitions is done using the LDA and NMF methods.

Results for the the Deal rationale, for 8 topics of both the models are:

**Topics from Deal Description are:**
**NMF MODEL**

Topic 1:service provider home pharmacy management solution acquired group patient engaged service_acquired holding clinical provides emergency 

Topic 2:share offer price termination exhibit closing agreement expected shareholder condition director date board cash prior –deal financials related

Topic 3:venture capital partner firm equity private_equity private equity_firm venture_capital capital_firm capital_partner firm_acquired partner_private investr management

Topic 4:living senior_living senior capital_senior living_corp corp assisted living_community community assisted_living capital corp_listed unit home independent 

Topic 5:pharmaceutical product manufacturing development device engaged technology pharma market research manufacturer laboratory acquired manufacture industry –manufacturing and pharma industry

Topic 6:hospital community regional community_system operator center surgical surgery facility tenet physician rehabilitation general acute university  -

Topic 7:physical physical_therapy therapy rehabilitation clinic rehabilitation_service holding orthopedic therapy_service occupational acquired holding_engaged outpatient select operator –physical theraphy

Topic 8:anesthesia mednax anesthesia_service physician physician_service service practice pediatric service_including radiology expected_immediately group immediately_accretive immediately clinician –emergency services

**LDA MODEL**

Topic 1:service hospital acquire expected community agreed completed agreed_acquire update physician regional provider subject network condition

Topic 2:expected product exhibit cash share revenue listed condition agreement approximately update term closing acquire stock

Topic 3:share offer price closing termination prior shareholder date merger share_price premium closing_share board equity price_share

Topic 4:service home living acquired senior therapy senior_living rehabilitation physical provider community engaged capital facility hospice

Topic 5:venture pharmaceutical development laboratory capital firm therapeutic research partner venture_capital acquire diagnostics disease product stake

Topic 6:imaging product diagnostic supply equipment distribution distributor diagnostic_imaging radiology blood choice customer acquired technology international

Topic 7:service acquired partner capital equity private management firm provider private_equity engaged equity_firm solution group portfolio

Topic 8:surgical surgery affiliate ambulatory acquired surgical_affiliate device stake surgery_center ambulatory_surgery service life outpatient center physician