# Topic Modelling for Financial Companies M&A Deal Ratonale's

In [4]:
import re
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from stop_words import safe_get_stop_words
from stop_words import get_stop_words
from textblob import TextBlob

In [5]:
Financials = pd.read_excel("D:\\ISB\\Capstone Project\\Data\\2-Financial_Deals\\2-Financials - New.xls",sheetname='MM_DealUnformattedReport')#,skiprows=7)
for i in range(0,len(Financials)):
    try:
        Financials['Deal Description'][i] = Financials['Deal Description'][i].encode('utf8')
    except:
        print i

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
stop_words1 = set(stopwords.words('english'))
stop_words1 = " ".join(i.encode('utf-8') for i in stop_words1).split()
stop_words2 = ['million','approximately','july','march','august','february','october','company','usbased','press','release'
              ,'llc','usd','inc']
stop_words3 = list(get_stop_words('en'))
stop_words3 = " ".join(i.encode('utf-8') for i in stop_words3).split()
stop_words4 = list(safe_get_stop_words('unsupported language'))
stop_words5 = str(Financials[['Target Company','Bidder Company']]) #,'Sellers','Target Ticker','Exchange:Ticker','Deal Responses','Target Company Industry']])
stop_words5 = list(pd.unique(stop_words5.split()))
stop_words = list(pd.unique(stop_words1+stop_words2+stop_words3+stop_words4+stop_words5))

stemming1 = WordNetLemmatizer()    #stemming2 = PorterStemmer()

In [9]:
unigram = []
bigram = []

for i in range(0,len(Financials)):
    to_lowercase = Financials['Deal Description'][i].lower()
    puncuation_removed = re.sub(r'[^\w\s]','',to_lowercase)
    newline_removed = re.sub('\n',' ',puncuation_removed)
    stopwords_removed = " ".join(j for j in newline_removed.split() if j not in stop_words)
    stemming_words1 = " ".join(stemming1.lemmatize(j) for j in stopwords_removed.split())
    numbers_removed = " ".join(re.sub("\d+","",i) for i in stemming_words1.split())
    letters_removed = " ".join(j for j in numbers_removed.split() if len(j) > 3)
    trimmed = letters_removed.strip()
    unigram.append(trimmed)
    
    a = ""
    for j in range(0,(len(trimmed.split())-1)):
        empty = trimmed.split()[j]+"_"+trimmed.split()[j+1]
        a = a+" "+empty
    a = a.strip()
    bigram.append(a)
    
data = [str(bigram[i])+" "+str(unigram[i]) for i in range(len(bigram))]

n_features = 1000

# tf or count matrix
# for topic modelling LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=8, max_features=n_features, stop_words='english')
tf = tf_vectorizer.fit_transform(data)
tf_feature_names = tf_vectorizer.get_feature_names()

# tf-idf matrix
# for topic modelling NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=8, max_features=n_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(data)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

n_topics = 8 #input("No of topics:")

nmf = NMF(n_components=n_topics,random_state=1,alpha=.1,l1_ratio=.5,init='nndsvd')
nmf = nmf.fit(tfidf)

lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
lda = lda.fit(tf)

# print "The Shape of NMF Matrix:",nmf.components_.shape
# print "The Shape of LDA Matrix:",lda.components_.shape

def display_topics(model, feature_names, no_top_words):
    if str(model)[0:3] == "NMF":
        print "NMF MODEL"
    elif str(model)[0:3] == "Lat":
        print "LDA MODEL"
        
    for topic_idx, topic in enumerate(model.components_):
        print "Topic %d:" % (topic_idx+1)
        print " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
    print "="*100
    
no_top_words = 15

print "Topics For Financial Companies"
print "="*100

display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tfidf_feature_names, no_top_words)

Topics For Financial Companies
NMF MODEL
Topic 1:
bank community branch community_bank state_bank banking bancshares national_bank national state deposit approval transaction bank_bank bancorp
Topic 2:
capital partner private_equity private equity firm equity_firm stake undisclosed venture capital_partner partner_private acquired firm_acquired acquired_undisclosed
Topic 3:
brokerage benefit employee service risk international employee_benefit agency acquired undisclosed_consideration solution undisclosed marsh property casualty
Topic 4:
share bancorp price exhibit closing headquartered offer transaction price_share shareholder termination form filed share_price form_filed
Topic 5:
management investment asset_management advisor wealth asset wealth_management investment_management advisory client management_firm fund firm capital financial
Topic 6:
arthur gallagher arthur_gallagher brokerage risk claim administration_service settlement brokerage_risk third_party risk_management providing

In [11]:
topic_output_lda = np.column_stack((data,lda.transform(tf)))
topic_output_nmf = np.column_stack((data,nmf.transform(tfidf)))
df1 = pd.DataFrame(topic_output_lda,columns=['Deal Rationale','Topic1','Topic2','Topic3','Topic4','Topic5','Topic6','Topic7','Topic8'])
df2 = pd.DataFrame(topic_output_nmf,columns=['Deal Rationale','Topic1','Topic2','Topic3','Topic4','Topic5','Topic6','Topic7','Topic8'])
df1["LDA Topic"] = df1[df1.columns[1:9]].idxmax(axis=1)
df2["NMF Topic"] = df2[df2.columns[1:9]].idxmax(axis=1)

Financials["NMF TopicID"] = df2[df2.columns[1:9]].idxmax(axis=1)
Financials["LDA TopicID"] = df1[df1.columns[1:9]].idxmax(axis=1)

Financials.to_csv("D:\\ISB\\Capstone Project\\Data\\2-Financial_Deals\\Financials - Topics.csv",index=False,encoding='utf-8')

## Summary:

Topic modelling on health care acquisitions is done using the LDA and NMF methods.

Results for the the Deal rationale, for 8 topics of both the models are:

**Topics from Deal Description are:**

**NMF MODEL**

Topic 1: bank community branch community_bank state_bank banking bancshares national_bank national state deposit approval transaction bank_bank Bancorp --banking

Topic 2:capital partner private_equity private equity firm equity_firm stake undisclosed venture capital_partner partner_private acquired firm_acquired acquired_undisclosed  --Captilizations

Topic 3: brokerage benefit employee service risk international employee_benefit agency acquired undisclosed_consideration solution undisclosed marsh property casualty  --employees

Topic 4: share bancorp price exhibit closing headquartered offer transaction price_share shareholder termination form filed share_price form_filed  --Shares

Topic 5: management investment asset_management advisor wealth asset wealth_management investment_management advisory client management_firm fund firm capital financial  --Managerials Aspects

Topic 6: arthur gallagher arthur_gallagher brokerage risk claim administration_service settlement brokerage_risk third_party risk_management providing_brokerage head administration party  --third party and risk

Topic 7: financial corporation service financial_service financial_corporation acquisition business corp holding form transaction filed form_filed expected listed  --Financial related

Topic 8: equipment rental leasing sale engaged platinum acquired providing industrial engaged_providing solution event business family consideration –deal type

**LDA MODEL**

Topic 1: bank transaction acquisition expected holding headquartered banking approval financial service asset acquire consideration condition branch

Topic 2: share bank transaction price bancorp exhibit closing form filed form_filed shareholder financial headquartered offer approval

Topic 3: heritage pacific rental green interstate equipment united class card share year corporation revenue plan transaction

Topic 4: bancorp patriot bank liberty transaction heartland berkshire fidelity hill national energy lake midwest mountain howard

Topic 5: transaction service bank expected common asset benefit agreement consideration equity shareholder financial closing acquire acquisition

Topic 6: management service capital investment transaction acquisition consideration financial firm undisclosed acquired source partner link source_link

Topic 7: home virginia atlantic banc eastern martin current california member vice board chairman vice_president officer executive_vice

Topic 8: citizen general brown first_citizen national_general citizen_bank holding_corp national guaranty bank_trust health north corp trust liability