In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import sent_tokenize,word_tokenize
from gensim import corpora,models

In [2]:
df = pd.read_csv('complaints.csv')
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2019-09-24,Debt collection,I do not know,Attempts to collect debt not owed,Debt is not yours,transworld systems inc. \nis trying to collect...,,TRANSWORLD SYSTEMS INC,FL,335XX,,Consent provided,Web,2019-09-24,Closed with explanation,Yes,,3384392
1,2019-09-19,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,PA,15206,,Consent not provided,Web,2019-09-20,Closed with non-monetary relief,Yes,,3379500
2,2019-10-25,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,I would like to request the suppression of the...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",CA,937XX,,Consent provided,Web,2019-10-25,Closed with explanation,Yes,,3417821
3,2019-11-08,Debt collection,I do not know,Communication tactics,Frequent or repeated calls,"Over the past 2 weeks, I have been receiving e...",,"Diversified Consultants, Inc.",NC,275XX,,Consent provided,Web,2019-11-08,Closed with explanation,Yes,,3433198
4,2019-02-08,Vehicle loan or lease,Lease,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,,,HYUNDAI CAPITAL AMERICA,FL,33161,,Consent not provided,Web,2019-02-08,Closed with non-monetary relief,Yes,,3146310


In [3]:
df = df[['Issue','Consumer complaint narrative']]
df = df[df['Consumer complaint narrative'].notna()]
df = df.reset_index(drop=True)

In [4]:
min_complaint_length = 8
summary_length = 2
numTopics = 5

In [5]:
df['tokenized_sent'] = df['Consumer complaint narrative'].apply(lambda x: sent_tokenize(x))
df['complaint_length'] = df['tokenized_sent'].apply(lambda x: len(x))
df = df[df['complaint_length']>= min_complaint_length]
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Issue,Consumer complaint narrative,tokenized_sent,complaint_length
0,Closing on a mortgage,We have already tried to contact the company w...,[We have already tried to contact the company ...,177
1,Incorrect information on your report,Accounts added and/or created unbeknownst to m...,[Accounts added and/or created unbeknownst to ...,30
2,Trouble during payment process,I have been trying to get my Private Mortgage ...,[I have been trying to get my Private Mortgage...,14
3,Improper use of your report,2ND NOTICE OF PENDING LITIGATION SEEKING RELIE...,[2ND NOTICE OF PENDING LITIGATION SEEKING RELI...,10
4,Trouble during payment process,The mortgage company ( Roundpoint Mortgageg ) ...,[The mortgage company ( Roundpoint Mortgageg )...,20


In [6]:
df1 = df.head(1000)

In [7]:
def tokenization(sentences_list):
    
    words_list = [None]*len(sentences_list)
    for i in range(len(sentences_list)):
        words_list[i] = word_tokenize(sentences_list[i])
    
    return words_list

In [8]:
df1['words_of_sents'] = df1['tokenized_sent'].apply(lambda y: tokenization(y))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [9]:
df1['zipped_tokens'] = list(zip(df1.tokenized_sent, df1.words_of_sents))
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Issue,Consumer complaint narrative,tokenized_sent,complaint_length,words_of_sents,zipped_tokens
0,Closing on a mortgage,We have already tried to contact the company w...,[We have already tried to contact the company ...,177,"[[We, have, already, tried, to, contact, the, ...",([We have already tried to contact the company...
1,Incorrect information on your report,Accounts added and/or created unbeknownst to m...,[Accounts added and/or created unbeknownst to ...,30,"[[Accounts, added, and/or, created, unbeknowns...",([Accounts added and/or created unbeknownst to...
2,Trouble during payment process,I have been trying to get my Private Mortgage ...,[I have been trying to get my Private Mortgage...,14,"[[I, have, been, trying, to, get, my, Private,...",([I have been trying to get my Private Mortgag...
3,Improper use of your report,2ND NOTICE OF PENDING LITIGATION SEEKING RELIE...,[2ND NOTICE OF PENDING LITIGATION SEEKING RELI...,10,"[[2ND, NOTICE, OF, PENDING, LITIGATION, SEEKIN...",([2ND NOTICE OF PENDING LITIGATION SEEKING REL...
4,Trouble during payment process,The mortgage company ( Roundpoint Mortgageg ) ...,[The mortgage company ( Roundpoint Mortgageg )...,20,"[[The, mortgage, company, (, Roundpoint, Mortg...",([The mortgage company ( Roundpoint Mortgageg ...


In [10]:
def takeFirst(x):
    return x[0]

def takeSecond(x):
    return x[1]


In [11]:
def selTopSents(summSize, numTopics, sortedVecs):
    topSentences = []
    sentIndexes = set()
    sCount = 0
    for i in range(summSize):
        for j in range(numTopics):
            vecs = sortedVecs[j]
            si = vecs[i][0]
            if si not in sentIndexes:
                topSentences.append(vecs[i])
                sentIndexes.add(si)
                sCount += 1
            if sCount == summSize:
                return topSentences

In [29]:
def lsi_summ(sentTokens,numTopics,sents):
    
    dct = corpora.Dictionary(sentTokens)
    corpus = list(map(lambda st: dct.doc2bow(st), sentTokens))    
    lsi = models.LsiModel(corpus, id2word=dct,num_topics=numTopics)
    
    vecCorpus = lsi[corpus]
    
    sortedVecs = list(map(lambda i: list(), range(numTopics)))
    for i,dv in enumerate(vecCorpus):
        for sc in dv:
            isc = (i, abs(sc[1]))
            sortedVecs[sc[0]].append(isc)
    sortedVecs = list(map(lambda iscl: sorted(iscl,key=takeSecond,reverse=True), sortedVecs))
    
    top_sents = selTopSents(summary_length,numTopics,sortedVecs)
    top_sents = sorted(top_sents,key=takeFirst)
    top_sentences = list(map(lambda ts: (sents[ts[0]], ts[1]), top_sents)) 
    
   
    return top_sentences
    

In [13]:
df1['lsi_model'] = df1['zipped_tokens'].apply(lambda x: lsi_summ(x[1],numTopics,x[0]))
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Issue,Consumer complaint narrative,tokenized_sent,complaint_length,words_of_sents,zipped_tokens,lsi_model
0,Closing on a mortgage,We have already tried to contact the company w...,[We have already tried to contact the company ...,177,"[[We, have, already, tried, to, contact, the, ...",([We have already tried to contact the company...,"LsiModel(num_terms=804, num_topics=5, decay=1...."
1,Incorrect information on your report,Accounts added and/or created unbeknownst to m...,[Accounts added and/or created unbeknownst to ...,30,"[[Accounts, added, and/or, created, unbeknowns...",([Accounts added and/or created unbeknownst to...,"LsiModel(num_terms=479, num_topics=5, decay=1...."
2,Trouble during payment process,I have been trying to get my Private Mortgage ...,[I have been trying to get my Private Mortgage...,14,"[[I, have, been, trying, to, get, my, Private,...",([I have been trying to get my Private Mortgag...,"LsiModel(num_terms=166, num_topics=5, decay=1...."
3,Improper use of your report,2ND NOTICE OF PENDING LITIGATION SEEKING RELIE...,[2ND NOTICE OF PENDING LITIGATION SEEKING RELI...,10,"[[2ND, NOTICE, OF, PENDING, LITIGATION, SEEKIN...",([2ND NOTICE OF PENDING LITIGATION SEEKING REL...,"LsiModel(num_terms=78, num_topics=5, decay=1.0..."
4,Trouble during payment process,The mortgage company ( Roundpoint Mortgageg ) ...,[The mortgage company ( Roundpoint Mortgageg )...,20,"[[The, mortgage, company, (, Roundpoint, Mortg...",([The mortgage company ( Roundpoint Mortgageg ...,"LsiModel(num_terms=134, num_topics=5, decay=1...."


In [14]:
for i in range(5):
    print(df1['lsi_model'][i])

LsiModel(num_terms=804, num_topics=5, decay=1.0, chunksize=20000)
LsiModel(num_terms=479, num_topics=5, decay=1.0, chunksize=20000)
LsiModel(num_terms=166, num_topics=5, decay=1.0, chunksize=20000)
LsiModel(num_terms=78, num_topics=5, decay=1.0, chunksize=20000)
LsiModel(num_terms=134, num_topics=5, decay=1.0, chunksize=20000)


In [15]:
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation,strip_numeric

lda_topics = df1['lsi_model'][3].show_topics(num_words=5)

topics = []
filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]

for topic in lda_topics:
    print(topic)
    topics.append(preprocess_string(topic[1], filters))

print(topics)

(0, '0.792*"XXXX" + 0.316*"from" + 0.316*"XX/XX/XXXX" + 0.316*"Inquiry" + 0.128*"Auto"')
(1, '0.355*"REPORT" + 0.316*"." + 0.294*"INQUIRES" + 0.259*"AND" + 0.226*"MY"')
(2, '-0.346*"Auto" + -0.346*"Financing" + -0.288*"," + -0.231*"Automobile" + -0.231*"Dealers"')
(3, '-0.333*"SECTION" + 0.208*"I" + 0.208*"THESE" + 0.208*"TO" + 0.194*"NOT"')
(4, '0.439*"Telephone" + 0.439*"Companies" + 0.327*"XXXX" + -0.225*"Miscellaneous" + -0.225*"Reptg"')
[['xxxx', 'from', 'xx', 'xx', 'xxxx', 'inquiry', 'auto'], ['report', 'inquires', 'and', 'my'], ['auto', 'financing', 'automobile', 'dealers'], ['section', 'i', 'these', 'to', 'not'], ['telephone', 'companies', 'xxxx', 'miscellaneous', 'reptg']]


In [17]:
df1['vectors_of_corpus'] = df1['zipped_tokens'].apply(lambda x: lsi_summ(x[1],numTopics,x[0]))
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Issue,Consumer complaint narrative,tokenized_sent,complaint_length,words_of_sents,zipped_tokens,lsi_model,vectors_of_corpus
0,Closing on a mortgage,We have already tried to contact the company w...,[We have already tried to contact the company ...,177,"[[We, have, already, tried, to, contact, the, ...",([We have already tried to contact the company...,"LsiModel(num_terms=804, num_topics=5, decay=1....","([(0, 1.7521104232567304), (1, 0.6774808986454..."
1,Incorrect information on your report,Accounts added and/or created unbeknownst to m...,[Accounts added and/or created unbeknownst to ...,30,"[[Accounts, added, and/or, created, unbeknowns...",([Accounts added and/or created unbeknownst to...,"LsiModel(num_terms=479, num_topics=5, decay=1....","([(0, 3.0896344186887204), (1, -0.832495707337..."
2,Trouble during payment process,I have been trying to get my Private Mortgage ...,[I have been trying to get my Private Mortgage...,14,"[[I, have, been, trying, to, get, my, Private,...",([I have been trying to get my Private Mortgag...,"LsiModel(num_terms=166, num_topics=5, decay=1....","([(0, 3.6235095759777747), (1, 2.1906868245943..."
3,Improper use of your report,2ND NOTICE OF PENDING LITIGATION SEEKING RELIE...,[2ND NOTICE OF PENDING LITIGATION SEEKING RELI...,10,"[[2ND, NOTICE, OF, PENDING, LITIGATION, SEEKIN...",([2ND NOTICE OF PENDING LITIGATION SEEKING REL...,"LsiModel(num_terms=78, num_topics=5, decay=1.0...","([(0, 0.04062560207907519), (1, 3.839272450725..."
4,Trouble during payment process,The mortgage company ( Roundpoint Mortgageg ) ...,[The mortgage company ( Roundpoint Mortgageg )...,20,"[[The, mortgage, company, (, Roundpoint, Mortg...",([The mortgage company ( Roundpoint Mortgageg ...,"LsiModel(num_terms=134, num_topics=5, decay=1....","([(0, 3.045455645730594), (1, 0.44237709223169..."


In [18]:
for i,dv in enumerate(df1['vectors_of_corpus'][3]):
    print(i)
    print(dv)

0
[(0, 0.04062560207907519), (1, 3.839272450725735), (2, 1.1192206164713605), (3, 3.8555334985084855), (4, 0.006973739730169864)]
1
[(0, 0.04020846169631995), (1, 1.8925355886297694), (2, -0.12187005051895), (3, -1.2006110807285475), (4, -0.3328156353453374)]
2
[(0, 0.04081262003362202), (1, 4.453899013218976), (2, 0.8628117772572789), (3, -1.89790152990596), (4, 0.8470790507943409)]
3
[(0, 0.040457700992923645), (1, 2.791273383511698), (2, 0.12299791917629942), (3, -1.7248831419606911), (4, -0.2768061649294941)]
4
[(0, 9.523787287950821), (1, 1.4903004072490058), (2, -3.702605361589173), (3, 0.31705910423543), (4, -1.4131357216107334)]
5
[(0, 3.444361945128859), (1, 0.4625939097035576), (2, -1.1723039149657601), (3, 0.10899039762091363), (4, -0.08423214114213537)]
6
[(0, 1.860234445385089), (1, 0.5211607087145169), (2, -1.1948153182520223), (3, 0.09096472396439387), (4, -0.739203239194721)]
7
[(0, 41.79864247209589), (1, -0.46109759034808834), (2, 1.5708677137204567), (3, -0.177680977

In [19]:
def vectors_of_sentences(x,numTopics):
    vectors = [None]*len(x)
    for i,dv in enumerate(x):
        array = [None]*numTopics
        for sc in dv:
            array[sc[0]] = sc[1]
        vectors[i] = array
        
    return vectors    
    
    

In [20]:
df1['sentence_vectors'] = df1['vectors_of_corpus'].apply(lambda x: vectors_of_sentences(x,numTopics))
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Issue,Consumer complaint narrative,tokenized_sent,complaint_length,words_of_sents,zipped_tokens,lsi_model,vectors_of_corpus,sentence_vectors
0,Closing on a mortgage,We have already tried to contact the company w...,[We have already tried to contact the company ...,177,"[[We, have, already, tried, to, contact, the, ...",([We have already tried to contact the company...,"LsiModel(num_terms=804, num_topics=5, decay=1....","([(0, 1.7521104232567304), (1, 0.6774808986454...","[[1.7521104232567304, 0.6774808986454266, -1.5..."
1,Incorrect information on your report,Accounts added and/or created unbeknownst to m...,[Accounts added and/or created unbeknownst to ...,30,"[[Accounts, added, and/or, created, unbeknowns...",([Accounts added and/or created unbeknownst to...,"LsiModel(num_terms=479, num_topics=5, decay=1....","([(0, 3.0896344186887204), (1, -0.832495707337...","[[3.0896344186887204, -0.8324957073375346, 1.4..."
2,Trouble during payment process,I have been trying to get my Private Mortgage ...,[I have been trying to get my Private Mortgage...,14,"[[I, have, been, trying, to, get, my, Private,...",([I have been trying to get my Private Mortgag...,"LsiModel(num_terms=166, num_topics=5, decay=1....","([(0, 3.6235095759777747), (1, 2.1906868245943...","[[3.6235095759777747, 2.1906868245943043, 0.63..."
3,Improper use of your report,2ND NOTICE OF PENDING LITIGATION SEEKING RELIE...,[2ND NOTICE OF PENDING LITIGATION SEEKING RELI...,10,"[[2ND, NOTICE, OF, PENDING, LITIGATION, SEEKIN...",([2ND NOTICE OF PENDING LITIGATION SEEKING REL...,"LsiModel(num_terms=78, num_topics=5, decay=1.0...","([(0, 0.04062560207907519), (1, 3.839272450725...","[[0.04062560207907519, 3.839272450725735, 1.11..."
4,Trouble during payment process,The mortgage company ( Roundpoint Mortgageg ) ...,[The mortgage company ( Roundpoint Mortgageg )...,20,"[[The, mortgage, company, (, Roundpoint, Mortg...",([The mortgage company ( Roundpoint Mortgageg ...,"LsiModel(num_terms=134, num_topics=5, decay=1....","([(0, 3.045455645730594), (1, 0.44237709223169...","[[3.045455645730594, 0.4423770922316906, 3.223..."


In [22]:
for i in (df1['sentence_vectors'][3]):
    print(i)

[0.04062560207907519, 3.839272450725735, 1.1192206164713605, 3.8555334985084855, 0.006973739730169864]
[0.04020846169631995, 1.8925355886297694, -0.12187005051895, -1.2006110807285475, -0.3328156353453374]
[0.04081262003362202, 4.453899013218976, 0.8628117772572789, -1.89790152990596, 0.8470790507943409]
[0.040457700992923645, 2.791273383511698, 0.12299791917629942, -1.7248831419606911, -0.2768061649294941]
[9.523787287950821, 1.4903004072490058, -3.702605361589173, 0.31705910423543, -1.4131357216107334]
[3.444361945128859, 0.4625939097035576, -1.1723039149657601, 0.10899039762091363, -0.08423214114213537]
[1.860234445385089, 0.5211607087145169, -1.1948153182520223, 0.09096472396439387, -0.739203239194721]
[41.79864247209589, -0.46109759034808834, 1.5708677137204567, -0.17768097773406644, -0.3976678699973022]
[1.860234445385089, 0.5211607087145169, -1.1948153182520223, 0.09096472396439387, -0.739203239194721]
[10.04469462060302, 0.10163312418823361, -2.1897090259414638, 0.3715663868989

In [24]:
df1['sorted_sentence_vectors'] = df1['zipped_tokens'].apply(lambda x: lsi_summ(x[1],numTopics,x[0]))
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Issue,Consumer complaint narrative,tokenized_sent,complaint_length,words_of_sents,zipped_tokens,lsi_model,vectors_of_corpus,sentence_vectors,sorted_sentence_vectors
0,Closing on a mortgage,We have already tried to contact the company w...,[We have already tried to contact the company ...,177,"[[We, have, already, tried, to, contact, the, ...",([We have already tried to contact the company...,"LsiModel(num_terms=804, num_topics=5, decay=1....","([(0, 1.7521104232567304), (1, 0.6774808986454...","[[1.7521104232567304, 0.6774808986454266, -1.5...","[[(94, 9.018659269268857), (18, 8.165966504334..."
1,Incorrect information on your report,Accounts added and/or created unbeknownst to m...,[Accounts added and/or created unbeknownst to ...,30,"[[Accounts, added, and/or, created, unbeknowns...",([Accounts added and/or created unbeknownst to...,"LsiModel(num_terms=479, num_topics=5, decay=1....","([(0, 3.0896344186887204), (1, -0.832495707337...","[[3.0896344186887204, -0.8324957073375346, 1.4...","[[(3, 17.147567041633348), (29, 14.96624425857..."
2,Trouble during payment process,I have been trying to get my Private Mortgage ...,[I have been trying to get my Private Mortgage...,14,"[[I, have, been, trying, to, get, my, Private,...",([I have been trying to get my Private Mortgag...,"LsiModel(num_terms=166, num_topics=5, decay=1....","([(0, 3.6235095759777747), (1, 2.1906868245943...","[[3.6235095759777747, 2.1906868245943043, 0.63...","[[(2, 6.350430257023428), (3, 4.30889606595348..."
3,Improper use of your report,2ND NOTICE OF PENDING LITIGATION SEEKING RELIE...,[2ND NOTICE OF PENDING LITIGATION SEEKING RELI...,10,"[[2ND, NOTICE, OF, PENDING, LITIGATION, SEEKIN...",([2ND NOTICE OF PENDING LITIGATION SEEKING REL...,"LsiModel(num_terms=78, num_topics=5, decay=1.0...","([(0, 0.04062560207907519), (1, 3.839272450725...","[[0.04062560207907519, 3.839272450725735, 1.11...","[[(7, 41.798642472095864), (9, 10.044694620603..."
4,Trouble during payment process,The mortgage company ( Roundpoint Mortgageg ) ...,[The mortgage company ( Roundpoint Mortgageg )...,20,"[[The, mortgage, company, (, Roundpoint, Mortg...",([The mortgage company ( Roundpoint Mortgageg ...,"LsiModel(num_terms=134, num_topics=5, decay=1....","([(0, 3.045455645730594), (1, 0.44237709223169...","[[3.045455645730594, 0.4423770922316906, 3.223...","[[(5, 4.573425214635741), (3, 4.32501842202992..."


In [25]:
for i in (df1['sorted_sentence_vectors'][3]):
    print(i)

[(7, 41.798642472095864), (9, 10.044694620603016), (4, 9.523787287950816), (5, 3.444361945128857), (6, 1.860234445385088), (8, 1.860234445385088), (2, 0.04081262003362974), (0, 0.04062560207908364), (3, 0.040457700992926525), (1, 0.04020846169632388)]
[(2, 4.453899013218975), (0, 3.839272450725746), (3, 2.7912733835116974), (1, 1.8925355886297686), (4, 1.4903004072489825), (6, 0.5211607087145119), (8, 0.5211607087145119), (5, 0.46259390970354947), (7, 0.4610975903481703), (9, 0.1016331241882101)]
[(4, 3.702605361589182), (9, 2.1897090259414718), (7, 1.5708677137204639), (6, 1.1948153182520242), (8, 1.1948153182520242), (5, 1.1723039149657626), (0, 1.1192206164712184), (2, 0.8628117772573414), (3, 0.12299791917635862), (1, 0.12187005051890634)]
[(0, 3.8555334985085197), (2, 1.8979015299059294), (3, 1.7248831419606905), (1, 1.2006110807285546), (9, 0.37156638689887245), (4, 0.31705910423529604), (7, 0.17768097773400993), (5, 0.10899039762087034), (6, 0.09096472396435061), (8, 0.090964723

In [27]:
df1['top_sorted_sentence_vectors'] = df1['zipped_tokens'].apply(lambda x: lsi_summ(x[1],numTopics,x[0]))
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Issue,Consumer complaint narrative,tokenized_sent,complaint_length,words_of_sents,zipped_tokens,lsi_model,vectors_of_corpus,sentence_vectors,sorted_sentence_vectors,top_sorted_sentence_vectors
0,Closing on a mortgage,We have already tried to contact the company w...,[We have already tried to contact the company ...,177,"[[We, have, already, tried, to, contact, the, ...",([We have already tried to contact the company...,"LsiModel(num_terms=804, num_topics=5, decay=1....","([(0, 1.7521104232567304), (1, 0.6774808986454...","[[1.7521104232567304, 0.6774808986454266, -1.5...","[[(94, 9.018659269268857), (18, 8.165966504334...","[(8, 3.9015025021205445), (94, 9.0186591935986..."
1,Incorrect information on your report,Accounts added and/or created unbeknownst to m...,[Accounts added and/or created unbeknownst to ...,30,"[[Accounts, added, and/or, created, unbeknowns...",([Accounts added and/or created unbeknownst to...,"LsiModel(num_terms=479, num_topics=5, decay=1....","([(0, 3.0896344186887204), (1, -0.832495707337...","[[3.0896344186887204, -0.8324957073375346, 1.4...","[[(3, 17.147567041633348), (29, 14.96624425857...","[(3, 17.14756704163335), (29, 7.692560168441424)]"
2,Trouble during payment process,I have been trying to get my Private Mortgage ...,[I have been trying to get my Private Mortgage...,14,"[[I, have, been, trying, to, get, my, Private,...",([I have been trying to get my Private Mortgag...,"LsiModel(num_terms=166, num_topics=5, decay=1....","([(0, 3.6235095759777747), (1, 2.1906868245943...","[[3.6235095759777747, 2.1906868245943043, 0.63...","[[(2, 6.350430257023428), (3, 4.30889606595348...","[(2, 6.350430257023435), (3, 3.5337089376305166)]"
3,Improper use of your report,2ND NOTICE OF PENDING LITIGATION SEEKING RELIE...,[2ND NOTICE OF PENDING LITIGATION SEEKING RELI...,10,"[[2ND, NOTICE, OF, PENDING, LITIGATION, SEEKIN...",([2ND NOTICE OF PENDING LITIGATION SEEKING REL...,"LsiModel(num_terms=78, num_topics=5, decay=1.0...","([(0, 0.04062560207907519), (1, 3.839272450725...","[[0.04062560207907519, 3.839272450725735, 1.11...","[[(7, 41.798642472095864), (9, 10.044694620603...","[(2, 4.453899013218978), (7, 41.7986424720959)]"
4,Trouble during payment process,The mortgage company ( Roundpoint Mortgageg ) ...,[The mortgage company ( Roundpoint Mortgageg )...,20,"[[The, mortgage, company, (, Roundpoint, Mortg...",([The mortgage company ( Roundpoint Mortgageg ...,"LsiModel(num_terms=134, num_topics=5, decay=1....","([(0, 3.045455645730594), (1, 0.44237709223169...","[[3.045455645730594, 0.4423770922316906, 3.223...","[[(5, 4.573425214635741), (3, 4.32501842202992...","[(5, 4.573425214635741), (10, 3.32626632978783..."


In [28]:
df1['top_sorted_sentence_vectors'][3]

[(2, 4.453899013218978), (7, 41.7986424720959)]

In [30]:
df1['summary'] = df1['zipped_tokens'].apply(lambda x: lsi_summ(x[1],numTopics,x[0]))
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Issue,Consumer complaint narrative,tokenized_sent,complaint_length,words_of_sents,zipped_tokens,lsi_model,vectors_of_corpus,sentence_vectors,sorted_sentence_vectors,top_sorted_sentence_vectors,summary
0,Closing on a mortgage,We have already tried to contact the company w...,[We have already tried to contact the company ...,177,"[[We, have, already, tried, to, contact, the, ...",([We have already tried to contact the company...,"LsiModel(num_terms=804, num_topics=5, decay=1....","([(0, 1.7521104232567304), (1, 0.6774808986454...","[[1.7521104232567304, 0.6774808986454266, -1.5...","[[(94, 9.018659269268857), (18, 8.165966504334...","[(8, 3.9015025021205445), (94, 9.0186591935986...",[(We have been through multiple account office...
1,Incorrect information on your report,Accounts added and/or created unbeknownst to m...,[Accounts added and/or created unbeknownst to ...,30,"[[Accounts, added, and/or, created, unbeknowns...",([Accounts added and/or created unbeknownst to...,"LsiModel(num_terms=479, num_topics=5, decay=1....","([(0, 3.0896344186887204), (1, -0.832495707337...","[[3.0896344186887204, -0.8324957073375346, 1.4...","[[(3, 17.147567041633348), (29, 14.96624425857...","[(3, 17.14756704163335), (29, 7.692560168441424)]",[(You MUST DELETE this unjust injurious allega...
2,Trouble during payment process,I have been trying to get my Private Mortgage ...,[I have been trying to get my Private Mortgage...,14,"[[I, have, been, trying, to, get, my, Private,...",([I have been trying to get my Private Mortgag...,"LsiModel(num_terms=166, num_topics=5, decay=1....","([(0, 3.6235095759777747), (1, 2.1906868245943...","[[3.6235095759777747, 2.1906868245943043, 0.63...","[[(2, 6.350430257023428), (3, 4.30889606595348...","[(2, 6.350430257023435), (3, 3.5337089376305166)]",[(I reached out to Ditech via a email ( after ...
3,Improper use of your report,2ND NOTICE OF PENDING LITIGATION SEEKING RELIE...,[2ND NOTICE OF PENDING LITIGATION SEEKING RELI...,10,"[[2ND, NOTICE, OF, PENDING, LITIGATION, SEEKIN...",([2ND NOTICE OF PENDING LITIGATION SEEKING REL...,"LsiModel(num_terms=78, num_topics=5, decay=1.0...","([(0, 0.04062560207907519), (1, 3.839272450725...","[[0.04062560207907519, 3.839272450725735, 1.11...","[[(7, 41.798642472095864), (9, 10.044694620603...","[(2, 4.453899013218978), (7, 41.7986424720959)]",[(I HAVE SUBMITTED A POLICE REPORT SEVERAL TIM...
4,Trouble during payment process,The mortgage company ( Roundpoint Mortgageg ) ...,[The mortgage company ( Roundpoint Mortgageg )...,20,"[[The, mortgage, company, (, Roundpoint, Mortg...",([The mortgage company ( Roundpoint Mortgageg ...,"LsiModel(num_terms=134, num_topics=5, decay=1....","([(0, 3.045455645730594), (1, 0.44237709223169...","[[3.045455645730594, 0.4423770922316906, 3.223...","[[(5, 4.573425214635741), (3, 4.32501842202992...","[(5, 4.573425214635741), (10, 3.32626632978783...",[(I have called numerous times and explained t...


In [31]:
df1['Consumer complaint narrative'][2]

'I have been trying to get my Private Mortgage Insurance Removed from my mortgage since XX/XX/XXXX when my mortgage dropped below 80 % loan to value. Last year my mortgage was sold from XXXX XXXX  ( Under mortgage # XXXX ) to Ditech Mortgage ( account # XXXX ). I reached out to Ditech via a email ( after being told to do so via phone representative ) request to remove my PMI on mortgage on XX/XX/XXXX and received no response at all from them, I even checked my junk box and nothing was there. My mortgage papers that I signed state an " Automatic Termination of PMI \'\' that states once my loan is below 78 % loan to value PMI will automatically terminate ( I have attached this document ). I reached out again today on XX/XX/XXXX to make this request via phone and was told initially to send the request that I already sent it too. I asked to speak with a supervisor and after being put on hold for about 30 minutes, I finally spoke to one. They told me that my loan to value must be under 70 %

In [33]:
df1['summary'][3]

[('I HAVE SUBMITTED A POLICE REPORT SEVERAL TIMES AND EXPERIAN HAS NOT OR REFUSED TO DELETE AND REMOVE THESE INQUIRES FROM MY CREDIT REPORT.',
  4.453899013218973),
 ('Agencies XXXX XXXX Inquiry from XX/XX/XXXX Services, Not Elsewhere XXXX XXXX Inquiry from XX/XX/XXXX Auto Financing XXXX  XXXX Inquiry from XX/XX/XXXX Auto Financing XXXX XXXX XXXX Inquiry from XX/XX/XXXX Automobile Dealers, New XXXX XXXX XXXX Inquiry from XX/XX/XXXX Auto Financing XXXX XXXX XXXX Inquiry from XX/XX/XXXX Auto Financing XXXX Inquiry from XX/XX/XXXX Auto Financing XXXX XXXX XXXX Inquiry from XX/XX/XXXX Automobile Dealers, New XXXX XXXX XXXX  Inquiry from XX/XX/XXXX Credit Unions XXXX XXXX Inquiry from XX/XX/XXXX Auto Financing XXXX XXXX XXXX Inquiry from XX/XX/XXXX Automobile Dealers, New XXXX XXXX XXXX Inquiry from XX/XX/XXXX Automobile Dealers, New XXXX  XXXX XXXX Inquiry from XX/XX/XXXX Miscellaneous Reptg.',
  41.79864247209589)]