In [1]:
import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contraction import CONTRACTION_MAP
import unicodedata

In [2]:
# Load the raw transcripts and the talk metadata (paths are relative to the
# notebook's working directory).
transcript = pd.read_csv('transcripts.csv')
ted = pd.read_csv('ted_main.csv')

# Keep only the metadata columns used downstream and attach them to each
# transcript by matching on the talk URL (inner join, the pandas default).
metadata_columns = ['main_speaker', 'related_talks', 'tags', 'title', 'url']
ted_new = ted[metadata_columns]
data = transcript.merge(ted_new, on='url')

In [3]:
# Load the spaCy pipeline shipped by the installed `en_core_web_sm` package.
# `nlp` is the callable that lemmatize_text applies to raw text.
import en_core_web_sm
nlp = en_core_web_sm.load()

In [4]:
def remove_accented_chars(text):
    """Strip accents from ``text`` and drop anything that is not ASCII.

    The string is decomposed with NFKD so that an accented letter becomes a
    base letter followed by combining marks; encoding to ASCII with
    ``'ignore'`` then discards the marks. Characters that have no ASCII
    decomposition are removed entirely.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [5]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Replace contractions in ``text`` with their expanded forms.

    Parameters
    ----------
    text : str
        Text to process.
    contraction_mapping : dict
        Maps each contraction to its expansion. Matching is case-insensitive.

    Returns
    -------
    str
        ``text`` with every known contraction expanded and every remaining
        apostrophe removed.
    """
    # Longest keys first: regex alternation takes the first alternative that
    # matches, so a short key ("can't") must not shadow a longer one
    # ("can't've"). Keys are escaped so they are always matched literally.
    alternatives = sorted((key for key in contraction_mapping if key),
                          key=len, reverse=True)

    # An empty alternation would match the empty string at every position,
    # so the substitution is skipped when there is nothing to expand.
    if alternatives:
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in alternatives)),
            flags=re.IGNORECASE | re.DOTALL)
        # Fallback lookup for keys stored with capitals (e.g. "I'm") when the
        # text uses a different casing.
        lowered_mapping = {key.lower(): value
                           for key, value in contraction_mapping.items()}

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or contraction_mapping.get(match.lower())
                                    or lowered_mapping.get(match.lower()))
            if not expanded_contraction:
                # No usable expansion: leave the matched text untouched.
                return match
            first_char = match[0]
            # Keep the casing of the original first letter. Keys that start
            # with an apostrophe ("'cause") must not overwrite the first
            # letter of their expansion.
            if first_char.isalpha():
                expanded_contraction = first_char + expanded_contraction[1:]
            return expanded_contraction

        text = contractions_pattern.sub(expand_match, text)

    # Drop whatever apostrophes are left (possessives, stray quotes).
    return re.sub("'", "", text)

In [6]:
def remove_special_characters(text, remove_digits=False):
    """Replace every character that is not an ASCII letter, digit or
    whitespace with a single space.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_digits : bool, default False
        When True, digits are replaced with spaces as well.

    Returns
    -------
    str
        The cleaned text; its length equals the length of ``text``.
    """
    # The upper-case range must be A-Z: the range A-z also spans the ASCII
    # characters between 'Z' and 'a' ([, \, ], ^, _ and `), which would then
    # be kept as if they were letters.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, ' ', text)

In [7]:
def lemmatize_text(text):
    """Return ``text`` with each token replaced by its lemma, space-joined.

    Tokens whose lemma is the placeholder ``'-PRON-'`` keep their original
    surface form instead.
    """
    doc = nlp(text)
    pieces = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            pieces.append(token.text)
        else:
            pieces.append(token.lemma_)
    return ' '.join(pieces)

In [8]:
# Strip accents first, then expand contractions. The result goes into a new
# column so the raw `transcript` column stays available.
transcript['transcript_clean'] = (
    transcript['transcript']
    .apply(remove_accented_chars)
    .apply(expand_contractions)
)


In [9]:
# Replace punctuation and other symbols with spaces; digits are kept because
# remove_digits is left at its default of False.
cleaned_column = transcript['transcript_clean']
transcript['transcript_clean'] = cleaned_column.apply(remove_special_characters)

In [10]:
def remove_whitespace(x):
    """Collapse every run of whitespace in ``x`` to one space and trim both ends.

    Parameters
    ----------
    x : object
        Usually a string. Any non-string value (for example NaN for a missing
        transcript) is returned unchanged.

    Returns
    -------
    object
        The normalised string, or ``x`` itself when it is not a string.
    """
    # Explicit type guard instead of a bare `except: pass`, which would also
    # hide unrelated failures such as KeyboardInterrupt.
    if not isinstance(x, str):
        return x
    # split() with no argument splits on any whitespace run and drops the
    # leading/trailing empties, so re-joining yields single spaces only.
    return " ".join(x.split())

In [11]:
# Collapse the runs of spaces left behind by the earlier substitutions.
cleaned_column = transcript['transcript_clean']
transcript['transcript_clean'] = cleaned_column.apply(remove_whitespace)

In [12]:
# Start from NLTK's English stop words but keep the negations "no" and "not"
# in the text, since removing them would flip the meaning of a sentence.
stopword_list = nltk.corpus.stopwords.words('english')
for negation in ('no', 'not'):
    stopword_list.remove(negation)


In [13]:
from sklearn.feature_extraction import text

# The corpus as a plain list of cleaned transcript strings.
Text = transcript['transcript_clean'].tolist()

# `input` only accepts 'filename', 'file' or 'content' (the default); it
# describes how each item handed to fit/transform is interpreted and is not
# the place to pass the corpus. Passing the list there is rejected by
# scikit-learn's parameter validation, so the default is used and the corpus
# goes to fit_transform below.
tfidf = text.TfidfVectorizer(stop_words=stopword_list)

# Sparse document-term matrix: one row per transcript, one column per term.
matrix = tfidf.fit_transform(Text)
print(matrix.shape)


(2467, 59850)


In [14]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of the TF-IDF matrix, i.e.
# between every pair of transcripts. similar[i, j] scores transcript i
# against transcript j.
similar =cosine_similarity(matrix)


In [15]:
def get_similar_articles(x, top_n=4, self_index=None):
    """Return the titles of the talks most similar to one talk, comma-joined.

    Parameters
    ----------
    x : array-like
        One row of the similarity matrix: the scores of a single talk against
        every talk.
    top_n : int, default 4
        Number of titles to return.
    self_index : int, optional
        Position of the talk that ``x`` belongs to. When given, that position
        is excluded explicitly. When omitted, the single highest-scoring entry
        is assumed to be the talk itself and is dropped, which is only safe
        when no other talk ties with it.

    Returns
    -------
    str
        The selected titles joined with commas, ordered from least to most
        similar.
    """
    if top_n <= 0:
        return ""
    order = np.asarray(x).argsort()
    if self_index is None:
        picks = order[-(top_n + 1):-1]
    else:
        # Remove the talk itself wherever it lands in the ranking; a duplicate
        # transcript can tie with it at the top.
        picks = order[order != self_index][-top_n:]
    # argsort yields positions, not index labels, so select positionally.
    return ",".join(data['title'].iloc[picks])


# NOTE(review): assumes row i of `similar` (built from `transcript`) and row i
# of `data` (the merged frame) describe the same talk -- confirm the merge
# keeps row order and count.
data['similar_talks'] = [get_similar_articles(row, self_index=i)
                         for i, row in enumerate(similar)]


In [16]:
# Preview the merged frame, including the new similar_talks column.
data.head(5)

Unnamed: 0,transcript,url,main_speaker,related_talks,tags,title,similar_talks
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...,Ken Robinson,"[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...","['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,"A one-man world summit,How to run a company wi..."
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...,Al Gore,"[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...","['alternative energy', 'cars', 'climate change...",Averting the climate crisis,"Design and discovery,A one-man world summit,A ..."
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...,David Pogue,"[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...","['computers', 'entertainment', 'interface desi...",Simplicity sells,"A one-man world summit,Nerdcore comedy,Cool tr..."
3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...,Majora Carter,"[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...","['MacArthur grant', 'activism', 'business', 'c...",Greening the ghetto,How students of color confront impostor syndro...
4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...,Hans Rosling,"[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...","['Africa', 'Asia', 'Google', 'demo', 'economic...",The best stats you've ever seen,"Religions and babies,The good news of the deca..."


In [17]:
# Spot-check the recommendations stored for the row labelled 12.
data['similar_talks'][12]

"HIV and flu -- the vaccine strategy,Lessons from the 1918 flu,How we'll stop polio for good,The case for optimism"

In [18]:
from sklearn.decomposition import LatentDirichletAllocation

n_topics = 10
lda = LatentDirichletAllocation(n_components=n_topics,random_state=0)

# NOTE(review): LDA is normally fitted on raw term counts rather than TF-IDF
# weights; consider a CountVectorizer input if the topics look noisy.
topics = lda.fit_transform(matrix)
top_n_words = 5

# Build the vocabulary once instead of once per looked-up word.
# get_feature_names() no longer exists in current scikit-learn, so prefer
# get_feature_names_out() and fall back only on old installations.
feature_names = (tfidf.get_feature_names_out()
                 if hasattr(tfidf, 'get_feature_names_out')
                 else tfidf.get_feature_names())

# For each topic: its highest-weighted terms and their weights, strongest first.
t_words, word_strengths = {}, {}
for t_id, t in enumerate(lda.components_):
    top_indices = t.argsort()[:-top_n_words - 1:-1]
    t_words[t_id] = [feature_names[i] for i in top_indices]
    word_strengths[t_id] = t[top_indices]
t_words



{0: ['wikipedia', 'not', 'images', 'page', 'notre'],
 1: ['mockingbird', 'mockingbirds', 'juanito', 'mimus', 'vente'],
 2: ['not', 'one', 'want', 'people', 'cannot'],
 3: ['molas', 'mola', 'not', 'laughter', 'think'],
 4: ['wk', 'kasungu', 'mala', 'kamkwamba', 'da'],
 5: ['clonie', 'eminem', 'spector', 'ie', 'huggable'],
 6: ['not', 'people', 'one', 'like', 'would'],
 7: ['not', 'like', 'one', 'laughter', 'people'],
 8: ['halter', 'vie', 'lui', 'mots', 'rachelle'],
 9: ['kiteflyers', 'hewerdine', 'not', 'music', 'would']}

In [19]:
from sklearn.decomposition import NMF

n_topics = 10
nmf = NMF(n_components=n_topics,random_state=0)

# Factorise the TF-IDF matrix; `topics` holds one row of topic weights per
# transcript.
topics = nmf.fit_transform(matrix)
top_n_words = 5

# Build the vocabulary once instead of once per looked-up word.
# get_feature_names() no longer exists in current scikit-learn, so prefer
# get_feature_names_out() and fall back only on old installations.
feature_names = (tfidf.get_feature_names_out()
                 if hasattr(tfidf, 'get_feature_names_out')
                 else tfidf.get_feature_names())

# For each topic: its highest-weighted terms and their weights, strongest first.
t_words, word_strengths = {}, {}
for t_id, t in enumerate(nmf.components_):
    top_indices = t.argsort()[:-top_n_words - 1:-1]
    t_words[t_id] = [feature_names[i] for i in top_indices]
    word_strengths[t_id] = t[top_indices]
t_words

{0: ['not', 'laughter', 'know', 'people', 'like'],
 1: ['data', 'actually', 'computer', 'like', 'information'],
 2: ['music', 'applause', 'sound', 'guitar', 'play'],
 3: ['not', 'people', 'world', 'countries', 'africa'],
 4: ['cancer', 'cells', 'patients', 'disease', 'cell'],
 5: ['water', 'earth', 'planet', 'ocean', 'universe'],
 6: ['women', 'men', 'girls', 'not', 'woman'],
 7: ['city', 'cities', 'building', 'design', 'buildings'],
 8: ['brain', 'neurons', 'brains', 'cells', 'human'],
 9: ['kids', 'school', 'children', 'education', 'students']}

In [20]:
from sklearn.pipeline import Pipeline

# Chain the already-fitted vectoriser and NMF model so that cleaned text can
# be mapped to topic weights in a single call.
pipe = Pipeline([('tfidf', tfidf), ('nmf', nmf)])

document_id = 6
document_text = transcript['transcript_clean'].iloc[document_id]

# `t` has a single row: the topic weights of the chosen transcript.
t = pipe.transform([document_text])
relevant_topics = np.where(t > 0.01)[1]

print('Topic distribution for document #{}: \n'.format(document_id), t)
print('Relevant topics for document #{}: \n'.format(document_id), relevant_topics)
print('\nTranscript:\n', document_text[:500], '...')

# Look the talk up in the metadata by URL to compare the inferred topics
# with its hand-assigned tags.
document_url = transcript['url'].iloc[document_id]
talk = ted[ted['url'] == document_url]

print('\nTrue tags from ted_main.csv: \n', talk['tags'])

Topic distribution for document #6: 
 [[0.11322825 0.         0.         0.         0.         0.
  0.038167   0.         0.         0.02529564]]
Relevant topics for document #6: 
 [0 6 9]

Transcript:
 On September 10 the morning of my seventh birthday I came downstairs to the kitchen where my mother was washing the dishes and my father was reading the paper or something and I sort of presented myself to them in the doorway and they said Hey happy birthday And I said I am seven And my father smiled and said Well you know what that means do not you And I said Yeah that I am going to have a party and a cake and get a lot of presents And my dad said Well yes But more importantly being seven means ...

True tags from ted_main.csv: 
 6    ['Christianity', 'God', 'atheism', 'comedy', '...
Name: tags, dtype: object
