# TED Talks Natural Language Processing
A Clustering Based Method to Extract Topics and Deeper Insights from Ted Talk Transcripts

In [1]:
# Import libraries 
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import seaborn as sns
import warnings
import matplotlib.pyplot as plt

# Download stopwords
# nltk.download('stopwords')

# Configure display
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 30
sns.set()
warnings.filterwarnings('ignore')
plt.ylim(0, 1)

# Configure run
%matplotlib inline

## Import and Clean Data

In [2]:
# Import data
ted_main = pd.read_csv('ted_main.csv', sep=',')
display(ted_main.head(2))

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520


In [3]:
ted_transcripts = pd.read_csv('transcripts.csv', sep=',')
display(ted_transcripts.head(3))

Unnamed: 0,transcript,url
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...


In [4]:
ted = ted_main.merge(ted_transcripts, left_on='url', right_on='url', how='outer')
ted = ted.dropna()

del ted_main, ted_transcripts

display(ted.head(2))

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views,transcript
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110,Good morning. How are you?(Laughter)It's been ...
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520,"Thank you so much, Chris. And it's truly a gre..."


## Stemming, Stopwords, Tokens

In [6]:
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords[0:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [7]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

## Helper Functions

In [8]:
def tokenize_stem(text): 
    # Tokenize by sentence, then by word
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # Filter any tokens not containing letters
    for token in tokens: 
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems 

def tokenize(text):
    # First tokenize by sentence, then by word 
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # Filter out any tokens not containing letters 
    for token in tokens: 
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [9]:
vocab_stemmed = []
vocab_tokenized = [] 

for title in ted['transcript']:
    words_stemmed = tokenize_stem(title)
    vocab_stemmed.extend(words_stemmed)
    
    words_tokenized = tokenize(title)
    vocab_tokenized.extend(words_tokenized)

In [10]:
vocab_df = pd.DataFrame({'words': vocab_tokenized}, index = vocab_stemmed)

## TF-IDF Matrix & Document Similarity

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define vectorizer params
tfidf_vectorizer = TfidfVectorizer(max_df = 0.80, max_features=200000,
                                  min_df=0.2, stop_words='english',
                                  use_idf=True, tokenizer=tokenize_stem, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(ted['transcript'])

display(tfidf_matrix.shape)

(2461, 434)

In [12]:
terms = tfidf_vectorizer.get_feature_names()

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [45]:
# Clustering reccomendations

sim_df = pd.DataFrame(cosine_similarity(tfidf_matrix))
# display(sim_df[0].sort_values(ascending=False))

0       1.000000
663     0.734907
1418    0.698718
1409    0.606966
2229    0.597086
557     0.557700
274     0.528512
1339    0.522329
997     0.515045
1111    0.513217
1110    0.513217
21      0.513194
1962    0.512853
2027    0.512513
2253    0.507814
1567    0.501695
578     0.500815
712     0.500107
1363    0.499174
603     0.498560
792     0.498050
444     0.497452
1633    0.497140
14      0.496407
685     0.495646
1629    0.493363
416     0.490579
452     0.488832
6       0.486557
177     0.477922
364     0.475827
1416    0.473847
1827    0.471318
405     0.471195
96      0.470296
727     0.470078
639     0.468149
2356    0.467823
486     0.466097
1351    0.466046
1504    0.463509
880     0.462571
1359    0.462457
2       0.461748
1317    0.458879
2131    0.458441
245     0.457880
309     0.457116
2259    0.456875
1737    0.455180
1880    0.450330
702     0.446560
2186    0.444283
1105    0.442303
1370    0.441960
95      0.440787
302     0.440465
745     0.440438
1330    0.4402

In [53]:
display(ted.iloc[[0]])
display(ted.iloc[[663]])
display(ted.iloc[[1418]])
display(ted.iloc[[1409]])


Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views,transcript
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110,Good morning. How are you?(Laughter)It's been ...


Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views,transcript
692,1234,"In this poignant, funny follow-up to his fable...",1008,TED2010,1265414400,57,Ken Robinson,Ken Robinson: Bring on the learning revolution!,1,1274691960,"[{'id': 7, 'name': 'Funny', 'count': 3000}, {'...","[{'id': 66, 'hero': 'https://pe.tedcdn.com/ima...",Author/educator,"['TED Brain Trust', 'children', 'creativity', ...",Bring on the learning revolution!,https://www.ted.com/talks/sir_ken_robinson_bri...,7266316,"I was here four years ago, and I remember, at ..."


Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views,transcript
1505,634,Sir Ken Robinson outlines 3 principles crucial...,1151,TED Talks Education,1366675200,43,Ken Robinson,Ken Robinson: How to escape education's death ...,1,1368198532,"[{'id': 21, 'name': 'Unconvincing', 'count': 2...","[{'id': 66, 'hero': 'https://pe.tedcdn.com/ima...",Author/educator,"['culture', 'education', 'student', 'teaching'...",How to escape education's death valley,https://www.ted.com/talks/ken_robinson_how_to_...,6657858,Thank you very much.I moved to America 12 year...


Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views,transcript
1496,632,"Rita Pierson, a teacher for 40 years, once hea...",468,TED Talks Education,1367884800,46,Rita Pierson,Rita Pierson: Every kid needs a champion,1,1367589737,"[{'id': 10, 'name': 'Inspiring', 'count': 5946...","[{'id': 66, 'hero': 'https://pe.tedcdn.com/ima...",Educator,"['children', 'education', 'motivation', 'teach...",Every kid needs a champion,https://www.ted.com/talks/rita_pierson_every_k...,7469445,I have spent my entire life either at the scho...


## K-Means Clustering

In [16]:
from sklearn.cluster import KMeans

k_clusters = 8

km = KMeans(n_clusters=k_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

In [17]:
from sklearn.externals import joblib

# Line below allows us to save our model
# joblib.dump(km, 'doc_cluster_transcript.pkl')

# Line below allows us to load a saved model
km = joblib.load('doc_cluster_transcript.pkl')

clusters = km.labels_.tolist()

In [19]:
# Aggregate data
talks = {'title': ted['title'],
         'tags': ted['tags'],
         'views': ted['views'],
         'cluster': clusters}

ted_clustered = pd.DataFrame(talks)
# display(ted_clustered)

### Interesting Metrics to Explore

In [20]:
# How many talks in each cluster?
pd.Series(clusters).value_counts()

3    747
1    725
0    397
2    207
5    125
7     91
4     86
6     83
dtype: int64

In [21]:
# Comparing clusters with views
grouped = ted_clustered['views'].groupby(ted_clustered['cluster'])

grouped.mean()

cluster
0    1.240805e+06
1    1.413166e+06
2    1.283570e+06
3    2.513875e+06
4    2.337126e+06
5    1.403045e+06
6    1.790145e+06
7    1.112356e+06
Name: views, dtype: float64

Looks like clusters 3 and 4 get the most views by far. We can extract the keywords from these clusters to understand why they get the most views. 

### Top Terms Per Cluster
This will give us an idea as to what the main topic of the cluster is

In [22]:
# Order cluster centers by proximity to origin
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(k_clusters):
    print("Cluster " + str(i) + " words:")
    
    for ind in order_centroids[i, :6]:
        print(vocab_df.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'))
    print('')
    print('')


Cluster 0 words:
b'country'
b'government'
b'percent'
b'needs'
b'states'
b'global'


Cluster 1 words:
b'actually'
b'designed'
b'technology'
b'human'
b'computer'
b'data'


Cluster 2 words:
b'water'
b'planet'
b'earth'
b'food'
b'energy'
b'life'


Cluster 3 words:
b'laughter'
b'said'
b'say'
b'life'
b'got'
b'love'


Cluster 4 words:
b'brain'
b'actually'
b'human'
b'active'
b'different'
b'right'


Cluster 5 words:
b'women'
b'men'
b'woman'
b'mother'
b'said'
b'story'


Cluster 6 words:
b'music'
b'play'
b'sound'
b'laughter'
b'thank'
b'hears'


Cluster 7 words:
b'city'
b'buildings'
b'space'
b'new'
b'street'
b'designed'




We can look at the top clusters: **Cluster 3** has associated with it words such as life, love, and laughter. 
**Cluster 4** has associated with it words such as brain, different and human. 

We're now starting to form a picture of what these clusters are about. 