#### TFIDF

In [6]:
# loading libraries
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
import nltk
import re
from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
from collections import Counter
import ast

In [7]:
# importing data
ted_main = pd.read_csv('ted_main.csv')
ted_main['tags'] = ted_main['tags'].apply(lambda x: ast.literal_eval(x))
transcripts = pd.read_csv('transcripts.csv')
ted_merged = pd.merge(left=transcripts,
                      right=ted_main,
                      left_on='url',
                      right_on='url')
transcript = ted_merged.transcript

In [8]:
def tokenize(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
#    stems = [stemmer.stem(t) for t in filtered_tokens]
    return filtered_tokens

In [13]:
doc = transcript.tolist()
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(doc) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

Wall time: 1min 29s
(2467, 364)


####  Spectural Clustering

In [14]:
n_cluster = range(2,11)
best_param = []
list_score = []
for n in n_cluster:
    model = SpectralClustering(n_clusters=n)
    model.fit(tfidf_matrix)
    label = model.labels_
    list_score.append(silhouette_score(tfidf_matrix, label))
list_score = np.array(list_score)
best_param.append(n_cluster[list_score.argmax()])
print(best_param)

[8]


In [15]:
model = SpectralClustering(n_clusters=8)
model.fit(tfidf_matrix)
label = model.labels_
clusters = label.tolist()
Counter(clusters)

Counter({0: 266, 2: 166, 1: 361, 6: 578, 5: 770, 7: 82, 4: 156, 3: 88})

#### KMeans Clustering

In [217]:
n_cluster = list(range(2,11))
param_grid = {'n_clusters': n_cluster}
kmeans = KMeans()
kmeans_cv = GridSearchCV(kmeans, param_grid)
kmeans_cv.fit(tfidf_matrix)
print("Tuned Kmeans Parameter: {}".format(kmeans_cv.best_params_))

Tuned Kmeans Parameter: {'n_clusters': 10}


In [228]:
km_model = KMeans(n_clusters=8)
km_model.fit(tfidf_matrix)
km_label = km_model.labels_
km_clusters = km_label.tolist()
Counter(km_clusters)

Counter({5: 248, 4: 451, 6: 368, 2: 450, 1: 544, 7: 104, 0: 182, 3: 120})

In [32]:
import warnings
warnings.filterwarnings("ignore")
ted_merged['cluster'] = clusters
ted_w_cluster = ted_merged[['title','transcript','tags','cluster']]
ted_w_cluster[ted_w_cluster['cluster']==7][:50]

Unnamed: 0,title,transcript,tags,cluster
23,"Why we love, why we cheat",I'd like to talk today about the two biggest s...,"[cognitive science, culture, evolution, gender...",7
24,Happiness in body and soul,I bet you're worried.(Laughter)I was worried. ...,"[culture, entertainment, gender, global issues...",7
101,"""Black Men Ski""","Has anyone ever been to Aspen, Colorado? It's ...","[culture, entertainment, live music, music, pe...",7
169,Tales of passion,Thank you so much. It's really scary to be her...,"[South America, entertainment, global issues, ...",7
289,What security means to me,I think it'll be a relief to some people and a...,"[activism, culture, global issues, spoken word...",7
449,"A passionate, personal case for education","This is my first trip, my first foreign trip a...","[culture, education, global issues, politics, ...",7
541,The surprising spread of Idol TV,"I'd like to ask you, what do these three peopl...","[culture, entertainment, global issues, poetry...",7
562,Photographing the hidden story,"My name is Ryan Lobo, and I've been involved i...","[Asia, art, humanity, photography, storytellin...",7
643,"Radical women, embracing tradition",Salaam. Namaskar. Good morning. Given my TED p...,"[culture, feminism, social change, women]",7
782,"Women, wartime and the dream of peace",I woke up in the middle of the night with the ...,"[drones, global issues, iraq, war, women]",7


In [31]:
ted_w_cluster

Unnamed: 0,title,transcript,tags,cluster
0,Do schools kill creativity?,Good morning. How are you?(Laughter)It's been ...,"[children, creativity, culture, dance, educati...",0
1,Averting the climate crisis,"Thank you so much, Chris. And it's truly a gre...","[alternative energy, cars, climate change, cul...",0
2,Simplicity sells,"(Music: ""The Sound of Silence,"" Simon & Garfun...","[computers, entertainment, interface design, m...",0
3,Greening the ghetto,If you're here today — and I'm very happy that...,"[MacArthur grant, activism, business, cities, ...",2
4,The best stats you've ever seen,"About 10 years ago, I took on the task to teac...","[Africa, Asia, Google, demo, economics, global...",1
5,Why we do what we do,Thank you. I have to tell you I'm both challen...,"[business, culture, entertainment, goal-settin...",6
6,Letting go of God,"On September 10, the morning of my seventh bir...","[Christianity, God, atheism, comedy, culture, ...",0
7,Behind the design of Seattle's library,I'm going to present three projects in rapid f...,"[architecture, collaboration, culture, design,...",2
8,Let's teach religion -- all religion -- in sch...,It's wonderful to be back. I love this wonderf...,"[God, TED Brain Trust, atheism, brain, cogniti...",5
9,A life of purpose,"I'm often asked, ""What surprised you about the...","[Christianity, God, culture, happiness, leader...",6


In [18]:
c0_tag = [item for sub_list in ted_w_cluster[ted_w_cluster.cluster == 0]['tags'].tolist() for item in sub_list]
c1_tag = [item for sub_list in ted_w_cluster[ted_w_cluster.cluster == 1]['tags'].tolist() for item in sub_list]
c2_tag = [item for sub_list in ted_w_cluster[ted_w_cluster.cluster == 2]['tags'].tolist() for item in sub_list]
c3_tag = [item for sub_list in ted_w_cluster[ted_w_cluster.cluster == 3]['tags'].tolist() for item in sub_list]
c4_tag = [item for sub_list in ted_w_cluster[ted_w_cluster.cluster == 4]['tags'].tolist() for item in sub_list]
c5_tag = [item for sub_list in ted_w_cluster[ted_w_cluster.cluster == 5]['tags'].tolist() for item in sub_list]
c6_tag = [item for sub_list in ted_w_cluster[ted_w_cluster.cluster == 6]['tags'].tolist() for item in sub_list]
c7_tag = [item for sub_list in ted_w_cluster[ted_w_cluster.cluster == 7]['tags'].tolist() for item in sub_list]
# c8_tag = [item for sub_list in ted_w_cluster[ted_w_cluster.cluster == 8]['tags'].tolist() for item in sub_list]
# c9_tag = [item for sub_list in ted_w_cluster[ted_w_cluster.cluster == 9]['tags'].tolist() for item in sub_list]

In [19]:
c0_tag_stat = pd.Series(Counter(c0_tag))
c1_tag_stat = pd.Series(Counter(c1_tag))
c2_tag_stat = pd.Series(Counter(c2_tag))
c3_tag_stat = pd.Series(Counter(c3_tag))
c4_tag_stat = pd.Series(Counter(c4_tag))
c5_tag_stat = pd.Series(Counter(c5_tag))
c6_tag_stat = pd.Series(Counter(c6_tag))
c7_tag_stat = pd.Series(Counter(c7_tag))
# c8_tag_stat = pd.Series(Counter(c8_tag))
# c9_tag_stat = pd.Series(Counter(c9_tag))

In [20]:
print(c0_tag_stat.nlargest(10))
print ("")
print (c1_tag_stat.nlargest(10))
print ("")
print (c2_tag_stat.nlargest(10))
print ("")
print (c3_tag_stat.nlargest(10))
print ("")
print (c4_tag_stat.nlargest(10))
print ("")
print (c5_tag_stat.nlargest(10))
print ("")
print(c6_tag_stat.nlargest(10))
print ("")
print(c7_tag_stat.nlargest(10))
print ("")
# print(c8_tag_stat.nlargest(10))
# print ("")
# print(c9_tag_stat.nlargest(10))
# print ("")

entertainment    87
culture          75
humor            64
technology       51
TEDx             49
science          39
performance      39
music            36
comedy           36
design           36
dtype: int64

global issues    198
business         115
economics         91
technology        75
politics          59
TEDx              54
culture           52
social change     52
health            49
society           49
dtype: int64

design           106
cities            65
architecture      55
technology        42
culture           32
art               32
collaboration     24
creativity        23
business          22
innovation        22
dtype: int64

technology       53
data             38
science          25
health           17
TEDx             16
communication    13
business         12
computers        12
global issues    12
medicine         12
dtype: int64

science          86
technology       57
environment      50
exploration      36
nature           30
TEDx             30
desi