In [7]:
%%capture
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

!pip install nltk
import nltk
import string
import warnings
from scipy.stats import pearsonr
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch NLTK's 'stopwords' resource (presumably what the
# `nltk.corpus.stopwords` import above reads from — confirm).
nltk.download('stopwords')
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including pandas/scipy warnings that would otherwise flag real problems.
warnings.filterwarnings('ignore')

In [8]:
# Load the TED talks dataset from the CSV next to the notebook.
df = pd.read_csv('ted_talks.csv')

# Leave the preview as the cell's last expression so Jupyter renders it as a
# table; print() falls back to a wrapped, truncated plain-text dump.
df.head(10)

   Index                 main_speaker  \
0      1              Gaya Herrington   
1      2                   Jon M. Chu   
2      3              Nicky Trevorrow   
3      4            Scott Hershberger   
4      5                 Noah Charney   
5      6               Agnes Kalibata   
6      7               Scott Galloway   
7      8                Hartmut Neven   
8      9  Alex Luebke, Vivek Kumbhari   
9     10              David Friedberg   

                                               title  \
0   Will the end of economic growth come by desig...   
1               Why creativity thrives on challenges   
2        How to make your cat happier — in 3 minutes   
3               How did the Milky Way get its shape?   
4  The tragedy of the one guy who was right about...   
5    How to empower farmers — and nourish the planet   
6  Why young people are worse off than their pare...   
7  Quantum computers aren't what you think — they...   
8  How you could see inside your body — with

In [9]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

(4472, 7)

In [10]:
# Count the missing values in each column of the loaded frame.
df.isna().sum()

Index              0
main_speaker       0
title              0
details            0
posted             0
url                0
Unnamed: 6      4465
dtype: int64

In [12]:
# "Unnamed: 6" is null in 4465 of the 4472 rows, so it carries no usable data.
# errors="ignore" keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
df = df.drop(columns=["Unnamed: 6"], errors="ignore")
df.head(10)

Unnamed: 0,Index,main_speaker,title,details,posted,url
0,1,Gaya Herrington,Will the end of economic growth come by desig...,"What if solving poverty, caring for nature and...",Posted Oct 2024,https://www.ted.com/talks/gaya_herrington_will...
1,2,Jon M. Chu,Why creativity thrives on challenges,Filmmaker Jon M. Chu has enjoyed an incredible...,Posted Oct 2024,https://www.ted.com/talks/jon_m_chu_why_creati...
2,3,Nicky Trevorrow,How to make your cat happier — in 3 minutes,There's a simple way to increase your cat's ha...,Posted Sep 2024,https://www.ted.com/talks/nicky_trevorrow_how_...
3,4,Scott Hershberger,How did the Milky Way get its shape?,"Thirteen billion years ago, the gas and dust p...",Posted Sep 2024,https://www.ted.com/talks/scott_hershberger_ho...
4,5,Noah Charney,The tragedy of the one guy who was right about...,"Laocoön, a seer and priest, was deeply suspici...",Posted Aug 2024,https://www.ted.com/talks/noah_charney_the_tra...
5,6,Agnes Kalibata,How to empower farmers — and nourish the planet,Africa's smallholder farmers feed millions of ...,Posted Aug 2024,https://www.ted.com/talks/agnes_kalibata_how_t...
6,7,Scott Galloway,Why young people are worse off than their pare...,"In this special conversation, NYU marketing pr...",Posted Jul 2024,https://www.ted.com/talks/scott_galloway_why_y...
7,8,Hartmut Neven,Quantum computers aren't what you think — they...,Quantum computers obtain superpowers by tappin...,Posted Jul 2024,https://www.ted.com/talks/hartmut_neven_quantu...
8,9,"Alex Luebke, Vivek Kumbhari",How you could see inside your body — with a mi...,Would you swallow a micro-robot? In a gutsy de...,Posted Jun 2024,https://www.ted.com/talks/alex_luebke_vivek_ku...
9,10,David Friedberg,A scientific breakthrough that could transform...,Agriculture fundamentally changed the way huma...,Posted Jun 2024,https://www.ted.com/talks/david_friedberg_a_sc...


In [13]:
# Re-check missing values per column now that "Unnamed: 6" is gone.
df.isna().sum()

Index           0
main_speaker    0
title           0
details         0
posted          0
url             0
dtype: int64

In [17]:
# Split each 'posted' string on single spaces into separate columns. The rows
# previewed above look like "Posted Oct 2024", so column 2 holds the year.
splitted = df['posted'].str.split(' ', expand=True)

# Creating a column for the year of the talk (no month column is created here).
df['year'] = splitted[2].astype('int')
df['year']

0       2024
1       2024
2       2024
3       2024
4       2024
        ... 
4467    2006
4468    2006
4469    2006
4470    2006
4471    2006
Name: year, Length: 4472, dtype: int64

In [19]:
# Let's combine the title and the details of the talk.
df['details'] = df['title'] + ' ' + df['details']

# Removing the unnecessary information.
# Chain .dropna() so `df` is rebound to a brand-new frame. Calling
# dropna(inplace=True) on the column-subset slice mutates a temporary view and
# triggers SettingWithCopy problems here and in later `df['details'] = ...` cells.
df = df[['main_speaker', 'details']].dropna()
df.head()

Unnamed: 0,main_speaker,details
0,Gaya Herrington,Will the end of economic growth come by desig...
1,Jon M. Chu,Why creativity thrives on challenges Filmmaker...
2,Nicky Trevorrow,How to make your cat happier — in 3 minutes Th...
3,Scott Hershberger,How did the Milky Way get its shape? Thirteen ...
4,Noah Charney,The tragedy of the one guy who was right about...


In [20]:
# Independent copy of the frame as it stands here (title + details combined).
# The similarity/recommendation functions below use `data` as their default frame.
data = df.copy()

In [22]:
# Run `remove_stopwords` over every talk's text and preview the result.
# NOTE(review): `remove_stopwords` is not defined in any cell shown in this
# notebook; it must already exist in the kernel for this cell to run.
df['details'] = df['details'].apply(remove_stopwords)
df.head()

Unnamed: 0,main_speaker,details
0,Gaya Herrington,end economic growth come design — disaster? so...
1,Jon M. Chu,creativity thrives challenges filmmaker jon m....
2,Nicky Trevorrow,make cat happier — 3 minutes there's simple wa...
3,Scott Hershberger,milky way get shape? thirteen billion years ag...
4,Noah Charney,"tragedy one guy right trojan horse laocoön, se..."


In [23]:
punctuations_list = string.punctuation

# Build the deletion table once; it never changes, so rebuilding it on every
# call (once per row under .apply) is wasted work.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations_list)


def cleaning_punctuations(text):
    """Return ``text`` with every character in ``punctuations_list`` removed.

    Only the ASCII punctuation from ``string.punctuation`` is stripped, so
    non-ASCII marks such as the em dash are left in place.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without ASCII punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)


# Strip punctuation from every talk's text, then preview the cleaned frame.
df['details'] = df['details'].apply(cleaning_punctuations)
df.head()

Unnamed: 0,main_speaker,details
0,Gaya Herrington,end economic growth come design — disaster sol...
1,Jon M. Chu,creativity thrives challenges filmmaker jon m ...
2,Nicky Trevorrow,make cat happier — 3 minutes theres simple way...
3,Scott Hershberger,milky way get shape thirteen billion years ago...
4,Noah Charney,tragedy one guy right trojan horse laocoön see...


In [36]:
%%capture
# Build a word-level TF-IDF vectorizer and fit it on the cleaned talk texts
# in a single chained call (fit() hands back the fitted vectorizer itself).
vectorizer = TfidfVectorizer(analyzer='word').fit(df['details'])

In [46]:
def get_similarities(talk_content, data=data):
    """Score every talk in ``data`` against a query text.

    The query and each row of ``data['details']`` are vectorised with the
    module-level fitted ``vectorizer`` and then compared.

    Parameters
    ----------
    talk_content : list of str or str
        The query. A single-element list such as ``['AI']`` is the intended
        form; a bare string is wrapped into a one-element list.
    data : pandas.DataFrame
        Frame with a ``'details'`` text column. The default is the ``data``
        object that existed when this function was defined.

    Returns
    -------
    (sim, pea) : tuple of two lists
        ``sim[i]`` is the cosine similarity and ``pea[i]`` the Pearson
        correlation between the query vector and the i-th row of ``data``
        (positional order, not index label).
    """
    # The vectorizer expects an iterable of documents and rejects a bare
    # string, so wrap it for convenience.
    if isinstance(talk_content, str):
        talk_content = [talk_content]

    # Nothing to score; also avoids sklearn rejecting a zero-row matrix.
    if len(data) == 0:
        return [], []

    # Getting vector for the input talk_content.
    query_sparse = vectorizer.transform(talk_content)
    talk_array1 = query_sparse.toarray().squeeze()

    # Vectorise all talks once and keep the result sparse. Re-locating each
    # row with `data[data['details'] == details]` rescanned the whole frame
    # per row and returned several rows whenever two talks shared the same
    # text, which broke the Pearson computation.
    talk_matrix = vectorizer.transform(data['details'])

    # Calculating cosine similarities of the (first) query document against
    # every talk in one call.
    sim = list(cosine_similarity(query_sparse, talk_matrix)[0])

    # Calculating pearson correlation row by row; only one row is densified
    # at a time to keep memory use flat.
    pea = []
    for row_pos in range(talk_matrix.shape[0]):
        talk_array2 = talk_matrix[row_pos].toarray().squeeze()
        pea.append(pearsonr(talk_array1, talk_array2)[0])

    return sim, pea


In [50]:
def recommend_talks(talk_content, data=data, top_n=5):
    """Display the talks in ``data`` that best match ``talk_content``.

    Talks are ranked by cosine similarity, with Pearson correlation as the
    tie-breaker, both descending. The ranking is computed on a copy, so the
    frame passed in (including the shared default) is left unmodified.

    Parameters
    ----------
    talk_content : list of str
        The query, forwarded unchanged to ``get_similarities``.
    data : pandas.DataFrame
        Frame with ``'main_speaker'`` and ``'details'`` columns. The default
        is the ``data`` object that existed when this function was defined.
    top_n : int, default 5
        Number of top-ranked talks to display.

    Returns
    -------
    None
        The result is shown with ``display``; nothing is returned.
    """
    # Forward `data` explicitly: previously the argument was ignored here and
    # the scores were always computed on get_similarities' own default frame.
    cos_sim, pea_sim = get_similarities(talk_content, data)

    # assign() + sort_values() both return new frames, so the caller's frame
    # keeps its original columns and row order.
    ranked = data.assign(cos_sim=cos_sim, pea_sim=pea_sim).sort_values(
        by=['cos_sim', 'pea_sim'], ascending=[False, False])

    display(ranked[['main_speaker', 'details']].head(top_n))

In [51]:
# Example query. It is passed as a one-element list because it ends up in
# vectorizer.transform (via get_similarities).
talk_content = ['AI']
recommend_talks(talk_content)

Unnamed: 0,main_speaker,details
1636,Stuart Russell,3 principles for creating safer AI How can we ...
1442,Philipp Gerbert,The basics of AI for business On the TED@BCG s...
125,Sylvain Duranton,How humans and AI can work together to create ...
88,Rajeev Ronanki,"For future AI, the prompt is you Technologist ..."
1027,Kai-Fu Lee,How AI can save our humanity AI is massively t...


In [52]:
# Second example query, again a one-element list.
# NOTE(review): only the first two results below look related to the query; the
# rest presumably have zero cosine similarity and are ordered by the Pearson
# tie-breaker alone — confirm before relying on the tail of the list.
talk_content = ['Trojan']
recommend_talks(talk_content)

Unnamed: 0,main_speaker,details
4,Noah Charney,The tragedy of the one guy who was right about...
1032,Einav Zamir Dembin,Did ancient Troy really exist? When Homer's Il...
2083,TED-Ed,Year In Ideas 2015 The Year In Ideas 2015.
1250,Isaac Mizrahi,How the button changed fashion How the simple ...
4386,Jill Sobule,"Global warming's theme song, ""Manhattan in Jan..."
