# Data Wrangling

In [1]:
import pandas as pd
import numpy as npf

In [2]:
ted_data = pd.read_csv("ted_main.csv")
youtube_data = pd.read_csv("ted_youtube.csv")

In [3]:
ted_data.drop(['description','film_date','published_date','url'],axis=1,inplace=True)

> Cleaning the dataframe by _dropping and renaming_ the columns

In [4]:
ted_data=ted_data.rename({'comments':'ted_nof_comments','duration':'ted_duration','event':'ted_event','languages':'ted_avlb_lang','published_dt':
                'ted_published_dt','ratings':'ted_ratings','related_talks':'ted_talks_related','tags':'ted_tags','title':'ted_title','views':'ted_views'},axis=1)


In [5]:
ted_data=ted_data.rename({'comments':'ted_nof_comments','duration':'ted_duration','event':'ted_event','languages':'ted_avlb_lang','published_dt':
                'ted_published_dt','ratings':'ted_ratings','related_talks':'ted_talks_related','tags':'ted_tags','title':'ted_title','views':'ted_views'},axis=1)

In [6]:
youtube_data = youtube_data.rename({'view_count':'youtube_views','duration':'youtube_duration','categories':'youtube_categories','tags':'youtube_tags',
                    'dislike_count':'youtube_dislikes','like_count':'youtube_likes','average_rating':'youtube_avg_ratings','title':'youtube_title'},axis=1)

In [7]:
speaker_occupation_na = ['Writer, educator, CEO','Entrepreneur, Politician','Technologist, CEO','Musician','Artist','Photographer, Artist, Activist']

> ## Replacing NA 

Handling NA values is an important step in Data Wrangling, youtube_likes and youtube_dislikes columns contains some NA values ad these are numbers we are replacing them with zeros

In [8]:
youtube_data['youtube_likes']=youtube_data['youtube_likes'].fillna(0)
youtube_data['youtube_dislikes']=youtube_data['youtube_dislikes'].fillna(0)

> ## Convert  spanish text to ASCII> 

Columns main_speaker,name and ted_title have spanish scripts in some of the observations so encoding everything to ascii is necessary.

In [9]:
ted_data['main_speaker'] = ted_data['main_speaker'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
ted_data['name'] = ted_data['name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
ted_data['ted_title'] = ted_data['ted_title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

In [10]:
youtube_data['youtube_title'] = youtube_data['youtube_title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

> ## Hurdles for merging TED and Youtube data

The main challenge is in merging youtube data and TED data, there is no any common field like Video ID.
*  the details are onlly the title name and speaker names. 
*  Titles are not exactly alike in both dataset. 
*  As there is a chance that one speaker delivered more than one titles, we cannot match only with speaker names, so merging based on titles is the best bet.
*  The format of the title is completely different, TED data contains title alone,but youtube data have 'title|speaker' or 'speaker|title' as formats.
*  So panda.series.string functions like concatenation,merge,replace and strip are used to separte the titles and merge the data

In [11]:
youtube_data['melt1'] = youtube_data.youtube_title.str.split(':')

youtube_data['talk1']=youtube_data['melt1'].str.get(1)
youtube_data['author1']=youtube_data['melt1'].str.get(0)

youtube_data['melt2'] = youtube_data.youtube_title.str.split('|')

youtube_data['talk2']=youtube_data['melt2'].str.get(0)
youtube_data['author2']=youtube_data['melt2'].str.get(1)

In [12]:
youtube_data['talk3']= youtube_data.youtube_title[(youtube_data.author2.isna()) & (youtube_data.talk1.isna())]
youtube_data['talk3']=youtube_data['talk3'].fillna(youtube_data['author1'].str.cat(youtube_data['talk1'],sep=' '))
youtube_data['talk3']=youtube_data['talk3'].fillna(youtube_data['author2'].str.cat(youtube_data['talk2'],sep=' '))

In [13]:
youtube_data['talk3']=youtube_data['talk3'].str.strip()
youtube_data['talk3']=youtube_data['talk3'].str.lower()

In [14]:
youtube_data['split']= youtube_data.talk2[(youtube_data['talk1'].isna())&(youtube_data['author2'].isna())].str.split(' ')
youtube_data['author3']=youtube_data['split'].str.get(0).str.cat(youtube_data['split'].str.get(1),sep=' ')

In [15]:
youtube_data['author3'] = youtube_data['author3'].fillna(youtube_data['author2'])
youtube_data['author3'] = youtube_data['author3'].fillna(youtube_data['author1'])
youtube_data['talk3']=youtube_data['talk3'].str.replace('withenglishsubtitles','')
youtube_data['talk3']=youtube_data['talk3'].str.replace(',','').str.replace('?','').str.replace('|','').str.replace(':','').str.replace('"','').str.replace('!','').str.replace('-','')

In [16]:
ted_data['ted_title']=ted_data['ted_title'].str.replace(',','').str.replace('?','').str.replace('|','').str.replace(':','').str.replace('"','').str.replace('!','').str.replace('-','')
ted_data['ted_title']=ted_data['ted_title'].str.replace(':','')
#ted_data['name1']=ted_data['name1'].str.replace('[^A-Za-z0-9]+','')
ted_data['ted_title']=ted_data['ted_title'].str.strip()
ted_data['ted_title']=ted_data['ted_title'].str.lower()

In [17]:
youtube_data['talk_cont']=youtube_data['talk3'].str.replace('[^A-Za-z0-9]+','')
ted_data['talk_cont']=ted_data['name'].str.replace('[^A-Za-z0-9]+','')

ted_data['name_slice']= ted_data['talk_cont'].str.slice(start=0,stop=25)
youtube_data['talk_slice']=youtube_data['talk_cont'].str.slice(start=0,stop=25)

In [18]:
youtube_data['author3'] = youtube_data['author3'].str.strip()
youtube_data['author3'] = youtube_data['author3'].str.lower()
#ted_data['main_speaker'] = below new
youtube_data['author3'] = youtube_data['author3'].str.replace("'s",'')
youtube_data['author3'] = youtube_data['author3'].str.replace("&",'and')
ted_data['main_speaker'] = ted_data['main_speaker'].str.lower()
ted_data['main_speaker'] = ted_data['main_speaker'].str.strip()
ted_data['main_speaker'] = ted_data['main_speaker'].str.replace('+','and')

#ted_data['main_speaker'] = ted_data['main_speaker'].str.replace('a','and')


#ted_data['main_speaker'] = ted_data['main_speaker'].str.replace('a','and')
youtube_data['author3'] = youtube_data['author3'].str.replace("&",'and')

ted_data['main_speaker'] = ted_data['main_speaker'].str.replace(' and ','')
youtube_data['author3'] = youtube_data['author3'].str.replace(' and ','')

In [19]:
ted_data['name_slice']=ted_data['name_slice'].str.strip()
ted_data['name_slice']=ted_data['name_slice'].str.lower()

In [20]:
title_merge = pd.merge(ted_data,youtube_data,left_on='name_slice',right_on='talk_slice')

In [21]:
title_merge[title_merge.duplicated(['name'], keep=False)]

Unnamed: 0,ted_nof_comments,ted_duration,ted_event,film_dt,ted_avlb_lang,main_speaker,name,num_speaker,ted_published_dt,ted_ratings,...,talk1,author1,melt2,talk2,author2,talk3,split,author3,talk_cont_y,talk_slice
127,383,1396,TED2008,2/2/2008,33,philip zimbardo,Philip Zimbardo: The psychology of evil,1,9/23/2008,"[{'id': 23, 'name': 'Jaw-dropping', 'count': 7...",...,,The psychology of evil | Philip Zimbardo,"[The psychology of evil , Philip Zimbardo]",The psychology of evil,Philip Zimbardo,philip zimbardo the psychology of evil,,philip zimbardo,philipzimbardothepsychologyofevil,philipzimbardothepsycholo
128,383,1396,TED2008,2/2/2008,33,philip zimbardo,Philip Zimbardo: The psychology of evil,1,9/23/2008,"[{'id': 23, 'name': 'Jaw-dropping', 'count': 7...",...,The psychology of time,Philip Zimbardo,[Philip Zimbardo: The psychology of time],Philip Zimbardo: The psychology of time,,philip zimbardo the psychology of time,,philip zimbardo,philipzimbardothepsychologyoftime,philipzimbardothepsycholo
129,166,394,TED2009,2/6/2009,37,philip zimbardo,Philip Zimbardo: The psychology of time,1,6/22/2009,"[{'id': 24, 'name': 'Persuasive', 'count': 389...",...,,The psychology of evil | Philip Zimbardo,"[The psychology of evil , Philip Zimbardo]",The psychology of evil,Philip Zimbardo,philip zimbardo the psychology of evil,,philip zimbardo,philipzimbardothepsychologyofevil,philipzimbardothepsycholo
130,166,394,TED2009,2/6/2009,37,philip zimbardo,Philip Zimbardo: The psychology of time,1,6/22/2009,"[{'id': 24, 'name': 'Persuasive', 'count': 389...",...,The psychology of time,Philip Zimbardo,[Philip Zimbardo: The psychology of time],Philip Zimbardo: The psychology of time,,philip zimbardo the psychology of time,,philip zimbardo,philipzimbardothepsychologyoftime,philipzimbardothepsycholo
1701,59,551,TED@BCG Paris,5/18/2016,36,shubhendu sharma,Shubhendu Sharma: How to grow a forest in your...,1,7/14/2016,"[{'id': 8, 'name': 'Informative', 'count': 680...",...,,How to grow a forest in your backyard | Shubhe...,"[How to grow a forest in your backyard , Shub...",How to grow a forest in your backyard,Shubhendu Sharma,shubhendu sharma how to grow a forest in your ...,,shubhendu sharma,shubhendusharmahowtogrowaforestinyourbackyard,shubhendusharmahowtogrowa
1702,59,551,TED@BCG Paris,5/18/2016,36,shubhendu sharma,Shubhendu Sharma: How to grow a forest in your...,1,7/14/2016,"[{'id': 8, 'name': 'Informative', 'count': 680...",...,How to grow a tiny forest anywhere,Shubhendu Sharma,[Shubhendu Sharma: How to grow a tiny forest a...,Shubhendu Sharma: How to grow a tiny forest an...,,shubhendu sharma how to grow a tiny forest an...,,shubhendu sharma,shubhendusharmahowtogrowatinyforestanywhere,shubhendusharmahowtogrowa


In [22]:
title_merge = title_merge.drop(index=[126,128,1697],axis=0)

In [23]:
ted_minus = ted_data[ted_data.ted_title.isin(title_merge.ted_title) == False]
youtube_minus = youtube_data[youtube_data.youtube_title.isin(title_merge.youtube_title) == False]

> Even after merging based on titles and speakers, there are __several hundreds of titles does not have exact words__ in titke descriptions, so the words are __tokenized by nltk package__ for further finding similarity

In [24]:
import nltk
from nltk import word_tokenize 
youtube_minus['tokens_u']= youtube_minus['talk3'].apply(nltk.word_tokenize)
ted_minus['tokens']=ted_minus['ted_title'].apply(nltk.word_tokenize)
youtube_minus['tokens']=youtube_minus['tokens_u'].str.slice(start=2,stop=None,step=None)

youtube_minus['sp_tokens']= youtube_minus['author3'].apply(nltk.word_tokenize)
ted_minus['sp_tokens']=ted_minus['main_speaker'].apply(nltk.word_tokenize)


from collections import Counter
youtube_minus['y_talk_co'] = youtube_minus.apply(lambda g:Counter(g['tokens_u']),axis=1)

ted_minus['t_talk_co'] = ted_minus.apply(lambda g:Counter(g['tokens']),axis=1)

youtube_minus['y_sp_co'] = youtube_minus.apply(lambda g:Counter(g['sp_tokens']),axis=1)
ted_minus['t_sp_co'] = ted_minus.apply(lambda g:Counter(g['sp_tokens']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the ca

In [25]:
ted_single_talk = ted_minus.groupby('main_speaker').filter(lambda g:g['ted_title'].count() == 1)
you_single_talk = youtube_minus.groupby('author3').filter(lambda g:g['youtube_title'].count() == 1)

In [26]:
ted_multi_talk = ted_minus.groupby('main_speaker').filter(lambda g:g['ted_title'].count() > 1)
youtube_multi_talk = youtube_minus.groupby('author3').filter(lambda g:g['youtube_title'].count() > 1)

In [27]:
single_talk_merge_main = pd.merge(ted_single_talk,you_single_talk,left_on='main_speaker',right_on='author3')

In [28]:
ted_single_talk_minus = ted_single_talk[ted_single_talk.main_speaker.isin(single_talk_merge_main.main_speaker)==False]
youtube_single_talk_minus = you_single_talk[you_single_talk.author3.isin(single_talk_merge_main.author3)==False]

> ## Similarity Measure - Cosine

*  __Cosine Similarity__ is used to find the similarity between tokenized texts.

In [29]:
from scipy.spatial.distance import cosine
def second_function(ted_talk,youtube_talk,ted_speaker,youtube_speaker):
   # print(ted_talk,youtube_talk,ted_speaker,youtube_speaker)
    speak_concat = list(ted_speaker.keys()| youtube_speaker.keys())
    c_vect = [ted_speaker.get(word, 0) for word in speak_concat]       
    d_vect = [youtube_speaker.get(word, 0) for word in speak_concat]
   # print(c_vect,d_vect)
    cos1 = 1 - cosine(c_vect,d_vect)
   # print(cos1)
    if cos1 > 0.4:
       # print('enter')
        words  = list(ted_talk.keys() | youtube_talk.keys())
        e_vect = [ted_talk.get(word, 0) for word in words]
        f_vect = [youtube_talk.get(word, 0) for word in words]
        cost = 1 - cosine(e_vect,f_vect)
       # print(cost)
        return cost

In [30]:
single_talk = ted_single_talk_minus.apply(lambda x:youtube_single_talk_minus.apply(lambda y: second_function(x['t_talk_co'],y['y_talk_co'],x['t_sp_co'],y['y_sp_co']),axis =1),axis=1).rename(columns = youtube_single_talk_minus['youtube_title'],
                                                                                          index = ted_single_talk_minus['name'])

In [31]:
single_talk.reset_index(inplace = True)
single_talk_similarity= pd.melt(single_talk,id_vars= 'index')

In [32]:
single_talk_similar = single_talk_similarity[single_talk_similarity['value'] > 0.4]

In [33]:
single_talk_similar.rename(columns={'variable':'ytitle'}, inplace=True)
single_talk_similar.rename(columns={'index':'ttitle'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [34]:
new = []

for index,row in single_talk_similar.iterrows():
    ted_dat = ted_single_talk_minus[ted_single_talk_minus.name == row['ttitle']]
    #print('ted',ted_dat)
    #print(type(ted_dat))
    you_dat = youtube_single_talk_minus[(youtube_single_talk_minus['youtube_title'] == row['ytitle'])]
   # print('you',you_dat)
   # print(type(you_dat))
    ted_dat.reset_index(drop=True, inplace=True)
    you_dat.reset_index(drop=True, inplace=True)
    df = pd.concat([ted_dat,you_dat],axis =1)
   # print('concat',df)
   # print(type(df))
   # print('df',df)
    #break
  #  new.append(df,ignore_index=True)
    new.append(df)
    #print(new)
   # break

In [35]:
single_talk_merge = pd.concat(new)

In [36]:
utube_rem_after_single_merge = youtube_single_talk_minus[youtube_single_talk_minus.author3.isin(single_talk_merge.author3)==False]

In [37]:
multi_talk = ted_multi_talk.apply(lambda x:youtube_multi_talk.apply(lambda y: second_function(x['t_talk_co'],y['y_talk_co'],x['t_sp_co'],y['y_sp_co']),axis =1),axis=1).rename(columns = youtube_multi_talk['youtube_title'],
                                                                                          index = ted_multi_talk['name'])

In [38]:
multi_talk.reset_index(inplace = True)
multi_talk_similarity= pd.melt(multi_talk,id_vars= 'index')


In [39]:
multi_talk_similar = multi_talk_similarity[multi_talk_similarity['value'] >0.3]

In [40]:
multi_talk_similar.rename(columns={'variable':'ytitle'}, inplace=True)
multi_talk_similar.rename(columns={'index':'ttitle'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [41]:
new1 = []

for index,row in multi_talk_similar.iterrows():
    ted_dat1 = ted_multi_talk[ted_multi_talk.name == row['ttitle']]
    #print('ted',ted_dat)
    #print(type(ted_dat))
    you_dat1 = youtube_multi_talk[(youtube_multi_talk['youtube_title'] == row['ytitle'])]
   # print('you',you_dat)
   # print(type(you_dat))
    ted_dat1.reset_index(drop=True, inplace=True)
    you_dat1.reset_index(drop=True, inplace=True)
    df2 = pd.concat([ted_dat1,you_dat1],axis =1)
   # print('concat',df)
   # print(type(df))
   # print('df',df)
    #break
  #  new.append(df,ignore_index=True)
    new1.append(df2)
    #print(new)
   # break

In [42]:
multi_merge = pd.concat(new1)

In [43]:
us = youtube_single_talk_minus[youtube_single_talk_minus.author3.isin(single_talk_merge.author3)==False]
um = youtube_multi_talk[youtube_multi_talk.talk3.isin(multi_merge.talk3)==False]

In [44]:
ts = ted_single_talk_minus[ted_single_talk_minus.main_speaker.isin(single_talk_merge.main_speaker)==False]
tm = ted_multi_talk[ted_multi_talk.ted_title.isin(multi_merge.ted_title)==False]

In [45]:
utube_rem = pd.concat([us,um],axis=0)
ted_rem = pd.concat([ts,tm],axis =0)

In [46]:
from scipy.spatial.distance import cosine
def third_function(ted_talk,youtube_talk,ted_speaker,youtube_speaker):
   # print(ted_talk,youtube_talk,ted_speaker,youtube_speaker)
    speak_concat = list(ted_speaker.keys()| youtube_speaker.keys())
    c_vect = [ted_speaker.get(word, 0) for word in speak_concat]       
    d_vect = [youtube_speaker.get(word, 0) for word in speak_concat]
   # print(c_vect,d_vect)
    cos1 = 1 - cosine(c_vect,d_vect)
   # print(cos1)
    if cos1 > 0.2:
       # print('enter')
        words  = list(ted_talk.keys() | youtube_talk.keys())
        e_vect = [ted_talk.get(word, 0) for word in words]
        f_vect = [youtube_talk.get(word, 0) for word in words]
        cost = 1 - cosine(e_vect,f_vect)
        if cost > 0.2:
            return cost

In [47]:
test = ted_rem.apply(lambda x:utube_rem.apply(lambda y: third_function(x['t_talk_co'],y['y_talk_co'],x['t_sp_co'],y['y_sp_co']),axis =1),axis=1).rename(columns = utube_rem['youtube_title'],
                                                                                          index = ted_rem['name'])

In [48]:
test.reset_index(inplace = True)
test1 = pd.melt(test,id_vars= 'index')

In [49]:
final = test1[test1['value'] > 0.55]

In [50]:
final.rename(columns={'variable':'ytitle'}, inplace=True)
final.rename(columns={'index':'ttitle'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [51]:
new3 = []

for index,row in final.iterrows():
    ted_dat3 = ted_rem[ted_rem.name == row['ttitle']]
    #print('ted',ted_dat)
    #print(type(ted_dat))
    you_dat3 = utube_rem[(utube_rem['youtube_title'] == row['ytitle'])]
   # print('you',you_dat)
   # print(type(you_dat))
    ted_dat3.reset_index(drop=True, inplace=True)
    you_dat3.reset_index(drop=True, inplace=True)
    df3 = pd.concat([ted_dat3,you_dat3],axis =1)
   # print('concat',df)
   # print(type(df))
   # print('df',df)
    #break
  #  new.append(df,ignore_index=True)
    new3.append(df3)
    #print(new)
   # break

In [52]:
rem_merge = pd.concat(new3)


In [53]:
merge1 = pd.concat([title_merge,single_talk_merge_main],axis=0)

In [54]:
merge3= pd.concat([single_talk_merge,multi_merge,rem_merge],axis=0)

In [55]:
merge3.columns[merge3.columns.duplicated()]

Index(['talk_cont', 'tokens', 'sp_tokens'], dtype='object')

In [56]:
single_talk_merge = single_talk_merge.loc[:,~single_talk_merge.columns.duplicated()]
multi_merge = multi_merge.loc[:,~multi_merge.columns.duplicated()]
rem_merge = rem_merge.loc[:,~rem_merge.columns.duplicated()]

> ## Final Merge

Youtube data and TED data are merged by
*  Titles with exact descriptions
*  Speakers with only one Talk
*  Tokenize and finding similarity between texts using cosine similarity
*  Merging all the data into one dataframe

In [57]:
final = pd.concat([title_merge,single_talk_merge_main,single_talk_merge,multi_merge,rem_merge],axis=0)

In [58]:
final

Unnamed: 0.1,Unnamed: 0,author1,author2,author3,description,film_dt,fulltitle,fulltitle_compare,id,main_speaker,...,y_sp_co,y_talk_co,youtube_avg_ratings,youtube_categories,youtube_dislikes,youtube_duration,youtube_likes,youtube_tags,youtube_title,youtube_views
0,0,Averting the climate crisis | Al Gore,Al Gore,al gore,http://www.ted.com With the same humor and hum...,2/25/2006,Averting the climate crisis | Al Gore,"['averting', 'climate', 'crisis', 'al', 'gore']",rDiGYuQicpA,al gore,...,,,4.046632,['Nonprofits & Activism'],184.0,1017,588.0,"['Al', 'Gore', 'TED', 'TEDTalks', 'Talks', 'cl...",Averting the climate crisis | Al Gore,162022
1,0,David Pogue,,david pogue,http://www.ted.com New York Times columnist Da...,2/24/2006,David Pogue: Simplicity sells,"['david', 'pogue', 'simplicity', 'sell']",NEjZt0y6OOw,david pogue,...,,,4.690756,['Science & Technology'],46.0,1326,549.0,"['David', 'Pogue', 'TED', 'TEDTalks', 'Talks',...",David Pogue: Simplicity sells,79874
2,0,Greening the ghetto | Majora Carter,Majora Carter,majora carter,http://www.ted.com In an emotionally charged ...,2/26/2006,Greening the ghetto | Majora Carter,"['greening', 'ghetto', 'majora', 'carter']",gQ-cZRmHfs4,majora carter,...,,,4.860330,['Nonprofits & Activism'],36.0,1156,995.0,"['Majora', 'Carter', 'TED', 'TEDTalks', 'TED20...",Greening the ghetto | Majora Carter,130532
3,0,Why we do what we do | Tony Robbins,Tony Robbins,tony robbins,"Tony Robbins discusses the ""invisible forces"" ...",2/2/2006,Why we do what we do | Tony Robbins,"['tony', 'robbins']",Cpc-t-Uwv1I,tony robbins,...,,,4.810614,['Science & Technology'],3003.0,1351,60423.0,"['Tony Robbins', 'TED', 'TEDTalks', 'emotion',...",Why we do what we do | Tony Robbins,11424257
4,0,Letting go of God | Julia Sweeney,Julia Sweeney,julia sweeney,"http://www.ted.com Julia Sweeney (God Said, ""H...",2/24/2006,Letting go of God | Julia Sweeney,"['letting', 'go', 'god', 'julia', 'sweeney']",OtIyx687ytk,julia sweeney,...,,,4.639640,['Comedy'],430.0,1038,4343.0,"['TED', 'TEDTalks', 'Talks', 'atheist', 'athei...",Letting go of God | Julia Sweeney,448541
5,0,A life of purpose | Rick Warren,Rick Warren,rick warren,"http://www.ted.com Pastor, philanthropist and...",2/25/2006,A life of purpose | Rick Warren,"['life', 'purpose', 'rick', 'warren']",640BQNxB5mc,rick warren,...,,,4.175432,['Education'],1168.0,1308,4498.0,"['Rick', 'Warren', 'ted', 'tedtalks', 'Christi...",A life of purpose | Rick Warren,605052
6,0,Nicholas Negroponte,,nicholas negroponte,http://www.ted.com Nicholas Negroponte talks ...,2/23/2006,"Nicholas Negroponte: One Laptop per Child, two...","['nicholas', 'negroponte', 'one', 'laptop', 'p...",y_TKjfgjiQs,nicholas negroponte,...,,,4.777778,['Science & Technology'],12.0,993,204.0,"['Nicholas', 'Negroponte', 'ted', 'tedtalks', ...","Nicholas Negroponte: One Laptop per Child, two...",40460
7,0,Nicholas Negroponte,,nicholas negroponte,http://www.ted.com Nicholas Negroponte talks ...,12/12/2007,"Nicholas Negroponte: One Laptop per Child, two...","['nicholas', 'negroponte', 'one', 'laptop', 'p...",y_TKjfgjiQs,nicholas negroponte,...,,,4.777778,['Science & Technology'],12.0,993,204.0,"['Nicholas', 'Negroponte', 'ted', 'tedtalks', ...","Nicholas Negroponte: One Laptop per Child, two...",40460
8,0,Sirena Huang,,sirena huang,http://www.ted.com Violinist Sirena Huang give...,2/23/2006,Sirena Huang: An 11-year-old's magical violin,"['sirena', 'huang', 'year', 'old', 'magical', ...",#NAME?,sirena huang,...,,,4.850458,['Music'],147.0,1527,3785.0,"['Sirena', 'Huang', 'TED', 'TEDTalks', 'Talks'...",Sirena Huang: An 11-year-old's magical violin,514182
9,0,Jennifer Lin,,jennifer lin,http://www.ted.com Pianist and composer Jennif...,2/26/2004,"Jennifer Lin: Improvising on piano, aged 14","['jennifer', 'lin', 'improvising', 'piano', 'a...",UU0MX8epDro,jennifer lin,...,,,4.834970,['Music'],21.0,1491,488.0,"['Jennifer', 'Lin', 'TED', 'TEDTalks', 'Talks'...","Jennifer Lin: Improvising on piano, aged 14",73891


In [59]:
final.columns

Index(['Unnamed: 0', 'author1', 'author2', 'author3', 'description', 'film_dt',
       'fulltitle', 'fulltitle_compare', 'id', 'main_speaker', 'melt1',
       'melt2', 'name', 'name_slice', 'num_speaker', 'sp_tokens',
       'sp_tokens_x', 'sp_tokens_y', 'speaker_occupation', 'split', 't_sp_co',
       't_talk_co', 'talk1', 'talk2', 'talk3', 'talk_cont', 'talk_cont_x',
       'talk_cont_y', 'talk_slice', 'ted_avlb_lang', 'ted_duration',
       'ted_event', 'ted_nof_comments', 'ted_published_dt', 'ted_ratings',
       'ted_tags', 'ted_talks_related', 'ted_title', 'ted_views', 'tokens',
       'tokens_u', 'tokens_x', 'tokens_y', 'uploader_id', 'webpage_url',
       'y_sp_co', 'y_talk_co', 'youtube_avg_ratings', 'youtube_categories',
       'youtube_dislikes', 'youtube_duration', 'youtube_likes', 'youtube_tags',
       'youtube_title', 'youtube_views'],
      dtype='object')

In [60]:
final.drop(['Unnamed: 0', 'author1', 'author2', 'author3', 'description','melt1',
       'melt2','sp_tokens','sp_tokens_x', 'sp_tokens_y','split', 't_sp_co',
       't_talk_co', 'talk1', 'talk2','tokens',
       'tokens_u', 'tokens_x', 'tokens_y','y_sp_co', 'y_talk_co'],axis =1 , inplace = True)

In [61]:
final

Unnamed: 0,film_dt,fulltitle,fulltitle_compare,id,main_speaker,name,name_slice,num_speaker,speaker_occupation,talk3,...,uploader_id,webpage_url,youtube_avg_ratings,youtube_categories,youtube_dislikes,youtube_duration,youtube_likes,youtube_tags,youtube_title,youtube_views
0,2/25/2006,Averting the climate crisis | Al Gore,"['averting', 'climate', 'crisis', 'al', 'gore']",rDiGYuQicpA,al gore,Al Gore: Averting the climate crisis,algoreavertingtheclimatec,1,Climate advocate,al gore averting the climate crisis,...,TEDtalksDirector,https://www.youtube.com/watch?v=rDiGYuQicpA,4.046632,['Nonprofits & Activism'],184.0,1017,588.0,"['Al', 'Gore', 'TED', 'TEDTalks', 'Talks', 'cl...",Averting the climate crisis | Al Gore,162022
1,2/24/2006,David Pogue: Simplicity sells,"['david', 'pogue', 'simplicity', 'sell']",NEjZt0y6OOw,david pogue,David Pogue: Simplicity sells,davidpoguesimplicitysells,1,Technology columnist,david pogue simplicity sells,...,TEDtalksDirector,https://www.youtube.com/watch?v=NEjZt0y6OOw,4.690756,['Science & Technology'],46.0,1326,549.0,"['David', 'Pogue', 'TED', 'TEDTalks', 'Talks',...",David Pogue: Simplicity sells,79874
2,2/26/2006,Greening the ghetto | Majora Carter,"['greening', 'ghetto', 'majora', 'carter']",gQ-cZRmHfs4,majora carter,Majora Carter: Greening the ghetto,majoracartergreeningthegh,1,Activist for environmental justice,majora carter greening the ghetto,...,TEDtalksDirector,https://www.youtube.com/watch?v=gQ-cZRmHfs4,4.860330,['Nonprofits & Activism'],36.0,1156,995.0,"['Majora', 'Carter', 'TED', 'TEDTalks', 'TED20...",Greening the ghetto | Majora Carter,130532
3,2/2/2006,Why we do what we do | Tony Robbins,"['tony', 'robbins']",Cpc-t-Uwv1I,tony robbins,Tony Robbins: Why we do what we do,tonyrobbinswhywedowhatwed,1,Life coach; expert in leadership psychology,tony robbins why we do what we do,...,TEDtalksDirector,https://www.youtube.com/watch?v=Cpc-t-Uwv1I,4.810614,['Science & Technology'],3003.0,1351,60423.0,"['Tony Robbins', 'TED', 'TEDTalks', 'emotion',...",Why we do what we do | Tony Robbins,11424257
4,2/24/2006,Letting go of God | Julia Sweeney,"['letting', 'go', 'god', 'julia', 'sweeney']",OtIyx687ytk,julia sweeney,Julia Sweeney: Letting go of God,juliasweeneylettinggoofgo,1,"Actor, comedian, playwright",julia sweeney letting go of god,...,TEDtalksDirector,https://www.youtube.com/watch?v=OtIyx687ytk,4.639640,['Comedy'],430.0,1038,4343.0,"['TED', 'TEDTalks', 'Talks', 'atheist', 'athei...",Letting go of God | Julia Sweeney,448541
5,2/25/2006,A life of purpose | Rick Warren,"['life', 'purpose', 'rick', 'warren']",640BQNxB5mc,rick warren,Rick Warren: A life of purpose,rickwarrenalifeofpurpose,1,"Pastor, author",rick warren a life of purpose,...,TEDtalksDirector,https://www.youtube.com/watch?v=640BQNxB5mc,4.175432,['Education'],1168.0,1308,4498.0,"['Rick', 'Warren', 'ted', 'tedtalks', 'Christi...",A life of purpose | Rick Warren,605052
6,2/23/2006,"Nicholas Negroponte: One Laptop per Child, two...","['nicholas', 'negroponte', 'one', 'laptop', 'p...",y_TKjfgjiQs,nicholas negroponte,Nicholas Negroponte: One Laptop per Child,nicholasnegroponteonelapt,1,Tech visionary,nicholas negroponte one laptop per child two ...,...,TEDtalksDirector,https://www.youtube.com/watch?v=y_TKjfgjiQs,4.777778,['Science & Technology'],12.0,993,204.0,"['Nicholas', 'Negroponte', 'ted', 'tedtalks', ...","Nicholas Negroponte: One Laptop per Child, two...",40460
7,12/12/2007,"Nicholas Negroponte: One Laptop per Child, two...","['nicholas', 'negroponte', 'one', 'laptop', 'p...",y_TKjfgjiQs,nicholas negroponte,"Nicholas Negroponte: One Laptop per Child, two...",nicholasnegroponteonelapt,1,Tech visionary,nicholas negroponte one laptop per child two ...,...,TEDtalksDirector,https://www.youtube.com/watch?v=y_TKjfgjiQs,4.777778,['Science & Technology'],12.0,993,204.0,"['Nicholas', 'Negroponte', 'ted', 'tedtalks', ...","Nicholas Negroponte: One Laptop per Child, two...",40460
8,2/23/2006,Sirena Huang: An 11-year-old's magical violin,"['sirena', 'huang', 'year', 'old', 'magical', ...",#NAME?,sirena huang,Sirena Huang: An 11-year-old's magical violin,sirenahuangan11yearoldsma,1,Violinist,sirena huang an 11yearold's magical violin,...,TEDtalksDirector,https://www.youtube.com/watch?v=-yOXsK5-SFY,4.850458,['Music'],147.0,1527,3785.0,"['Sirena', 'Huang', 'TED', 'TEDTalks', 'Talks'...",Sirena Huang: An 11-year-old's magical violin,514182
9,2/26/2004,"Jennifer Lin: Improvising on piano, aged 14","['jennifer', 'lin', 'improvising', 'piano', 'a...",UU0MX8epDro,jennifer lin,"Jennifer Lin: Improvising on piano, aged 14",jenniferlinimprovisingonp,1,"Pianist, composer",jennifer lin improvising on piano aged 14,...,TEDtalksDirector,https://www.youtube.com/watch?v=UU0MX8epDro,4.834970,['Music'],21.0,1491,488.0,"['Jennifer', 'Lin', 'TED', 'TEDTalks', 'Talks'...","Jennifer Lin: Improvising on piano, aged 14",73891


In [62]:
final.columns

Index(['film_dt', 'fulltitle', 'fulltitle_compare', 'id', 'main_speaker',
       'name', 'name_slice', 'num_speaker', 'speaker_occupation', 'talk3',
       'talk_cont', 'talk_cont_x', 'talk_cont_y', 'talk_slice',
       'ted_avlb_lang', 'ted_duration', 'ted_event', 'ted_nof_comments',
       'ted_published_dt', 'ted_ratings', 'ted_tags', 'ted_talks_related',
       'ted_title', 'ted_views', 'uploader_id', 'webpage_url',
       'youtube_avg_ratings', 'youtube_categories', 'youtube_dislikes',
       'youtube_duration', 'youtube_likes', 'youtube_tags', 'youtube_title',
       'youtube_views'],
      dtype='object')

In [63]:
final.drop(['name_slice','webpage_url'],axis=1,inplace=True)

In [64]:
final

Unnamed: 0,film_dt,fulltitle,fulltitle_compare,id,main_speaker,name,num_speaker,speaker_occupation,talk3,talk_cont,...,ted_views,uploader_id,youtube_avg_ratings,youtube_categories,youtube_dislikes,youtube_duration,youtube_likes,youtube_tags,youtube_title,youtube_views
0,2/25/2006,Averting the climate crisis | Al Gore,"['averting', 'climate', 'crisis', 'al', 'gore']",rDiGYuQicpA,al gore,Al Gore: Averting the climate crisis,1,Climate advocate,al gore averting the climate crisis,,...,3200520,TEDtalksDirector,4.046632,['Nonprofits & Activism'],184.0,1017,588.0,"['Al', 'Gore', 'TED', 'TEDTalks', 'Talks', 'cl...",Averting the climate crisis | Al Gore,162022
1,2/24/2006,David Pogue: Simplicity sells,"['david', 'pogue', 'simplicity', 'sell']",NEjZt0y6OOw,david pogue,David Pogue: Simplicity sells,1,Technology columnist,david pogue simplicity sells,,...,1636292,TEDtalksDirector,4.690756,['Science & Technology'],46.0,1326,549.0,"['David', 'Pogue', 'TED', 'TEDTalks', 'Talks',...",David Pogue: Simplicity sells,79874
2,2/26/2006,Greening the ghetto | Majora Carter,"['greening', 'ghetto', 'majora', 'carter']",gQ-cZRmHfs4,majora carter,Majora Carter: Greening the ghetto,1,Activist for environmental justice,majora carter greening the ghetto,,...,1697550,TEDtalksDirector,4.860330,['Nonprofits & Activism'],36.0,1156,995.0,"['Majora', 'Carter', 'TED', 'TEDTalks', 'TED20...",Greening the ghetto | Majora Carter,130532
3,2/2/2006,Why we do what we do | Tony Robbins,"['tony', 'robbins']",Cpc-t-Uwv1I,tony robbins,Tony Robbins: Why we do what we do,1,Life coach; expert in leadership psychology,tony robbins why we do what we do,,...,20685401,TEDtalksDirector,4.810614,['Science & Technology'],3003.0,1351,60423.0,"['Tony Robbins', 'TED', 'TEDTalks', 'emotion',...",Why we do what we do | Tony Robbins,11424257
4,2/24/2006,Letting go of God | Julia Sweeney,"['letting', 'go', 'god', 'julia', 'sweeney']",OtIyx687ytk,julia sweeney,Julia Sweeney: Letting go of God,1,"Actor, comedian, playwright",julia sweeney letting go of god,,...,3769987,TEDtalksDirector,4.639640,['Comedy'],430.0,1038,4343.0,"['TED', 'TEDTalks', 'Talks', 'atheist', 'athei...",Letting go of God | Julia Sweeney,448541
5,2/25/2006,A life of purpose | Rick Warren,"['life', 'purpose', 'rick', 'warren']",640BQNxB5mc,rick warren,Rick Warren: A life of purpose,1,"Pastor, author",rick warren a life of purpose,,...,3095993,TEDtalksDirector,4.175432,['Education'],1168.0,1308,4498.0,"['Rick', 'Warren', 'ted', 'tedtalks', 'Christi...",A life of purpose | Rick Warren,605052
6,2/23/2006,"Nicholas Negroponte: One Laptop per Child, two...","['nicholas', 'negroponte', 'one', 'laptop', 'p...",y_TKjfgjiQs,nicholas negroponte,Nicholas Negroponte: One Laptop per Child,1,Tech visionary,nicholas negroponte one laptop per child two ...,,...,358304,TEDtalksDirector,4.777778,['Science & Technology'],12.0,993,204.0,"['Nicholas', 'Negroponte', 'ted', 'tedtalks', ...","Nicholas Negroponte: One Laptop per Child, two...",40460
7,12/12/2007,"Nicholas Negroponte: One Laptop per Child, two...","['nicholas', 'negroponte', 'one', 'laptop', 'p...",y_TKjfgjiQs,nicholas negroponte,"Nicholas Negroponte: One Laptop per Child, two...",1,Tech visionary,nicholas negroponte one laptop per child two ...,,...,398643,TEDtalksDirector,4.777778,['Science & Technology'],12.0,993,204.0,"['Nicholas', 'Negroponte', 'ted', 'tedtalks', ...","Nicholas Negroponte: One Laptop per Child, two...",40460
8,2/23/2006,Sirena Huang: An 11-year-old's magical violin,"['sirena', 'huang', 'year', 'old', 'magical', ...",#NAME?,sirena huang,Sirena Huang: An 11-year-old's magical violin,1,Violinist,sirena huang an 11yearold's magical violin,,...,2702470,TEDtalksDirector,4.850458,['Music'],147.0,1527,3785.0,"['Sirena', 'Huang', 'TED', 'TEDTalks', 'Talks'...",Sirena Huang: An 11-year-old's magical violin,514182
9,2/26/2004,"Jennifer Lin: Improvising on piano, aged 14","['jennifer', 'lin', 'improvising', 'piano', 'a...",UU0MX8epDro,jennifer lin,"Jennifer Lin: Improvising on piano, aged 14",1,"Pianist, composer",jennifer lin improvising on piano aged 14,,...,1628912,TEDtalksDirector,4.834970,['Music'],21.0,1491,488.0,"['Jennifer', 'Lin', 'TED', 'TEDTalks', 'Talks'...","Jennifer Lin: Improvising on piano, aged 14",73891


In [65]:
final.drop(['talk_cont', 'talk_cont_x', 'talk_cont_y'],axis=1,inplace=True)

In [66]:
final

Unnamed: 0,film_dt,fulltitle,fulltitle_compare,id,main_speaker,name,num_speaker,speaker_occupation,talk3,talk_slice,...,ted_views,uploader_id,youtube_avg_ratings,youtube_categories,youtube_dislikes,youtube_duration,youtube_likes,youtube_tags,youtube_title,youtube_views
0,2/25/2006,Averting the climate crisis | Al Gore,"['averting', 'climate', 'crisis', 'al', 'gore']",rDiGYuQicpA,al gore,Al Gore: Averting the climate crisis,1,Climate advocate,al gore averting the climate crisis,algoreavertingtheclimatec,...,3200520,TEDtalksDirector,4.046632,['Nonprofits & Activism'],184.0,1017,588.0,"['Al', 'Gore', 'TED', 'TEDTalks', 'Talks', 'cl...",Averting the climate crisis | Al Gore,162022
1,2/24/2006,David Pogue: Simplicity sells,"['david', 'pogue', 'simplicity', 'sell']",NEjZt0y6OOw,david pogue,David Pogue: Simplicity sells,1,Technology columnist,david pogue simplicity sells,davidpoguesimplicitysells,...,1636292,TEDtalksDirector,4.690756,['Science & Technology'],46.0,1326,549.0,"['David', 'Pogue', 'TED', 'TEDTalks', 'Talks',...",David Pogue: Simplicity sells,79874
2,2/26/2006,Greening the ghetto | Majora Carter,"['greening', 'ghetto', 'majora', 'carter']",gQ-cZRmHfs4,majora carter,Majora Carter: Greening the ghetto,1,Activist for environmental justice,majora carter greening the ghetto,majoracartergreeningthegh,...,1697550,TEDtalksDirector,4.860330,['Nonprofits & Activism'],36.0,1156,995.0,"['Majora', 'Carter', 'TED', 'TEDTalks', 'TED20...",Greening the ghetto | Majora Carter,130532
3,2/2/2006,Why we do what we do | Tony Robbins,"['tony', 'robbins']",Cpc-t-Uwv1I,tony robbins,Tony Robbins: Why we do what we do,1,Life coach; expert in leadership psychology,tony robbins why we do what we do,tonyrobbinswhywedowhatwed,...,20685401,TEDtalksDirector,4.810614,['Science & Technology'],3003.0,1351,60423.0,"['Tony Robbins', 'TED', 'TEDTalks', 'emotion',...",Why we do what we do | Tony Robbins,11424257
4,2/24/2006,Letting go of God | Julia Sweeney,"['letting', 'go', 'god', 'julia', 'sweeney']",OtIyx687ytk,julia sweeney,Julia Sweeney: Letting go of God,1,"Actor, comedian, playwright",julia sweeney letting go of god,juliasweeneylettinggoofgo,...,3769987,TEDtalksDirector,4.639640,['Comedy'],430.0,1038,4343.0,"['TED', 'TEDTalks', 'Talks', 'atheist', 'athei...",Letting go of God | Julia Sweeney,448541
5,2/25/2006,A life of purpose | Rick Warren,"['life', 'purpose', 'rick', 'warren']",640BQNxB5mc,rick warren,Rick Warren: A life of purpose,1,"Pastor, author",rick warren a life of purpose,rickwarrenalifeofpurpose,...,3095993,TEDtalksDirector,4.175432,['Education'],1168.0,1308,4498.0,"['Rick', 'Warren', 'ted', 'tedtalks', 'Christi...",A life of purpose | Rick Warren,605052
6,2/23/2006,"Nicholas Negroponte: One Laptop per Child, two...","['nicholas', 'negroponte', 'one', 'laptop', 'p...",y_TKjfgjiQs,nicholas negroponte,Nicholas Negroponte: One Laptop per Child,1,Tech visionary,nicholas negroponte one laptop per child two ...,nicholasnegroponteonelapt,...,358304,TEDtalksDirector,4.777778,['Science & Technology'],12.0,993,204.0,"['Nicholas', 'Negroponte', 'ted', 'tedtalks', ...","Nicholas Negroponte: One Laptop per Child, two...",40460
7,12/12/2007,"Nicholas Negroponte: One Laptop per Child, two...","['nicholas', 'negroponte', 'one', 'laptop', 'p...",y_TKjfgjiQs,nicholas negroponte,"Nicholas Negroponte: One Laptop per Child, two...",1,Tech visionary,nicholas negroponte one laptop per child two ...,nicholasnegroponteonelapt,...,398643,TEDtalksDirector,4.777778,['Science & Technology'],12.0,993,204.0,"['Nicholas', 'Negroponte', 'ted', 'tedtalks', ...","Nicholas Negroponte: One Laptop per Child, two...",40460
8,2/23/2006,Sirena Huang: An 11-year-old's magical violin,"['sirena', 'huang', 'year', 'old', 'magical', ...",#NAME?,sirena huang,Sirena Huang: An 11-year-old's magical violin,1,Violinist,sirena huang an 11yearold's magical violin,sirenahuangan11yearoldsma,...,2702470,TEDtalksDirector,4.850458,['Music'],147.0,1527,3785.0,"['Sirena', 'Huang', 'TED', 'TEDTalks', 'Talks'...",Sirena Huang: An 11-year-old's magical violin,514182
9,2/26/2004,"Jennifer Lin: Improvising on piano, aged 14","['jennifer', 'lin', 'improvising', 'piano', 'a...",UU0MX8epDro,jennifer lin,"Jennifer Lin: Improvising on piano, aged 14",1,"Pianist, composer",jennifer lin improvising on piano aged 14,jenniferlinimprovisingonp,...,1628912,TEDtalksDirector,4.834970,['Music'],21.0,1491,488.0,"['Jennifer', 'Lin', 'TED', 'TEDTalks', 'Talks'...","Jennifer Lin: Improvising on piano, aged 14",73891


In [67]:
final.drop(['talk_slice','uploader_id','talk3'],inplace = True,axis=1)

In [68]:
final

Unnamed: 0,film_dt,fulltitle,fulltitle_compare,id,main_speaker,name,num_speaker,speaker_occupation,ted_avlb_lang,ted_duration,...,ted_title,ted_views,youtube_avg_ratings,youtube_categories,youtube_dislikes,youtube_duration,youtube_likes,youtube_tags,youtube_title,youtube_views
0,2/25/2006,Averting the climate crisis | Al Gore,"['averting', 'climate', 'crisis', 'al', 'gore']",rDiGYuQicpA,al gore,Al Gore: Averting the climate crisis,1,Climate advocate,43,977,...,averting the climate crisis,3200520,4.046632,['Nonprofits & Activism'],184.0,1017,588.0,"['Al', 'Gore', 'TED', 'TEDTalks', 'Talks', 'cl...",Averting the climate crisis | Al Gore,162022
1,2/24/2006,David Pogue: Simplicity sells,"['david', 'pogue', 'simplicity', 'sell']",NEjZt0y6OOw,david pogue,David Pogue: Simplicity sells,1,Technology columnist,26,1286,...,simplicity sells,1636292,4.690756,['Science & Technology'],46.0,1326,549.0,"['David', 'Pogue', 'TED', 'TEDTalks', 'Talks',...",David Pogue: Simplicity sells,79874
2,2/26/2006,Greening the ghetto | Majora Carter,"['greening', 'ghetto', 'majora', 'carter']",gQ-cZRmHfs4,majora carter,Majora Carter: Greening the ghetto,1,Activist for environmental justice,35,1116,...,greening the ghetto,1697550,4.860330,['Nonprofits & Activism'],36.0,1156,995.0,"['Majora', 'Carter', 'TED', 'TEDTalks', 'TED20...",Greening the ghetto | Majora Carter,130532
3,2/2/2006,Why we do what we do | Tony Robbins,"['tony', 'robbins']",Cpc-t-Uwv1I,tony robbins,Tony Robbins: Why we do what we do,1,Life coach; expert in leadership psychology,36,1305,...,why we do what we do,20685401,4.810614,['Science & Technology'],3003.0,1351,60423.0,"['Tony Robbins', 'TED', 'TEDTalks', 'emotion',...",Why we do what we do | Tony Robbins,11424257
4,2/24/2006,Letting go of God | Julia Sweeney,"['letting', 'go', 'god', 'julia', 'sweeney']",OtIyx687ytk,julia sweeney,Julia Sweeney: Letting go of God,1,"Actor, comedian, playwright",31,992,...,letting go of god,3769987,4.639640,['Comedy'],430.0,1038,4343.0,"['TED', 'TEDTalks', 'Talks', 'atheist', 'athei...",Letting go of God | Julia Sweeney,448541
5,2/25/2006,A life of purpose | Rick Warren,"['life', 'purpose', 'rick', 'warren']",640BQNxB5mc,rick warren,Rick Warren: A life of purpose,1,"Pastor, author",31,1262,...,a life of purpose,3095993,4.175432,['Education'],1168.0,1308,4498.0,"['Rick', 'Warren', 'ted', 'tedtalks', 'Christi...",A life of purpose | Rick Warren,605052
6,2/23/2006,"Nicholas Negroponte: One Laptop per Child, two...","['nicholas', 'negroponte', 'one', 'laptop', 'p...",y_TKjfgjiQs,nicholas negroponte,Nicholas Negroponte: One Laptop per Child,1,Tech visionary,25,1057,...,one laptop per child,358304,4.777778,['Science & Technology'],12.0,993,204.0,"['Nicholas', 'Negroponte', 'ted', 'tedtalks', ...","Nicholas Negroponte: One Laptop per Child, two...",40460
7,12/12/2007,"Nicholas Negroponte: One Laptop per Child, two...","['nicholas', 'negroponte', 'one', 'laptop', 'p...",y_TKjfgjiQs,nicholas negroponte,"Nicholas Negroponte: One Laptop per Child, two...",1,Tech visionary,18,1000,...,one laptop per child two years on,398643,4.777778,['Science & Technology'],12.0,993,204.0,"['Nicholas', 'Negroponte', 'ted', 'tedtalks', ...","Nicholas Negroponte: One Laptop per Child, two...",40460
8,2/23/2006,Sirena Huang: An 11-year-old's magical violin,"['sirena', 'huang', 'year', 'old', 'magical', ...",#NAME?,sirena huang,Sirena Huang: An 11-year-old's magical violin,1,Violinist,31,1481,...,an 11yearold's magical violin,2702470,4.850458,['Music'],147.0,1527,3785.0,"['Sirena', 'Huang', 'TED', 'TEDTalks', 'Talks'...",Sirena Huang: An 11-year-old's magical violin,514182
9,2/26/2004,"Jennifer Lin: Improvising on piano, aged 14","['jennifer', 'lin', 'improvising', 'piano', 'a...",UU0MX8epDro,jennifer lin,"Jennifer Lin: Improvising on piano, aged 14",1,"Pianist, composer",32,1445,...,improvising on piano aged 14,1628912,4.834970,['Music'],21.0,1491,488.0,"['Jennifer', 'Lin', 'TED', 'TEDTalks', 'Talks'...","Jennifer Lin: Improvising on piano, aged 14",73891


In [69]:
final.columns

Index(['film_dt', 'fulltitle', 'fulltitle_compare', 'id', 'main_speaker',
       'name', 'num_speaker', 'speaker_occupation', 'ted_avlb_lang',
       'ted_duration', 'ted_event', 'ted_nof_comments', 'ted_published_dt',
       'ted_ratings', 'ted_tags', 'ted_talks_related', 'ted_title',
       'ted_views', 'youtube_avg_ratings', 'youtube_categories',
       'youtube_dislikes', 'youtube_duration', 'youtube_likes', 'youtube_tags',
       'youtube_title', 'youtube_views'],
      dtype='object')

In [70]:
final.to_csv('data1.csv')