## Get list of PyCon 2014 videos

In [1]:
import feedparser
pycon_feed_url = "http://pyvideo.org/category/50/pycon-us-2014/rss"
pycon_feed = feedparser.parse(pycon_feed_url)

In [2]:
import numpy as np
import pandas as pd

videos = []
links = pycon_feed.entries[0]['links']

for entry in pycon_feed.entries:
    links = entry['links']
    try:
        video = {
            'title': entry['title'],
            'author': entry['author'],
            'link': entry['link'],
            'summary': entry['summary'],
            'youtube_url': ([link for link in links if link.type == 'video/flv'])[0].href
        }

        videos.append(video)
    except (IndexError, KeyError):
        print("No video or key error for: %s" % video['title'])
            
videos = pd.DataFrame(videos)
print("Got %s videos from RSS" % len(videos))

No video or key error for: Cheap Helicopters In My Living Room
No video or key error for: Know Thy Neighbor: Scikit and the K-Nearest Neighbor Algorithm
No video or key error for: Know Thy Neighbor: Scikit and the K-Nearest Neighbor Algorithm
No video or key error for: Know Thy Neighbor: Scikit and the K-Nearest Neighbor Algorithm
No video or key error for: Pushing Python: Building a High Throughput, Low Latency System
No video or key error for: Let's Learn Twisted Python
No video or key error for: How to formulate a (science) problem and analyze it using Python code
Got 133 videos from RSS


In [3]:
videos.head(2)

Unnamed: 0,author,link,summary,title,youtube_url
0,Niko Skrypnik,http://www.pyvideo.org/video/2676/2d3d-graphic...,"<p>Abstract</p>\nGames, application with rich ...",2D/3D graphics with Python on mobile platforms,http://www.youtube.com/watch?v=Y6pLr11yf-4
1,Julien Phalip,http://www.pyvideo.org/video/2660/advanced-tec...,<p>Abstract</p>\nThis talk will present multip...,Advanced techniques for Web functional testing,http://www.youtube.com/watch?v=St9cL47_1GI


## Download Subtitles from YouTube

In [5]:
import youtube_dl

### Download subtitles and add youtube_id column

In [6]:
videos['youtube_id'] = None

ydl_opts = {
    #'listsubtitles': True,
    'outtmpl': '/work/subtitles/%(id)s',
    'writesubtitles': True,
    'writeautomaticsub': True,
    'skip_download': True,
    'no_warnings': True,
    'quiet': True
}

for video_ix in videos.index:
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        url = videos.ix[video_ix].youtube_url
        videos['youtube_id'][videos.index == video_ix] = ydl.extract_info(url)['id']


In [8]:
videos.head(2)

Unnamed: 0,author,link,summary,title,youtube_url,subtitles,youtube_id
0,Niko Skrypnik,http://www.pyvideo.org/video/2676/2d3d-graphic...,"<p>Abstract</p>\nGames, application with rich ...",2D/3D graphics with Python on mobile platforms,http://www.youtube.com/watch?v=Y6pLr11yf-4,,Y6pLr11yf-4
1,Julien Phalip,http://www.pyvideo.org/video/2660/advanced-tec...,<p>Abstract</p>\nThis talk will present multip...,Advanced techniques for Web functional testing,http://www.youtube.com/watch?v=St9cL47_1GI,,St9cL47_1GI


### Put subtitles in subtitles column

#### SRT Format
1. A number indicating which subtitle it is in the sequence.
2. The time that the subtitle should appear on the screen, and then disappear.
3. The subtitle itself.
4. A blank line indicating the start of a new subtitle.

<pre>
1
00:00:06,759 --> 00:00:09,559
thank you guys let's start first ball
good afternoon

2
00:00:09,559 --> 00:00:14,070
by from there's on my name is really
cause great nique
</pre>


In [9]:
videos['subtitles'] = None

for video_ix in videos.index:
    youtube_id = videos.ix[video_ix].youtube_id
    
    next_item = 'number'
    captions = []
    try:
        path = '/work/subtitles/%s.en.srt' % youtube_id
        with open(path, 'r') as f:
            for line in f.readlines():
                if next_item == 'number':
                    next_item = 'timecode'
                elif next_item == 'timecode':
                    next_item = 'content'
                elif next_item == 'content':
                    captions.append(line)
                    
                if line == '\n':
                    next_item = 'number'
                    
    except FileNotFoundError:
        # print('Cannot find file: %s' % path)
        pass

    
    videos['subtitles'][videos.index == video_ix] = ' '.join(captions)


In [10]:
videos.head(2)

Unnamed: 0,author,link,summary,title,youtube_url,subtitles,youtube_id
0,Niko Skrypnik,http://www.pyvideo.org/video/2676/2d3d-graphic...,"<p>Abstract</p>\nGames, application with rich ...",2D/3D graphics with Python on mobile platforms,http://www.youtube.com/watch?v=Y6pLr11yf-4,thank you guys let's start first ball\n good a...,Y6pLr11yf-4
1,Julien Phalip,http://www.pyvideo.org/video/2660/advanced-tec...,<p>Abstract</p>\nThis talk will present multip...,Advanced techniques for Web functional testing,http://www.youtube.com/watch?v=St9cL47_1GI,good afternoon everyone\n \n now we have my fr...,St9cL47_1GI


In [119]:
videos_with_subtitles = videos[(videos['subtitles'] != "")].copy()
print("Got %s videos left out of %s" % (len(videos_with_subtitles), len(videos)))

Got 79 videos left out of 133


## Data Science!

In [120]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

In [172]:
num_of_clusters = 10
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(videos_with_subtitles['subtitles'])
print("Number of features (unique words) extracted: %s" % len(vectorizer.get_feature_names()))

Number of features (unique words) extracted: 16120


In [183]:
km = KMeans(n_clusters=num_of_clusters, init='k-means++', n_init=1, verbose=1)
km.fit(X)

Initialization complete
Iteration  0, inertia 980803.000
Iteration  1, inertia 638831.155
Iteration  2, inertia 630379.434
Converged at iteration 2


KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=10, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=1)

In [184]:
videos_with_subtitles['cluster'] = km.labels_
for cluster in range(num_of_clusters):
    titles = videos_with_subtitles[videos_with_subtitles['cluster'] == cluster]['title'].values
    print("Cluster: %s " % cluster)
    for title in titles:
        print("\t%s" %title)
    
    


Cluster: 0 
	2D/3D graphics with Python on mobile platforms
	Advanced techniques for Web functional testing
	A Scenic Drive through the Django Request-Response Cycle
	Developing Flask Extensions
	Know Thy Neighbor: Scikit and the K-Nearest Neighbor Algorithm
	Multi-factor Authentication - Possession Factors
	PostgreSQL is Web Scale (Really :) )
	Pushing Python: Building a High Throughput, Low Latency System
	Quick Wins for Better Website Security
	REST is not enough: Using Push Notifications to better support your mobile clients
	So You Want to Build an API?
	Advanced methods for creating decorators
	All Your Ducks In A Row: Data Structures in the Standard Library and Beyond
	Blending art, technology, and light, Python for interactive and real time
	Castle Anthrax: Dungeon Generation Techniques
	Computer science fundamentals for self-taught programmers
	For Lack of a Better Name(server): DNS Explained.
	Hello Physical World: A Crash Course on the Internet of Things
	Kneel And Disconnec