In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Accessing the Coursera Data

In [3]:
import urllib2
import pandas as pd
import json

In [4]:
coursesInfo = urllib2.urlopen(
    'https://api.coursera.org/api/catalog.v1/courses?fields=shortName,name,language&includes=universities,categories')
coursesData = json.load(coursesInfo)
coursesData = coursesData['elements'] # elements is the overarching dictionary

In [5]:
print coursesData[0] # Universities and categories are indexes from another list 

{u'shortName': u'perceptivehunting', u'links': {u'universities': [65], u'categories': [8, 10, 19, 20]}, u'id': 2163, u'language': u'en', u'name': u'The Land Ethic Reclaimed: Perceptive Hunting, Aldo Leopold, and Conservation'}


In [6]:
universitiesInfo = urllib2.urlopen(
    'https://api.coursera.org/api/catalog.v1/universities?fields=name,locationCountry')
universitiesData = json.load(universitiesInfo)
universitiesData = universitiesData['elements']

In [7]:
print universitiesData[65]

{u'shortName': u'duke', u'locationCountry': u'US', u'id': 7, u'links': {}, u'name': u'Duke University'}


In [8]:
categoriesInfo = urllib2.urlopen(
    'https://api.coursera.org/api/catalog.v1/categories?fields=shortName,name')
categoriesData = json.load(categoriesInfo)
categoriesData = categoriesData['elements']

In [9]:
print categoriesData[8], categoriesData[10], categoriesData[19], categoriesData[20]

{u'shortName': u'law', u'id': 21, u'links': {}, u'name': u'Law'} {u'shortName': u'business', u'id': 13, u'links': {}, u'name': u'Business & Management'} {u'shortName': u'teacherpd', u'id': 26, u'links': {}, u'name': u'Teacher Professional Development'} {u'shortName': u'physics', u'id': 23, u'links': {}, u'name': u'Physics'}


## Storing the Data in DataFrames

In [10]:
courses_df = pd.DataFrame()

In [11]:
courses_df['name'] = map(lambda course: course['name'], coursesData)
courses_df['shortName'] = map(lambda course: course['shortName'], coursesData)
courses_df['language'] = map(lambda course: course['language'], coursesData)
courses_df['universitiesIds'] = map(lambda course: course['links']['universities'] if 
                                 'universities' in course['links'] else [], coursesData)
courses_df['categoriesIds'] =  map(lambda course: course['links']['categories'] if 
                                 'categories' in course['links'] else [], coursesData)

In [12]:
print courses_df.head()

                                                name           shortName  \
0  The Land Ethic Reclaimed: Perceptive Hunting, ...   perceptivehunting   
1   Contraception: Choices, Culture and Consequences       contraception   
2     Introduction to Computational Arts: Processing  compartsprocessing   
3            Introduction to Programming with MATLAB              matlab   
4                    Experimentation for Improvement         experiments   

  language universitiesIds    categoriesIds  
0       en            [65]  [8, 10, 19, 20]  
1       en            [10]           [3, 8]  
2       en           [117]   [1, 4, 18, 22]  
3       en            [37]         [12, 15]  
4       en           [148]   [4, 5, 15, 16]  


In [13]:
universities_df = pd.DataFrame()

In [14]:
universities_df['id'] = map(lambda universities: universities['id'], universitiesData)
universities_df['name'] = map(lambda universities: universities['name'], universitiesData)
universities_df['locationCountry'] = map(lambda universities: universities['locationCountry'], universitiesData)

universities_df = universities_df.set_index('id') # so we can retrieve info with university id from courses_df

In [15]:
print universities_df.head()

                                        name locationCountry
id                                                          
234                Xi'an Jiaotong University              CN
120                 University of New Mexico              US
10   University of California, San Francisco              US
56      University of California, Santa Cruz              US
24            Hebrew University of Jerusalem                


In [16]:
categories_df = pd.DataFrame()
categories_df['id'] = map(lambda category_data: category_data['id'], categoriesData)
categories_df['name'] = map(lambda category_data: category_data['name'], categoriesData)
categories_df['shortName'] = map(lambda category_data: category_data['shortName'], categoriesData)

categories_df = categories_df.set_index('id')

In [17]:
print categories_df.head()

                       name  shortName
id                                    
5               Mathematics       math
10  Biology & Life Sciences    biology
24                Chemistry  chemistry
25  Energy & Earth Sciences     energy
14                Education  education


In [18]:
# Now we need to add columns to courses_df with the proper universities and categories
def id2Name(df, ids, feature):
    names = []
    for id in ids:
        try:
            names.append(df.loc[id][feature])
        except: continue
    return names

In [19]:
courses_df['universitiesNames'] = map(lambda ids: id2Name(universities_df, ids, 'name'), courses_df.universitiesIds)
courses_df['categoriesNames'] = map(lambda ids: id2Name(categories_df, ids, 'name'), courses_df.categoriesIds)

In [20]:
print courses_df.head()

                                                name           shortName  \
0  The Land Ethic Reclaimed: Perceptive Hunting, ...   perceptivehunting   
1   Contraception: Choices, Culture and Consequences       contraception   
2     Introduction to Computational Arts: Processing  compartsprocessing   
3            Introduction to Programming with MATLAB              matlab   
4                    Experimentation for Improvement         experiments   

  language universitiesIds    categoriesIds  \
0       en            [65]  [8, 10, 19, 20]   
1       en            [10]           [3, 8]   
2       en           [117]   [1, 4, 18, 22]   
3       en            [37]         [12, 15]   
4       en           [148]   [4, 5, 15, 16]   

                           universitiesNames  \
0          [University of Wisconsin–Madison]   
1  [University of California, San Francisco]   
2         [The State University of New York]   
3                    [Vanderbilt University]   
4                   

## Loading Coursera Links for Courses

In [21]:
# We will use https://www.sharedcount.com/ API to determine which courses are popular via social media
# First we need to create links for all our courses

courses_df['courseLink'] = map(lambda shortName: 'https://www.coursera.org/course/' + shortName, courses_df.shortName)

In [22]:
print courses_df.head()

                                                name           shortName  \
0  The Land Ethic Reclaimed: Perceptive Hunting, ...   perceptivehunting   
1   Contraception: Choices, Culture and Consequences       contraception   
2     Introduction to Computational Arts: Processing  compartsprocessing   
3            Introduction to Programming with MATLAB              matlab   
4                    Experimentation for Improvement         experiments   

  language universitiesIds    categoriesIds  \
0       en            [65]  [8, 10, 19, 20]   
1       en            [10]           [3, 8]   
2       en           [117]   [1, 4, 18, 22]   
3       en            [37]         [12, 15]   
4       en           [148]   [4, 5, 15, 16]   

                           universitiesNames  \
0          [University of Wisconsin–Madison]   
1  [University of California, San Francisco]   
2         [The State University of New York]   
3                    [Vanderbilt University]   
4                   

## Receving the SharedCount API

In [30]:
def getSharedCountData(url, API):
    socialMetrics = urllib2.urlopen('https://free.sharedcount.com/?url=' + url + '&apikey=' + API)
    return json.load(socialMetrics)

In [31]:
API = 'f7d83e560625f52290d127229a5733d620c1aaa7'
courses_df['SharedCountData'] = map(lambda url: getSharedCountData(url, API), courses_df.courseLink)

In [34]:
courses_df['facebookLikes'] = map(lambda socialMedia: socialMedia['Facebook']['like_count'], courses_df.SharedCountData)
courses_df['twitterTweets'] = map(lambda socialMedia: socialMedia['Twitter'], courses_df.SharedCountData)

In [36]:
print courses_df.head()

                                                name           shortName  \
0  The Land Ethic Reclaimed: Perceptive Hunting, ...   perceptivehunting   
1   Contraception: Choices, Culture and Consequences       contraception   
2     Introduction to Computational Arts: Processing  compartsprocessing   
3            Introduction to Programming with MATLAB              matlab   
4                    Experimentation for Improvement         experiments   

  language universitiesIds    categoriesIds  \
0       en            [65]  [8, 10, 19, 20]   
1       en            [10]           [3, 8]   
2       en           [117]   [1, 4, 18, 22]   
3       en            [37]         [12, 15]   
4       en           [148]   [4, 5, 15, 16]   

                           universitiesNames  \
0          [University of Wisconsin–Madison]   
1  [University of California, San Francisco]   
2         [The State University of New York]   
3                    [Vanderbilt University]   
4                   

In [45]:
impCols = ['name', 'universitiesNames', 'categoriesNames', 'facebookLikes', 'twitterTweets']
# Top Courses based on Facebook Likes
query = courses_df[courses_df.language == "en"] # we only want english classes
query = query.sort('facebookLikes', ascending=0).head()
query[impCols]

Unnamed: 0,name,universitiesNames,categoriesNames,facebookLikes,twitterTweets
643,Inquiry Science Learning: Perspectives and Pra...,[Rice University],"[Education, Teacher Professional Development]",297513,10
341,Using The Next Generation Science Standards fo...,[Rice University],"[Education, Teacher Professional Development]",297513,21
528,Diabetes - a Global Challenge,[University of Copenhagen],"[Medicine, Health & Society, Biology & Life Sc...",39104,199
315,Process Mining: Data science in Action,[Eindhoven University of Technology],"[Information, Tech & Design, Business & Manage...",15106,539
107,Equine Nutrition,[The University of Edinburgh],[Medicine],10638,544


In [61]:
# Top Statistics and Data Analysis Courses based on Facebook Likes
query = courses_df[courses_df.language == 'en']
query = query[map(lambda categories: 'Statistics and Data Analysis' in categories, query.categoriesNames)]
query = query.sort('facebookLikes', ascending=0).head()
query[impCols]

Unnamed: 0,name,universitiesNames,categoriesNames,facebookLikes,twitterTweets
315,Process Mining: Data science in Action,[Eindhoven University of Technology],"[Information, Tech & Design, Business & Manage...",15106,539
467,"Creativity, Innovation, and Change | 创意，创新, 与 变革",[The Pennsylvania State University],"[Computer Science: Theory, Economics & Finance...",5518,1229
753,Startup Engineering,[Stanford University],"[Computer Science: Software Engineering, Busin...",4019,2584
717,Maps and the Geospatial Revolution,[The Pennsylvania State University],"[Information, Tech & Design, Statistics and Da...",3926,1411
841,R Programming,[Johns Hopkins University],"[Information, Tech & Design, Statistics and Da...",3903,1008
