In [1]:
import pandas as pd
import numpy as np
import data
import networkx as nx
from networkx.algorithms import community
import uuid
import json

ModuleNotFoundError: No module named 'data'

The input data are the relationships between viewers and content. In this case the program content has been limited to program series and excludes movies, live news, special events, sports events, etc. 

The data has also been limited to on-demand viewing as opposed to live television viewing. This has been done because we want to focus on active content consumption (as opposed to 'channel surfing'), since our larger goal is to model the viewer and content interactions that we might see on streaming services.

Start by loading our viewing data. This data has been engineered upstream and is a sample of a much larger dataset. Logically, it is a table of viewer-content engagement. Each row has one content item (i.e. program) and one viewer. Because program-to-viewer relationships are many-to-many, both programs and viewers will repeat within the dataset.

_*Note that this dataset is a small sample of the actual dataset used, so results here are not as representative of the actual viewer / content universe_

In [None]:
# Read the viewer-content engagement sample from the local input directory into a DataFrame.
viewing = pd.read_csv('./input/viewing.csv')

List all columns and show a subset of the data that illustrates what is in it.

In [None]:
# Display the column index of the viewing table.
viewing.columns

In [None]:
# Preview the first rows for a handful of viewer, program and engagement columns.
preview_columns = [
    'personkey', 'age', 'gender', 'householdincome',
    'contentsk', 'programname', 'engagement',
]
viewing.loc[:, preview_columns].head()

'personkey' and 'contentsk' are keys for viewers and programs respectively. 'engagement' is a refined feature that serves as a relative measure of how interested the viewer is in the program, based on what proportion of the content the viewer consumes. For example, if the viewer watches every episode of Game of Thrones and watches 100% of the duration of those episodes, the engagement would be 100.

We're going to load the data into a network graph, so we need to do some data preparation

In [None]:
# nodes will be programs and viewers: viewer keys get a 'V' prefix and program keys a 'P' prefix
# so that the two kinds of node cannot collide inside a single graph.
# The prefix is only added when it is missing, which makes this cell safe to re-run; the
# unconditional concatenation used before produced 'VV...' / 'PP...' keys on a second run.
# NOTE(review): assumes the raw keys never start with 'V' / 'P' themselves - confirm against the upstream data.
for key_column, prefix in (('personkey', 'V'), ('contentsk', 'P')):
    raw_keys = viewing[key_column].astype(str)
    viewing[key_column] = raw_keys.where(raw_keys.str.startswith(prefix), prefix + raw_keys)

In [None]:
# one row per distinct (key, name, total engagement) combination found in the viewing table
program_columns = ['contentsk', 'programname', 'program_total_engagement']
programs = viewing.loc[:, program_columns].drop_duplicates()

In [None]:
# build the program node attributes: one dict per program row, in row order.
# these attributes will be saved within the graph
attribs = []
for row_pos in range(len(programs)):
    program_row = programs.iloc[row_pos]
    attribs.append({
        'type': 'program',
        'programname': program_row['programname'],
        'program_total_engagement': program_row['program_total_engagement'],
    })

In [None]:
# pair every program key with its attribute dict; (node_id, attributes) is the tuple shape
# that is handed to the graph when nodes are added in bulk
df_nodes = pd.DataFrame({'contentsk': programs['contentsk'], 'attribs': attribs})
node_tuples = list(df_nodes[['contentsk', 'attribs']].itertuples(index=False, name=None))

In [None]:
# Start from an empty undirected graph and register each program node with its attributes
G = nx.Graph()
for program_node, program_node_attribs in node_tuples:
    G.add_node(program_node, **program_node_attribs)

In [None]:
# now let's prepare the viewer nodes: the viewer key plus the demographic columns carried with it
viewer_cols = ['personkey','age', 'gender',
       'race', 'person_education', 'person_education_level', 'countysize',
       'county_size_level', 'householdincome', 'languageofhousehold',
       'headofhoushold_education_level', 'householdsize', 'numberofchildren',
       'numberofadults', 'numberofincomes', 'hascat', 'hasdog']

# viewers: one row per distinct combination of the columns above
viewers = viewing[viewer_cols].drop_duplicates()
# Replace missing education values with an empty string. This column was previously filled
# from `race`, which overwrote every viewer's education with their race.
# NOTE(review): if the intent was instead to blank out missing `race` values, fill that column too.
viewers['person_education'] = viewers['person_education'].fillna('')

In [None]:
# build the viewer node attributes: one dict per viewer row, in row order
viewer_attrib_columns = [
    'age',
    'householdincome',
    'numberofchildren',
    'race',
    'countysize',
    'headofhoushold_education_level',
    'person_education_level',
]
attribs = []
for row_pos in range(len(viewers)):
    viewer_row = viewers.iloc[row_pos]
    viewer_attribs = {'type': 'viewer'}
    for column in viewer_attrib_columns:
        viewer_attribs[column] = viewer_row[column]
    attribs.append(viewer_attribs)

In [None]:
# pair every viewer key with its attribute dict as (node_id, attributes) tuples
df_nodes = pd.DataFrame({'personkey': viewers['personkey'], 'attribs': attribs})
node_tuples = list(df_nodes[['personkey', 'attribs']].itertuples(index=False, name=None))

In [None]:
# register each viewer node, with its attributes, in the existing graph
for viewer_node, viewer_node_attribs in node_tuples:
    G.add_node(viewer_node, **viewer_node_attribs)

In [None]:
# the edges connect viewer nodes to program nodes;
# the engagement value is stored as an attribute of each edge
links = viewing[['personkey', 'contentsk', 'engagement']].drop_duplicates()

# like nodes, edges are added in bulk as tuples shaped like
# (node1, node2, {'edge_attribute1': some_value, 'edge_attribute2': some_value})
attribs = [{'engagement': links.iloc[row_pos]['engagement']} for row_pos in range(len(links))]

df_links = pd.DataFrame({
    'personkey': links['personkey'],
    'contentsk': links['contentsk'],
    'attribs': attribs,
})
link_tuples = list(df_links[['personkey', 'contentsk', 'attribs']].itertuples(index=False, name=None))

# add the edges to the graph
G.add_edges_from(link_tuples)

Look at the graph summary statistics

In [None]:
print(nx.info(G))

Now that we have built the graph, we want to extract 'communities' from it to generate a rich taxonomy of viewer/content groups.

There are various community generation algorithms (see references). Based on experimentation, we will use the Clauset-Newman-Moore greedy modularity maximization. This algorithm works well because within on-demand viewing we find that the networks have a relatively high degree of modularity, where there are sparse connections between nodes in different groups. This reflects the fact that, generally, viewers will like certain groups of programming and not like other types of programming at all.

https://networkx.github.io/documentation/stable/reference/algorithms/community.html

https://en.wikipedia.org/wiki/Community_structure

Our overall approach will be the following:

1. Extract the initial set of communities.  This will normally result in a small number of large communities and several smaller ones.

2. Recursively treat each community as an isolated graph and continue to generate sub-communities within them.  

This will generate a taxonomy of content and viewers similar to biological taxonomies of species, music genres, etc.

First we have to define some functions that allow for recursion and setting attributes on each generated community 'group'.

In [None]:
# this function pulls the info out of the community and creates a community group object with some statistics about it
def build_community_group(group_list):
    """Summarise one community of graph nodes as a dict of statistics.

    Reads the module-level ``viewing`` DataFrame.

    Parameters
    ----------
    group_list : iterable of str
        Node ids of one community. Ids starting with 'P' are treated as program
        keys (``contentsk``) and ids starting with 'V' as viewer keys (``personkey``).

    Returns
    -------
    dict
        Keys: 'top_programs', 'program_count', 'respondents', 'viewers_thousands',
        'avg_engagement', 'median_engagement', 'median_age', 'median_hh_income',
        'pct_children', 'pct_female', 'pct_male', 'all_programs', 'contentsks',
        'personkeys'. Program lists are ordered by summed engagement, highest first.
    """
    contentsks = [node for node in group_list if node.startswith('P')]
    personkeys = [node for node in group_list if node.startswith('V')]

    # viewing rows where both the program and the viewer belong to this community
    group_viewing = viewing[viewing['contentsk'].isin(contentsks) & viewing['personkey'].isin(personkeys)]

    n_contentsks = len(group_viewing['contentsk'].drop_duplicates())
    n_personkeys = len(group_viewing['personkey'].drop_duplicates())

    # every viewing row of the community's viewers, regardless of program
    group_viewers = viewing[viewing['personkey'].isin(personkeys)]

    # DataFrame.count() gives per-column non-null counts; the first column's count is used
    # as the row total. `.iloc[0]` replaces `[0]`, whose positional fallback on a
    # label-indexed Series is deprecated in pandas.
    viewer_row_count = group_viewers.count().iloc[0]

    def _row_share(mask):
        # share of the viewers' viewing rows matching `mask`, rounded to 2 decimals
        return float(round(group_viewers[mask].count().iloc[0] / viewer_row_count, 2))

    pct_female = _row_share(group_viewers['gender'] == 'F')
    pct_male = _row_share(group_viewers['gender'] == 'M')
    pct_children = _row_share(group_viewers['numberofchildren'] > 0)

    # rank the community's programs by summed engagement (highest first);
    # the first two are used to label the group
    program_engagement = (
        group_viewing.groupby(['contentsk', 'programname'])['engagement']
        .sum()
        .reset_index()
        .sort_values(by=['engagement'], ascending=False)
    )
    top_n = min(2, len(contentsks))

    group = {
    'top_programs':"|".join(program_engagement['programname'].iloc[:top_n])
    , 'program_count':n_contentsks
    , 'respondents':n_personkeys
    , 'viewers_thousands':int(group_viewing.count().iloc[0]/1000)
    , 'avg_engagement':round(group_viewing['engagement'].mean(),0)
    , 'median_engagement':round(group_viewing['engagement'].median(),0)
    , 'median_age':group_viewers['age'].median()
    , 'median_hh_income':group_viewers['householdincome'].median()
    , 'pct_children':pct_children
    , 'pct_female':pct_female
    , 'pct_male':pct_male
    , 'all_programs':'|'.join(program_engagement['programname'])
    , 'contentsks':program_engagement['contentsk'].tolist()
    , 'personkeys':personkeys
    }

    return(group)

In [None]:
# this function allows for recursion so that larger communities can be broken down to smaller ones
# to build a hierarchy
def build_community_groups(raw_communities, parent_group_id = "", group_level = 0):
    """Summarise communities and recursively split the large ones into sub-communities.

    Uses the module-level graph ``G``, ``build_community_group`` and
    ``community.greedy_modularity_communities``.

    Parameters
    ----------
    raw_communities : sequence of iterables of str
        Communities (collections of node ids) found at this level.
    parent_group_id : str, optional
        ``group_id`` of the community these were split from; "" at the top level.
    group_level : int, optional
        Depth in the hierarchy; 0 at the top level.

    Returns
    -------
    pandas.DataFrame
        One row per community at this level followed by the rows of all descendant
        communities, with 'group_id', 'parent_group_id' and 'group_level' columns added.
        Row indices are not reset, so index values repeat across levels.
    """
    print("parent_group_id = {0}, len = {1}".format(parent_group_id, len(raw_communities)))

    # summarise every community exactly once (this list used to be computed twice)
    community_groups = [build_community_group(raw_community) for raw_community in raw_communities]

    df_community_groups = pd.DataFrame(community_groups, columns=['top_programs'
                                                ,'program_count'
                                                ,'respondents'
                                                ,'viewers_thousands'
                                                ,'avg_engagement'
                                                ,'median_engagement'
                                                ,'median_age'
                                                ,'median_hh_income'
                                                ,'pct_children'
                                                ,'pct_female'
                                                ,'pct_male'
                                                , 'all_programs'
                                                , 'contentsks'
                                                , 'personkeys'
                                                ])

    # short random id: the first dash-separated segment of a uuid4
    df_community_groups['group_id'] = [str(uuid.uuid4()).split('-')[0] for _ in range(len(df_community_groups.index))]
    df_community_groups['parent_group_id'] = parent_group_id
    df_community_groups['group_level'] = group_level

    # if the community has a certain amount of viewers and programs then continue to break it down
    splittable = df_community_groups[(df_community_groups['respondents']>20) & (df_community_groups['program_count']>10)]

    sub_community_groups = []

    # visit EVERY qualifying community; the earlier `range(0, n - 1)` skipped the last one
    # (and therefore never recursed when only one community qualified)
    for contentsks, personkeys, group_id in zip(splittable['contentsks'],
                                                splittable['personkeys'],
                                                splittable['group_id']):
        subG = G.subgraph(contentsks + personkeys)
        sub_communities = community.greedy_modularity_communities(subG)

        # a sub-graph that comes back as a single community cannot be broken down further;
        # recursing on it would repeat the same split forever
        if len(sub_communities) < 2:
            continue

        sub_community_groups.append(
            build_community_groups(sub_communities, parent_group_id = group_id, group_level = group_level + 1)
        )

    if sub_community_groups:
        # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row order and indices
        df_community_groups = pd.concat([df_community_groups] + sub_community_groups)

    return(df_community_groups)

In [None]:
# first let's create our initial top-level viewer/content communities by running
# greedy modularity maximisation over the whole graph
# this can take a while depending on your local machine's resources
initial_communities = community.greedy_modularity_communities(G)

In [None]:
# now let's recurse through the communities and break them down to build a detailed taxonomy;
# the result is one table holding the groups produced at every level
community_groups = build_community_groups(initial_communities)

In [None]:
# keep only the communities with more than 5 respondents (i.e. drop those with fewer than 6)
has_enough_respondents = community_groups['respondents'] > 5
community_groups = community_groups[has_enough_respondents]

In [None]:
# take a peek at the first rows of the community table
community_groups.head()

In [None]:
# summary statistics showing the range of each numeric community attribute
community_groups.describe()

Now we'll save the following datasets, which we will reload into our database so that we can join them with other data to do more analysis on our viewing taxonomy; the output files can also be used for analysis in other notebooks.

In [None]:
# write the community table with an explicit column order; the row index is not written
export_columns = [
    'top_programs',
    'program_count',
    'respondents',
    'viewers_thousands',
    'avg_engagement',
    'median_engagement',
    'median_age',
    'median_hh_income',
    'pct_children',
    'pct_female',
    'pct_male',
    'personkeys',
    'contentsks',
    'group_id',
    'parent_group_id',
    'group_level',
    'all_programs',
]
community_groups[export_columns].to_csv("./output/viewing_communities.csv", index=False, index_label=False)


In [None]:
# write the de-duplicated viewer table to the output directory, without the row index
viewers.to_csv('./output/viewers.csv', index = False)

In [None]:
# write the de-duplicated program table to the output directory, without the row index
programs.to_csv('./output/programs.csv', index = False)