In [1]:
# Need to force reloading of modules before execution
%load_ext autoreload
%autoreload 2

import pickle

%aimport -pickle

## Import external variables / functions

In [2]:
# Load the pickled inputs (presumably written by the wiki_scrape and
# process_votes steps -- confirm). The context managers close each file
# handle as soon as it has been read instead of leaving it to the
# garbage collector.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("wiki_scrape.p", "rb") as wiki_scrape_file:
    seasons = pickle.load(wiki_scrape_file)
with open("process_votes.p", "rb") as process_votes_file:
    voteweights = pickle.load(process_votes_file)

## Functions: Episode Scores

In [3]:
%%writefile episode_scores.py

import process_votes
import make_graphs
import network
import numpy as np
import pickle

def get_season_stats(votes):
    """Summarise the layout of a season's vote matrix.

    Columns whose first-row value is not "Jury Vote" are counted as
    episodes. The remaining columns are treated as jury-vote columns (one
    per finalist) and are taken from the trailing end of `votes`.

    Parameters
    ----------
    votes : pandas.DataFrame
        Vote matrix whose first row holds the string "Jury Vote" in every
        jury column.

    Returns
    -------
    dict
        'num_episodes'  -- number of columns not flagged "Jury Vote";
        'num_finalists' -- number of remaining (jury) columns;
        'num_jurors'    -- number of cells in the jury columns whose
                           stripped text equals the label of their column.
    """
    num_episodes = int((votes.iloc[0, :] != "Jury Vote").sum())
    num_finalists = votes.shape[1] - num_episodes

    # Slice forward from num_episodes rather than backward with
    # [-num_finalists:]: when there are no jury columns, [-0:] would select
    # *every* column and episode votes would be miscounted as jury votes.
    jury_columns = votes.columns[num_episodes:]
    jury_votes = votes[jury_columns].fillna('')

    num_jurors = sum(
        sum(1 for ballot in jury_votes[finalist] if ballot.strip() == finalist)
        for finalist in jury_votes
    )
    return {'num_episodes':  num_episodes,
            'num_finalists': num_finalists,
            'num_jurors'   : num_jurors}
    
def scores_from_votes(votes):
    """Score a vote matrix and label each row as winner or non-winner.

    The vote matrix is passed through process_votes.compare_votes and
    make_graphs.make_graph to obtain a graph, which is then handed (with
    the original votes) to network.centrality_scores.

    Returns the dataframe produced by network.centrality_scores with its
    'place' column rewritten to 1 where place == 1 and 0 everywhere else.
    """
    # votes -> comparison -> graph
    vote_comparison = process_votes.compare_votes(votes)
    vote_graph = make_graphs.make_graph(vote_comparison)

    # Centrality measures computed on that graph
    centrality = network.centrality_scores(votes, vote_graph)

    # Collapse finishing place into a binary target: winner (1) / loser (0)
    is_winner = centrality['place'] == 1
    centrality['place'] = np.where(is_winner, 1, 0)

    return centrality

def map_prct_to_episode(prct, num_episodes):
    """Convert a season-completion fraction into a whole episode number.

    `prct` is multiplied by `num_episodes`, rounded to zero decimal places
    and returned as an int.
    """
    fractional_episode = prct * num_episodes
    nearest_episode = round(fractional_episode, 0)
    return int(nearest_episode)

def truncate_votes_thru_episode(votes, episode):
    """Return `votes` restricted to its first `episode` columns.

    Later columns (later episodes) are dropped; every row is kept. An
    earlier attempt to also drop the rows whose index label appears among
    the kept columns caused errors, so rows are deliberately left alone.
    """
    kept_columns = votes.columns[:episode]
    truncated = votes[kept_columns]
    return truncated

def process_season(votes, time_line_prct):
    """Compute season statistics plus scores at each time-line checkpoint.

    For every fraction in `time_line_prct` the vote matrix is truncated to
    the matching episode (fraction of the season's episode count) and
    scored with scores_from_votes.

    Returns the dict from get_season_stats with an added 'scores' entry
    that maps each fraction to its score dataframe.
    """
    stats = get_season_stats(votes)
    total_episodes = stats['num_episodes']

    scores_by_prct = {}
    stats['scores'] = scores_by_prct
    for fraction in time_line_prct:
        cutoff_episode = map_prct_to_episode(fraction, total_episodes)
        scores_by_prct[fraction] = scores_from_votes(
            truncate_votes_thru_episode(votes, cutoff_episode))

    return stats

def process_all_seasons(seasons, time_line_prct, save_to_disk=True):
    """Attach time-line features to every season, optionally pickling them.

    Parameters
    ----------
    seasons : dict
        Maps a season name to a dict holding at least a 'votes' entry.
        Each season dict gains a 'features' entry (the result of
        process_season); `seasons` is modified in place.
    time_line_prct : iterable of float
        Season-completion fractions forwarded to process_season.
    save_to_disk : bool, optional
        When true (the default), the updated `seasons` dict is pickled to
        "episode_scores.p" in the current working directory.

    Returns
    -------
    None
    """
    for season in seasons.keys():
        # Progress output. The parenthesised form behaves the same as a
        # Python 2 print statement and as the Python 3 print function.
        print(season)
        current_season = seasons[season]
        votes = current_season['votes']
        current_season['features'] = process_season(votes, time_line_prct)

    if save_to_disk:
        # Context manager guarantees the file is flushed and closed.
        with open("episode_scores.p", "wb") as out_file:
            pickle.dump(seasons, out_file)
    # no return - mutates seasons

Overwriting episode_scores.py


In [4]:
# Import only the helpers this notebook calls instead of `import *`.
# numpy is imported explicitly because the "Score all seasons" cell uses
# `np`, which was previously available only because the star import leaked
# it out of episode_scores.
import numpy as np
from episode_scores import (process_all_seasons, scores_from_votes,
                            truncate_votes_thru_episode)

## Test Cases

### Full season

In [5]:
# Use one season as a fixture and score its complete vote matrix.
test_season = seasons['Borneo']
test_votes = test_season['votes']
scores_from_votes(test_votes)

  df = df.sort(['page', 'eig', 'deg'], ascending=[0, 0, 0])


Unnamed: 0,name,deg,close,btw,eig,page,place
4,Rudy,6,0.6,0.026,0.507,0.108,0
6,Susan,6,0.6,0.026,0.493,0.105,0
0,Richard,6,0.6,0.026,0.469,0.095,1
10,Kelly,8,0.652,0.271,0.354,0.093,0
14,Colleen,7,0.625,0.095,0.034,0.09,0
15,Jenna,11,0.789,0.405,0.093,0.087,0
13,Sean,6,0.6,0.026,0.359,0.074,0
1,Gervase,6,0.556,0.01,0.015,0.067,0
3,Gretchen,6,0.556,0.01,0.017,0.061,0
7,Joel,5,0.517,0.0,0.012,0.051,0


### Partial Season

In [6]:
# Score the same season using only the first 5 columns of its vote matrix.
scores_from_votes(truncate_votes_thru_episode(test_votes, 5))

Unnamed: 0,name,deg,close,btw,eig,page,place
4,Rudy,5,0.363,0.019,0.503,0.086,0
6,Susan,5,0.363,0.019,0.503,0.086,0
10,Kelly,5,0.363,0.095,0.189,0.069,0
3,Gretchen,5,0.333,0.0,-0.0,0.067,0
7,Joel,5,0.333,0.0,-0.0,0.067,0
14,Colleen,5,0.333,0.0,-0.0,0.067,0
15,Jenna,5,0.333,0.0,-0.0,0.067,0
0,Richard,5,0.363,0.019,0.374,0.063,1
5,B.B.,1,0.067,0.0,0.0,0.063,0
9,Greg,1,0.067,0.0,0.0,0.063,0


## Score all seasons

In [7]:
# In order to normalize seasons we will use a time line
# based on the percentage of season completion
# NOTE(review): the reason n = 10 failed is not recorded here -- confirm.
n = 8  # tried 10 first and that didn't work
# Fractions 1/n, 2/n, ..., 1.0; adding `1.` makes the array floating point,
# so the division is a true division.
time_line_prct = [i/n for i in np.arange(n) + 1.]
# Adds a 'features' entry to every season in place and, by default,
# pickles the result to episode_scores.p.
process_all_seasons(seasons, time_line_prct)

Palau
Tocantins
Borneo
Panama
Cambodia
Blood_vs._Water
Marquesas
Pearl_Islands
Vanuatu
The_Australian_Outback
Heroes_vs_Villains
Guatemala
China
Worlds_Apart
Thailand
The_Amazon
Cagayan
South_Pacific
One_World
Philippines
Caramoan
Gabon
Micronesia
Samoa
All-Stars
Nicaragua
Africa
San_Juan_del_Sur
Cook_Islands
Redemption_Island
Fiji


## Appendix

In [8]:
# %load https://gist.github.com/ajp619/ddaa0f35627b066ef528/raw/cbbd6c6c1cad286ba5a358b93fd94eddede7c4ba/qtutil.py
# silly utility to launch a qtconsole if one doesn't exist

# Switch for the qtconsole helper below. The second assignment overrides
# the first, so the console stays off unless that line is commented out.
consoleFlag = True
consoleFlag = False  # Turn on/off by commenting/uncommenting this line

import psutil

def returnPyIDs():
    """Return the set of PIDs whose process name contains "python".

    Processes that cannot be inspected are skipped.
    """
    pyids = set()
    for pid in psutil.pids():
        try:
            if "python" in psutil.Process(pid).name():
                pyids.add(pid)
        except psutil.Error:
            # Best effort: the process may have exited or be inaccessible
            # between listing and lookup. Only psutil's own errors are
            # ignored, so KeyboardInterrupt and real bugs still surface.
            continue
    return pyids

def launchConsole():
    """Run the ``%qtconsole`` magic and return the PID of the new process.

    The PID is found by comparing the python process IDs reported by
    returnPyIDs() before and after the magic runs.

    Raises
    ------
    AssertionError
        If the comparison does not yield exactly one new PID.
    """
    before_pyids = returnPyIDs()
    %qtconsole
    after_pyids = returnPyIDs()
    # PIDs present only in the second snapshot
    newid = after_pyids.difference(before_pyids)
    # NOTE(review): presumably this can fail if the console process has not
    # appeared yet, or if another python process starts in between -- confirm.
    assert len(newid) == 1
    return list(newid)[0]

# Show the PID stored in `qtid` if a previous run defined it; otherwise
# launch a console when consoleFlag is set. print(...) with a single
# argument behaves the same under Python 2 and Python 3.
try:
    print(qtid)
except NameError:
    if consoleFlag:
        qtid = launchConsole()
        print(qtid)

# Relaunch when the recorded console is no longer among the running python
# processes. `consoleFlag` is tested first, so an undefined `qtid` is never
# evaluated while the console is disabled.
if consoleFlag and (qtid not in returnPyIDs()):
    qtid = launchConsole()
    print(qtid)

In [9]:
# %load https://gist.github.com/ajp619/7dd388315fc824208654/raw/81be07b0e793208641182032e074dbe39bbfa08e/pyprint
def pyprint(myfile):
    """Return an IPython HTML object showing `myfile` syntax-highlighted.

    The file is read as text, highlighted with pygments' PythonLexer and
    HtmlFormatter, and the formatter's '.highlight' style definitions are
    embedded in a <style> tag ahead of the highlighted markup.
    """
    from pygments import highlight
    from pygments.lexers import PythonLexer
    from pygments.formatters import HtmlFormatter
    import IPython

    with open(myfile) as source_file:
        source_text = source_file.read()

    html_formatter = HtmlFormatter()
    css_rules = html_formatter.get_style_defs('.highlight')
    highlighted_markup = highlight(source_text, PythonLexer(), html_formatter)

    page = '<style type="text/css">{}</style>{}'.format(
        css_rules, highlighted_markup)
    return IPython.display.HTML(page)

In [10]:
# Render a source file with syntax highlighting.
# NOTE(review): this notebook writes episode_scores.py, yet process_votes.py
# is displayed here -- confirm which file is intended.
pyprint("process_votes.py")