In [1]:
# Need to force reloading of modules before execution
%load_ext autoreload
%autoreload 2

import pickle

%aimport -pickle

## Import external variables / functions

In [2]:
# Load the pickled inputs this notebook works on.
# NOTE: unpickling can execute arbitrary code, so only load files that were
# produced locally by this project.
# `with` closes each file handle as soon as its payload has been read.
with open("wiki_scrape.p", "rb") as scrape_file:
    seasons = pickle.load(scrape_file)
with open("process_votes.p", "rb") as votes_file:
    voteweights = pickle.load(votes_file)

## Functions: Episode Scores

In [3]:
%%writefile episode_scores.py

import process_votes
import make_graphs
import network
import numpy as np
import pickle
import pandas as pd

def get_season_stats(votes):
    """Summarise the layout of one season's vote table.

    A column counts as an episode when its first-row value is not the
    string "Jury Vote"; every other column is treated as a finalist's
    jury-vote column. Jury-vote columns are assumed to be the right-most
    columns of the table.

    Args:
        votes: pandas DataFrame of vote strings with at least one row.

    Returns:
        dict with the keys 'num_episodes', 'num_finalists' and
        'num_jurors'. 'num_jurors' is the number of jury-column cells
        whose stripped value equals the label of the column they sit in.
    """
    num_episodes = sum(votes.iloc[0, :] != "Jury Vote")
    num_finalists = votes.shape[1] - num_episodes

    # Slice from the front rather than with a negative offset:
    # columns[-num_finalists:] would select *every* column when there are
    # no finalists (because -0 == 0), mis-counting elimination votes.
    jury_columns = votes.columns[num_episodes:]
    jury_votes = votes[jury_columns].fillna('')

    num_jurors = sum(
        sum(vote.strip() == finalist for vote in jury_votes[finalist])
        for finalist in jury_votes
    )
    return {'num_episodes':  num_episodes,
            'num_finalists': num_finalists,
            'num_jurors'   : num_jurors}

def votes_correct_against(votes):
    """Tally, per row label of ``votes``, correct votes cast and votes received.

    Args:
        votes: pandas DataFrame of scraped vote strings. Judging by the
            comments below and by ``scores_from_votes``, each column label
            is presumably the name of the player eliminated in that vote
            and each row label a voter -- TODO confirm against the scraper.

    Returns:
        DataFrame built on ``votes.index`` with the columns
        'votes_correct' and 'votes_against', both initialised to 0 and
        updated once per column of ``votes``.
    """

    l = len(votes.columns)

    # NOTE(review): the index is passed wrapped in a list; depending on the
    # pandas version this may build a different index type than votes.index
    # itself -- verify before upgrading pandas.
    tally = pd.DataFrame(index=[votes.index], columns=['votes_correct', 'votes_against'])
    tally = tally.fillna(0)
    episodes = []

    for i in range(0,l):

        # Count episode numbers
        # `episodes` grows to [0..i]; the columns are then re-selected by
        # label, and column i is picked out of that selection (again by label).
        episodes.append(i)
        episodes_cumulative = pd.DataFrame(votes[votes.columns[episodes]])
        current = episodes_cumulative[episodes_cumulative.columns[i]]

        # TODO: Split / remove vote-overs
        # Current workaround ignores these entire votes
        # A label lookup that yields more than one column (presumably a
        # duplicated column label) is replaced by a one-element placeholder
        # Series holding the string form of a zero array.
        if len(pd.DataFrame(current).columns) > 1:
            current = pd.Series(str(np.zeros(l)))

        # Remove whitespace in scraped values
        current = current.str.strip()

        # Votes correct (voted for eliminated player)
        # A vote is "correct" when the cell value equals the column's label.
        eliminated = current.name
        vcdf = pd.DataFrame(current, columns=[str(eliminated)])
        try:
            correct_vote = vcdf[vcdf[eliminated] == eliminated].index
        # NOTE(review): bare except -- any failure in the lookup above is
        # silently treated as "nobody voted correctly".
        except: 
            correct_vote=[]
        tally.loc[tally.index.isin(correct_vote), ['votes_correct']] = tally['votes_correct'] + 1

        # Votes against 
        # Count how often each value occurs in this column and add the count
        # to the tally rows whose index label appears among those values.
        # NOTE(review): the sample outputs in this notebook show
        # votes_against == 0 on every row; confirm that index labels and the
        # stripped vote values actually match so this update takes effect.
        va = current.value_counts()
        vadf = pd.DataFrame(va, columns=['votes'])
        tally.loc[tally.index.isin(vadf.index), ['votes_against']] = vadf['votes'] + tally['votes_against']

    return tally

def scores_from_votes(votes):
    """Build the per-player score table for a (possibly truncated) vote table.

    The votes are turned into a graph via ``process_votes.compare_votes``
    and ``make_graphs.make_graph``, scored with
    ``network.centrality_scores``, and joined with the tallies from
    ``votes_correct_against``. ``place`` is collapsed to 1 for first place
    and 0 otherwise. Rows whose stripped name is one of the column labels
    of ``votes`` are dropped.

    Args:
        votes: pandas DataFrame of votes; its column labels are treated as
            the names of eliminated players.

    Returns:
        DataFrame with the columns name, deg, close, btw, eig, page,
        votes_correct, votes_against and place.
    """
    column_order = ['name', 'deg', 'close', 'btw', 'eig', 'page',
                    'votes_correct', 'votes_against', 'place']
    eliminated_players = votes.columns

    # Vote matrix -> pairwise comparison -> graph -> centrality scores
    compared = process_votes.compare_votes(votes)
    vote_graph = make_graphs.make_graph(compared)
    scores = network.centrality_scores(votes, vote_graph)

    # Attach the correct / against vote tallies, keyed on player name
    scores = scores.join(votes_correct_against(votes), on='name')

    # Winner (place 1) -> 1, everyone else -> 0
    is_winner = scores['place'] == 1
    scores['place'] = np.where(is_winner, 1, 0)

    scores = scores[column_order]

    # Keep only the players that have not been eliminated yet
    still_playing = [
        player.strip() not in eliminated_players
        for player in scores['name']
    ]
    return scores.loc[still_playing, :]

def map_prct_to_episode(prct, num_episodes):
    """Convert a fraction of the season into a whole episode count.

    Args:
        prct: fraction of the season completed (e.g. 0.5 for half way).
        num_episodes: total number of episodes in the season.

    Returns:
        ``prct * num_episodes`` rounded with the built-in ``round`` and
        converted to ``int``. Small fractions can map to 0.
    """
    scaled = prct * num_episodes
    return int(round(scaled, 0))

def truncate_votes_thru_episode(votes, episode):
    """Return ``votes`` restricted to its first ``episode`` columns.

    Only columns are dropped. Rows are deliberately kept, including those
    of players who are already out: an earlier attempt to drop them as
    well caused errors.

    Args:
        votes: pandas DataFrame with one column per vote.
        episode: number of leading columns to keep.

    Returns:
        DataFrame with the same rows as ``votes`` and the selected columns.
    """
    kept_columns = votes.columns[:episode]
    return votes[kept_columns]

def process_season(votes, time_line_prct):
    """Score one season at each requested point of its time line.

    Args:
        votes: pandas DataFrame holding the season's full vote table.
        time_line_prct: iterable of season-completion fractions.

    Returns:
        The dict from ``get_season_stats`` extended with a 'scores' entry
        mapping each fraction to the score table computed from the votes
        truncated at the corresponding episode.
    """
    season_stats = get_season_stats(votes)
    num_episodes = season_stats['num_episodes']

    scores_by_prct = {}
    season_stats['scores'] = scores_by_prct
    for prct in time_line_prct:
        last_episode = map_prct_to_episode(prct, num_episodes)
        votes_so_far = truncate_votes_thru_episode(votes, last_episode)
        scores_by_prct[prct] = scores_from_votes(votes_so_far)

    return season_stats

def process_all_seasons(seasons, time_line_prct, save_to_disk=True):
    """Compute the score features for every season, in place.

    Each season name is printed as a progress marker, then the result of
    ``process_season`` is stored under that season's 'features' key.

    Args:
        seasons: mapping of season name -> dict with at least a 'votes'
            entry. It is mutated: a 'features' entry is added per season.
        time_line_prct: iterable of season-completion fractions, passed
            through to ``process_season``.
        save_to_disk: when true, pickle the updated ``seasons`` mapping to
            "episode_scores.p" in the current working directory.

    Returns:
        None. Results are delivered by mutating ``seasons``.
    """
    for season in seasons.keys():
        # Call form of print behaves the same on Python 2 and Python 3.
        print(season)
        current_season = seasons[season]
        votes = current_season['votes']
        current_season['features'] = process_season(votes, time_line_prct)

    if save_to_disk:
        # The context manager flushes and closes the file even if
        # pickling raises part-way through.
        with open("episode_scores.p", "wb") as out_file:
            pickle.dump(seasons, out_file)
    # no return - mutates seasons

Overwriting episode_scores.py


In [4]:
from episode_scores import *

## Test Cases

### Full season

In [5]:
# Smoke test on a single season. With p = 1 the time line maps to the last
# episode column, so the whole season (minus jury-vote columns) is scored.
test_season = seasons['Borneo']
test_votes = test_season['votes']
p = 1
# 'scores' is keyed by the requested fractions, hence the lookup with p.
process_season(test_votes, [p])['scores'][p]

  df = df.sort(['page', 'eig', 'deg'], ascending=[0, 0, 0])


Unnamed: 0,name,deg,close,btw,eig,page,votes_correct,votes_against,place
10,Kelly,8,0.652,0.271,0.373,0.096,5,0,0
0,Richard,6,0.6,0.026,0.468,0.095,8,0,1


### Partial season

In [6]:
# Re-uses test_votes from the full-season cell; p = 0.5 scores the season
# using roughly the first half of its episode columns.
p = 0.5
process_season(test_votes, [p])['scores'][p]

Unnamed: 0,name,deg,close,btw,eig,page,votes_correct,votes_against,place
15,Jenna,11,0.789,0.543,0.199,0.096,4,0,0
4,Rudy,6,0.6,0.026,0.486,0.095,5,0,0
6,Susan,6,0.6,0.026,0.486,0.095,5,0,0
10,Kelly,7,0.625,0.248,0.346,0.086,3,0,0
14,Colleen,6,0.556,0.01,0.082,0.085,3,0,0
0,Richard,6,0.6,0.026,0.428,0.08,4,0,1
13,Sean,6,0.6,0.026,0.338,0.066,3,0,0
1,Gervase,6,0.556,0.01,0.048,0.062,1,0,0


## Score all seasons

In [7]:
# In order to normalize seasons we will use a time line
# based on the percentage of season completion
n = 8  # tried 10 first and that didn't work
# Evenly spaced fractions 1/n, 2/n, ..., 1.0; each one becomes a key of the
# per-season 'scores' dict.
# NOTE(review): `np` is not imported by this notebook directly; it is
# presumably picked up through `from episode_scores import *` -- confirm.
time_line_prct = [i/n for i in np.arange(n) + 1.]
# Mutates `seasons` in place and, by default, also pickles it to
# "episode_scores.p".
process_all_seasons(seasons, time_line_prct)

Palau
Tocantins
Borneo
Panama
Cambodia
Blood_vs._Water
Marquesas
Pearl_Islands
Vanuatu
The_Australian_Outback
Heroes_vs_Villains
Guatemala
China
Worlds_Apart
Thailand
The_Amazon
Cagayan
South_Pacific
One_World
Philippines
Caramoan
Gabon
Micronesia
Samoa
All-Stars
Nicaragua
Africa
San_Juan_del_Sur
Cook_Islands
Redemption_Island
Fiji


### Random test

In [8]:
# Spot-check one season: 0.875 is 7/8, one of the time_line_prct keys when n = 8.
seasons['Tocantins']['features']['scores'][0.875]

Unnamed: 0,name,deg,close,btw,eig,page,votes_correct,votes_against,place
1,Stephen,10,0.704,0.074,0.465,0.116,9,0,0
6,J.T.,11,0.751,0.122,0.433,0.109,7,0,1
14,Taj,10,0.704,0.074,0.402,0.095,7,0,0
8,Erinn,9,0.663,0.035,0.301,0.086,5,0,0


## Appendix

In [9]:
# %load https://gist.github.com/ajp619/ddaa0f35627b066ef528/raw/cbbd6c6c1cad286ba5a358b93fd94eddede7c4ba/qtutil.py
# silly utility to launch a qtconsole if one doesn't exist

# When True, the code at the end of this cell launches a qtconsole unless
# the one recorded in `qtid` is still among the running python processes.
consoleFlag = True
# consoleFlag = False  # Turn on/off by commenting/uncommenting this line

import psutil

def returnPyIDs():
    """Return the set of PIDs whose process name contains "python".

    Processes that cannot be inspected (they exited during the scan,
    access was denied, ...) are skipped, so the result is best-effort.
    """
    pyids = set()
    for pid in psutil.pids():
        try:
            if "python" in psutil.Process(pid).name():
                pyids.add(pid)
        except psutil.Error:
            # Base class of psutil's NoSuchProcess / AccessDenied /
            # ZombieProcess. Unlike a bare except, this no longer hides
            # KeyboardInterrupt or genuine programming errors.
            continue
    return pyids

def launchConsole():
    before_pyids = returnPyIDs()
    %qtconsole
    after_pyids = returnPyIDs()
    newid = after_pyids.difference(before_pyids)
    assert len(newid) == 1
    return list(newid)[0]

# Show the PID of the console launched earlier in this kernel session, or
# launch one if `qtid` has never been set. The call form of print behaves
# the same on Python 2 and Python 3.
try:
    print(qtid)
except NameError:
    if consoleFlag:
        qtid = launchConsole()
        print(qtid)

# Relaunch when the recorded console is no longer among the running python
# processes. `consoleFlag` is tested first so `qtid` is never read when the
# feature is switched off (it may be undefined in that case).
if consoleFlag and (qtid not in returnPyIDs()):
    qtid = launchConsole()
    print(qtid)

10584


In [10]:
# %load https://gist.github.com/ajp619/7dd388315fc824208654/raw/81be07b0e793208641182032e074dbe39bbfa08e/pyprint
def pyprint(myfile):
    """Return an IPython HTML object showing ``myfile`` syntax-highlighted.

    The file is read as text, highlighted as Python with pygments, and
    wrapped together with the formatter's '.highlight' style sheet.

    Args:
        myfile: path of the source file to display.
    """
    import IPython
    from pygments import highlight
    from pygments.formatters import HtmlFormatter
    from pygments.lexers import PythonLexer

    with open(myfile) as source_file:
        source_text = source_file.read()

    html_formatter = HtmlFormatter()
    style_sheet = html_formatter.get_style_defs('.highlight')
    highlighted = highlight(source_text, PythonLexer(), html_formatter)
    page = '<style type="text/css">{}</style>{}'.format(style_sheet, highlighted)
    return IPython.display.HTML(page)

In [11]:
# Render the source of process_votes.py with syntax highlighting.
pyprint("process_votes.py")