In [None]:
# Stretch the notebook container to the full browser width.
# HTML is taken from IPython.display, the public display API, instead of the
# internal IPython.core.display module path.
from IPython.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
import sys
sys.path.insert(0,"../../python/")
from rg17 import evaluate_toplist as et
from rg17 import visualization as visu
from rg17 import co_occ_scores as coos

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font="DejaVu Sans")

In [None]:
from datawand.parametrization import ParamHelper
ph = ParamHelper("../../", "TrendApproximation", sys.argv)

In [None]:
player_name_with_account_file_path = ph.get("player_name_with_accounts_file_path")
schedule_file_path = ph.get("schedule_file_path")
w2v_model_dir = ph.get("w2v_root_folder")
experiment_dir = ph.get("experiment_dir")
TIME_HOUR_VALS = ph.get("time_hour_vals")
RELEVANCE_TYPE = ph.get("relevance_type")
RELEVANCE_SUBSET = ph.get("relevance_subset")

# Load Player Accounts

In [None]:
with open(player_name_with_account_file_path) as f:
    player_account_map = json.load(f)

In [None]:
# Pin a single account handle for selected players. Each entry replaces whatever
# the loaded account file listed for that player; handles are stored lower-cased
# as a one-element list.
main_account_overrides = [
    ("Rafael Nadal", "RafaelNadal"),
    ("Stan Wawrinka", "stanwawrinka"),
    ("Andy Murray", "andy_murray"),
    ("Dominic Thiem", "ThiemDomi"),
    ("Novak Djokovic", "DjokerNole"),
    ("Elina Svitolina", "ElinaSvitolina"),
    ("Pablo Carreno Busta", "pablocarreno91"),
    ("Timea Bacsinszky", "TimeaOfficial"),
    ("Jelena Ostapenko", "OstapenkoFC"),
    ("Simona Halep", "Simona_Halep"),
    ("Karolina Pliskova", "KaPliskova"),
    ("Kristina Mladenovic", "KikiMladenovic"),
    ("Caroline Wozniacki", "CaroWozniacki"),
    ("Caroline Garcia", "CaroGarcia"),
    ("Kei Nishikori", "keinishikori"),
    ("Marin Cilic", "cilic_marin"),
]
for full_name, handle in main_account_overrides:
    player_account_map[full_name] = [handle.lower()]

# Load Schedule

In [None]:
schedule_df = pd.read_csv(schedule_file_path, sep="|")

In [None]:
excluded_categories = ["boy", "girl", "wheelchair", "legends over 45"]

## Convert start dates to UTC for the proper evaluation

In [None]:
schedule_df["startDate"].value_counts()

In [None]:
utc_hour_map = {
    "11:00 AM" : 9,
    "10:00 AM" : 8,
    "12:00 PM" : 10,
    "2:00 PM" : 12,
    "11:30 AM" : 10, # hour was rounded up
    "3:00 PM" : 13,
    "12:45 PM" : 11 # hour was rounded up
}

In [None]:
schedule_df["utc_start_hour"] = schedule_df["startDate"].apply(lambda x: utc_hour_map[x])

In [None]:
schedule_df["utc_start_hour"].value_counts()

# Filter Schedule

   * only Single matches are kept
   * only important categories are kept (Men's, Women's, Legends under 45)

In [None]:
def filter_categories(match_cat, excluded_cats=excluded_categories):
    """Decide whether a schedule row should be kept, based on its match header.

    The header is compared case-insensitively. It is kept only when it contains
    both "final" and "single" and none of the substrings in excluded_cats.

    Returns True to keep the row, False to drop it.
    """
    header = match_cat.lower()
    is_single_final = "final" in header and "single" in header
    has_excluded_category = any(banned in header for banned in excluded_cats)
    return is_single_final and not has_excluded_category

In [None]:
finals_df = schedule_df[schedule_df["matchHeader"].apply(filter_categories)]

In [None]:
len(schedule_df), len(finals_df)

## Single finals

   * **Canceled** matches are not excluded because people may talk about these events as well.

In [None]:
finals_df

In [None]:
# Group the names of both players of every kept match by match date.
players_by_date = {}
for idx, row in finals_df.iterrows():
    date = row["date"]
    player_1 = row["playerName active"]
    player_2 = row["playerName opponent"]
    # create the day's list on first sight, then append both players to it
    players_by_date.setdefault(date, []).extend([player_1, player_2])
players_by_date

# Player name parts

In [None]:
players = list(set(finals_df["playerName active"]).union(finals_df["playerName opponent"]))

In [None]:
len(players)

In [None]:
# For every player of the kept matches, store the lower-cased tokens of the name
# and the player's account names prefixed with "@".
player_info_map = {}
for player in players:
    name_parts = [token.lower() for token in player.split()]
    accounts = [
        "@" + et.transform_account_name(acc, remove_digits=False, remove_under_score=False, to_lower=False)
        for acc in player_account_map[player]
    ]
    player_info_map[player] = dict(name_parts=name_parts, accounts=accounts)

## Show multi-account players

In [None]:
for player, info in player_info_map.items():
    if len(info["accounts"]) > 1:
        print(player, info["accounts"])

In [None]:
player_info_map

# Co-occurrences

In [None]:
pair_occs_df = pd.read_csv("%s/occ_pairs_with_scores.csv" % experiment_dir, sep="|")

### converting string to dict

# Parse the stringified values of both columns back into Python objects.
# NOTE(review): eval executes whatever expression the CSV contains. If the file is
# not fully trusted, ast.literal_eval would be the safer parser -- confirm that the
# stored values are plain literals before switching.
pair_occs_df["global_val"] = pair_occs_df["global_val"].apply(eval)
pair_occs_df["snapshot_val"] = pair_occs_df["snapshot_val"].apply(eval)

In [None]:
pair_occs_df.head(2)

In [None]:
word_2_set = set(pair_occs_df["word_2"].unique())
word_1_set = set(pair_occs_df["word_1"].unique())
len(word_1_set), len(word_2_set)

In [None]:
def show_missing_words(info_key, word_set):
    """Print, per player, the info_key entries that do not occur in word_set.

    info_key selects the list inside each player_info_map entry (for example
    "name_parts" or "accounts"). Players with nothing missing print nothing.
    """
    for player, info in player_info_map.items():
        absent = set(info[info_key]).difference(word_set)
        if not absent:
            continue
        print("%s: %s missing!" % (player, list(absent)))
            
def show_matching_words(info_key, word_set):
    """Print, for every player, the info_key entries that also occur in word_set.

    A line is printed for each player, even when the list of matches is empty.
    """
    for player in player_info_map:
        present = set(player_info_map[player][info_key]).intersection(word_set)
        print("%s: %s" % (player, list(present)))

## a.) Checking names (All names are present)

In [None]:
show_missing_words("name_parts", word_2_set)

In [None]:
show_missing_words("name_parts", word_1_set)

show_matching_words("name_parts", word_1_set)

## b.) Checking account names (All main accounts are present)

In [None]:
show_missing_words("accounts", word_2_set)

In [None]:
show_missing_words("accounts", word_1_set)

show_matching_words("accounts", word_1_set)

# Relevant player words

In [None]:
finals_df.head(2)

In [None]:
finals_df[finals_df["matchScore"] == "Cancelled"]

In [None]:
def list2relevance(values, relevance):
    """Build a dictionary assigning the same relevance score to every item of values.

    The scores are taken from a NumPy float array, so every dictionary value is
    a NumPy float. An empty values sequence gives an empty dictionary.
    """
    scores = relevance * np.ones(len(values))
    return {item: score for item, score in zip(values, scores)}

# TODO: Other relevant events

   * birthday for nadal
   * injuries for other players?
   * weather (rain, etc.)?
   * cancelled match?

### Score values

In [None]:
# Relevance levels used when labelling words. In "binary" mode every positive
# category gets the same weight; otherwise the categories are graded
# (opponent > winner/loser information > match information).
if RELEVANCE_TYPE == "binary":
    OPPONENT_RELEVANCE, WINNER_INFO_RELEVANCE, MATCH_INFO_RELEVANCE = 1.0, 1.0, 1.0
else:
    OPPONENT_RELEVANCE, WINNER_INFO_RELEVANCE, MATCH_INFO_RELEVANCE = 5.0, 4.0, 3.0
# These two levels do not depend on the relevance type.
OTHER_PLAYER_RELEVANCE = 0.0
COMMON_WORD_RELEVANCE = -1.0

### Winner Synonyms

In [None]:
winner_synonyms = ["win","won","winner","victori","triumph"]

### Loser Synonyms

In [None]:
loser_synonyms = ["lose", "lost", "beaten"]

### Assign relevances

In [None]:
def get_relevance_record(relevance_subset, date, time, utc_hour, winner, loser, score):
    """Build the relevance records of one match for one snapshot hour.

    One record is produced for every name part and every account of both
    players. Each record is the list
    [date, "HH:00", player, is_winner, key word, the player's remaining key words,
     relevance of outcome and match-stage words, opponent,
     relevance of the opponent's words, relevance of the day's other players' words].

    is_winner is 1 for `winner` and 0 for `loser`; when score == "Cancelled" it
    is None for both players and no outcome word is scored. Outcome words are
    scored only if time > utc_hour, and the negatively scored counterparts are
    added unless relevance_subset == "positive".
    """
    # name parts and accounts of everybody else playing on this date
    daily_player_keys = []
    for other in players_by_date[date]:
        if other == winner or other == loser:
            continue
        other_info = player_info_map[other]
        daily_player_keys += other_info["name_parts"]
        daily_player_keys += other_info["accounts"]

    cancelled = score == "Cancelled"
    pairings = [
        (winner, loser, None if cancelled else 1),
        (loser, winner, None if cancelled else 0),
    ]
    # outcome words only count in snapshots later than the match start hour
    outcome_known = time > utc_hour
    with_negatives = relevance_subset != "positive"
    # match-stage words, selected by the date of the match
    stage_words_by_dates = (
        (("2017-06-06", "2017-06-07"), ["quarter", "final"]),
        (("2017-06-08", "2017-06-09"), ["semi", "final"]),
        (("2017-06-10", "2017-06-11"), ["final"]),
    )
    snapshot_label = "%.2i:00" % time

    res = []
    for p1, p2, is_winner in pairings:
        own_keys = player_info_map[p1]["name_parts"] + player_info_map[p1]["accounts"]
        opponent_keys = player_info_map[p2]["name_parts"] + player_info_map[p2]["accounts"]
        for w in own_keys:
            # all key words of this player except the current one
            key_exclude = list(own_keys)
            key_exclude.remove(w)
            player_relevant = dict()
            if outcome_known:
                # a cancelled match (is_winner None) matches neither branch
                if is_winner == 1:
                    player_relevant.update(list2relevance(winner_synonyms, WINNER_INFO_RELEVANCE))
                    if with_negatives:
                        player_relevant.update(list2relevance(loser_synonyms, -WINNER_INFO_RELEVANCE))
                elif is_winner == 0:
                    if with_negatives:
                        player_relevant.update(list2relevance(winner_synonyms, -WINNER_INFO_RELEVANCE))
                    player_relevant.update(list2relevance(loser_synonyms, WINNER_INFO_RELEVANCE))
            for stage_dates, stage_words in stage_words_by_dates:
                if date in stage_dates:
                    player_relevant.update(list2relevance(stage_words, MATCH_INFO_RELEVANCE))
                    break
            res.append([
                date,
                snapshot_label,
                p1,
                is_winner,
                w,
                key_exclude,
                player_relevant,
                p2,
                list2relevance(opponent_keys, OPPONENT_RELEVANCE),
                list2relevance(daily_player_keys, OTHER_PLAYER_RELEVANCE),
            ])
    return res

In [None]:
# Build one relevance record per (match, snapshot hour, player key word).
relevant_infos = []
for idx, row in finals_df.iterrows():
    # "playerName active" is passed on as the winner and "playerName opponent" as the loser
    date, utc_hour, winner, loser, score = row["date"], row["utc_start_hour"], row["playerName active"], row["playerName opponent"], row["matchScore"]
    # evaluate the same match at every configured snapshot hour
    for time in TIME_HOUR_VALS:
        relevant_infos += get_relevance_record(RELEVANCE_SUBSET, date, time, utc_hour, winner, loser, score)
# column order follows the layout of the lists returned by get_relevance_record
relevant_df = pd.DataFrame(relevant_infos, columns=["date","time","player","is_winner","key_word", "key_exclude_words", "key_relevant_words", "opponent","opp_relevant_words", "other_player_relevant"])

In [None]:
relevant_df.head(5)

In [None]:
len(relevant_df)

## Saving daily keywords to file

In [None]:
aggr_keyword_by_day = relevant_df.groupby(by="date")["key_word"].aggregate(lambda x: set(x))
aggr_keyword_by_day.to_csv(ph.get("keywords_for_eval_path"), sep="|")

### Sanity check: wawrinka loser information has relevance only in snapshots after match start

In [None]:
relevant_df[relevant_df["date"]=="2017-06-11"]

# Experimental Scores

In [None]:
len(pair_occs_df)

### word_2 frequencies (maybe further tuning is needed...)

In [None]:
freq_factor = coos.get_word2_frequency_factor(pair_occs_df)

In [None]:
for w in ["tsonga","play","win","nadal","rg17","rolandgarros","french","open"]:
    print(w, freq_factor[w])

In [None]:
%%time
coos.calculate_frequency_val(pair_occs_df, freq_factor)

## a.) Calculate normalization coefficient (r)

In [None]:
snapshot_weight = ph.get("snapshot_weight")
frequency_weight = ph.get("frequency_weight")
print(snapshot_weight, frequency_weight)

In [None]:
%time
coos.calculate_r(pair_occs_df, snapshot_weight, frequency_weight)

In [None]:
pair_occs_df["r"].hist(bins=100)

## b.) rel_count_c

In [None]:
score_vals = ph.get("score_c_vals")
print(score_vals)

In [None]:
%%time
for c in score_vals:
    coos.calculate_rel_count(pair_occs_df, c=c)

In [None]:
pair_occs_df["rel_count_c5"].hist(bins=100)

## c.) norm_c

In [None]:
%%time
for c in score_vals:
    coos.calculate_norm(pair_occs_df, c=c)

In [None]:
pair_occs_df["norm_c5"].hist(bins=100)

## d.) using Rayleigh for word frequency normalization

def rayleigh(x, s=1.0, exp=1, N=1.0):
    """Evaluate x / s**2 * exp(-(x**exp) / (2 * s**2)) / N.

    With exp=2 this is the Rayleigh density with scale s, divided by N.
    x may be a scalar or a NumPy array; the result has the same shape.
    """
    variance = s ** 2
    damping = np.exp(-1.0 / (2 * variance) * np.power(x, exp))
    return x / variance * damping * (1.0 / N)

x = np.arange(0,1,0.01)
plt.figure(figsize=(20,5))
plt.plot(x,rayleigh(x, s=0.12, exp=2.0, N=10.0), label="s=0.12, exp=2.0")
plt.legend()

pair_occs_df["rayleigh"] = pair_occs_df["word_2"].apply(lambda x: rayleigh(word_2_counts_norm[x], s=0.15, exp=2.0, N=10.0))

for c in score_vals: 
    pair_occs_df["norm_c%i_plus_ray" % c] = pair_occs_df["norm_c%i" % c] + pair_occs_df["rayleigh"]
    pair_occs_df["rel_count_c%i_plus_ray" % c] = pair_occs_df["rel_count_c%i" % c] + pair_occs_df["rayleigh"]
    print(c)

# Visualization of word statistics

import plotly.plotly as py
import plotly.graph_objs as go

def get_x_y_simple(words):
    """Return the plot coordinates of words as (x_arr, y_arr).

    x is the "global_val" entry of word_2_stats for each word and y is the
    word's entry in word_2_counts. Both lookups are by word label.
    NOTE(review): word_2_stats and word_2_counts are not defined in the visible
    part of the notebook -- they must exist before this is called.
    """
    # .loc is the label-based indexer; the older .ix accessor was removed from pandas
    x_arr = [word_2_stats.loc[w]["global_val"] for w in words]
    y_arr = [word_2_counts[w] for w in words]
    return x_arr, y_arr
    
def get_x_y_ray(words):
    """Return the plot coordinates of words as (x_arr, y_arr), with a Rayleigh-weighted y.

    x is the "global_val" entry of word_2_stats for each word and y is
    rayleigh(...) applied to the word's entry in word_2_counts_norm.
    NOTE(review): word_2_stats and word_2_counts_norm are not defined in the
    visible part of the notebook -- they must exist before this is called.
    """
    # .loc is the label-based indexer; the older .ix accessor was removed from pandas
    x_arr = [word_2_stats.loc[w]["global_val"] for w in words]
    y_arr = [rayleigh(word_2_counts_norm[w], s=0.15, exp=2.0, N=10.0) for w in words]
    return x_arr, y_arr

## Evaluated word statistics

### i. ) without rayleigh

pos_words = ["game","play","match","quarter","final"] + winner_synonyms + loser_synonyms + ["king"]
neg_words = ["rolandgarros","frenchopen","clay","slam","set","round","rg17","rolandgarros2017","french","open","rg2017"]
player_words = list(relevant_df["key_word"])

word_sets = [
    ("players", player_words, "blue", 5),
    ("positive relevance", pos_words, "green", 10),
    ("negative relevance", neg_words, "red", 10)
]

data = []
for ws in word_sets:
    label, w_set, color, size = ws
    x_arr, y_arr = get_x_y_simple(w_set)
    data.append(visu.get_trace(label, x_arr, y_arr, w_set, color, size))
fig = go.Figure(data=data, layout=visu.get_layout('Word statistics simple','global_val','word_2 frequency'))
py.iplot(fig, filename='eval_word_stats_simple')

### ii.) with rayleigh

pos_words = ["game","play","match","quarter","final"] + winner_synonyms + loser_synonyms + ["winner","king"]
neg_words = ["rolandgarros","frenchopen","clay","slam","set","round","rg17","rolandgarros2017"] + ["french","open","rg2017"]
player_words = list(relevant_df["key_word"])

word_sets = [
    ("players", player_words, "blue", 5),
    ("positive relevance", pos_words, "green", 10),
    ("negative relevance", neg_words, "red", 10)
]

data = []
for ws in word_sets:
    label, w_set, color, size = ws
    x_arr, y_arr = get_x_y_ray(w_set)
    data.append(visu.get_trace(label, x_arr, y_arr, w_set, color, size))
fig = go.Figure(data=data, layout=visu.get_layout('Word statistics with rayleigh','global_val','ray(normalized word_2 frequency)'))
py.iplot(fig, filename='eval_word_stats_ray')

## All word statistics

### i. ) without rayleigh

pos_neg_words = list(np.union1d(pos_words, neg_words))
relevant_words = np.union1d(pos_neg_words, player_words)
irrelevant_words = list(set(word_2_counts.index).difference(set(relevant_words)))

word_sets = [
    ("irrelevant", irrelevant_words, "yellow", 5),
    ("pos-neg", pos_neg_words, "green", 10),
    ("players", player_words, "blue", 7)
]

data = []
for ws in word_sets:
    label, w_set, color, size = ws
    x_arr, y_arr = get_x_y_simple(w_set)
    data.append(visu.get_trace(label, x_arr, y_arr, w_set, color, size))
fig = go.Figure(data=data, layout=visu.get_layout('Word statistics simple','global_val','word_2 frequency'))
py.iplot(fig, filename='all_word_stats_simple')

### ii.) with rayleigh

pos_neg_words = list(np.union1d(pos_words, neg_words))
relevant_words = np.union1d(pos_neg_words, player_words)
irrelevant_words = list(set(word_2_counts.index).difference(set(relevant_words)))

word_sets = [
    ("irrelevant", irrelevant_words, "yellow", 5),
    ("pos-neg", pos_neg_words, "green", 10),
    ("players", player_words, "blue", 7)
]

data = []
for ws in word_sets:
    label, w_set, color, size = ws
    x_arr, y_arr = get_x_y_ray(w_set)
    data.append(visu.get_trace(label, x_arr, y_arr, w_set, color, size))
fig = go.Figure(data=data, layout=visu.get_layout('Word statistics with rayleigh','global_val','ray(normalized word_2 frequency)'))
py.iplot(fig, filename='all_word_stats_ray')

# Load Word2Vec models

In [None]:
w2v_models = et.load_w2v_models("%s/dim_%i/" % (w2v_model_dir, ph.get("w2v_model_dim")))

In [None]:
et.get_w2v_toplist(w2v_models, ["nadal"], ["2017-06-11T10:00"], top_k=10)

# Load Jaccard and Cosine distances

In [None]:
distance_root_folder = ph.get("distance_root_folder")
jaccard_distances = et.load_distance_model("%s/jaccard.dist" % distance_root_folder)
cosine_distances = et.load_distance_model("%s/cosine.dist" % distance_root_folder)

In [None]:
et.get_distance_toplist(jaccard_distances, ["nadal"], ["2017-06-11T10:00"], top_k=10)

In [None]:
et.get_distance_toplist(cosine_distances, ["nadal"], ["2017-06-11T10:00"], top_k=10)

In [None]:
et.get_distance_toplist(cosine_distances, ["@RafaelNadal"], ["2017-06-11T10:00"], top_k=10)

# Results

def get_ndcg_for_relevant_record(rel_rec, rel_cols, score_col, exclude_player_words=True, top_k=None, general_words=None, verbose=False):
    """Compute the NDCG of one score type's toplist for one relevance record.

    Parameters:
        rel_rec: record providing "date", "time", "key_word", the fields named
            in rel_cols and, optionally, "key_exclude_words".
        rel_cols: names of the record fields that hold relevance dictionaries;
            they are merged in the given order, later ones overriding earlier ones.
        score_col: "word_2_vec", "jaccard" or "cosine" select the corresponding
            model; any other value is used as a score column of pair_occs_df.
        exclude_player_words: if True and the record has "key_exclude_words",
            those words are excluded from the toplist.
        top_k: cut-off passed to the model toplists and to the NDCG computation.
        general_words: optional relevance dictionary, merged last.
        verbose: if True, print the predicted words and the merged relevances.

    Returns:
        Tuple (snapshot_id, date, time, score_col, key_word, ndcg_score).
    """
    time_id, key_word = rel_rec["time"], rel_rec["key_word"]
    snapshot_id = "%sT%s" % (rel_rec["date"], time_id)
    # merge all relevance dictionaries of the record
    relevant_words = dict()
    for rc in rel_cols:
        relevant_words.update(rel_rec[rc])
    # identity check: "!= None" would invoke the object's own comparison operator
    if general_words is not None:
        relevant_words.update(general_words)
    # words to be excluded from the toplist
    if exclude_player_words and "key_exclude_words" in rel_rec:
        to_be_excluded = rel_rec["key_exclude_words"]
    else:
        to_be_excluded = None
    # get the toplist of the requested score type
    if score_col == "word_2_vec":
        toplist = et.get_w2v_toplist(w2v_models, [key_word], [snapshot_id], top_k=top_k, excluded_words=to_be_excluded)
    elif score_col == "jaccard":
        toplist = et.get_distance_toplist(jaccard_distances, [key_word], [snapshot_id], top_k=top_k, excluded_words=to_be_excluded)
    elif score_col == "cosine":
        toplist = et.get_distance_toplist(cosine_distances, [key_word], [snapshot_id], top_k=top_k, excluded_words=to_be_excluded)
    else:
        # co-occurrence scores: top_k is not passed here; the cut-off is applied by et.ndcg below
        toplist = et.get_toplist(pair_occs_df, [key_word], [snapshot_id], score_col=score_col, excluded_words=to_be_excluded)
    pred_words = list(toplist["word_2"])
    if verbose:
        print(pred_words)
        print(relevant_words)
    ndcg_score = et.ndcg(relevant_words, pred_words, k=top_k)
    return (snapshot_id, rel_rec["date"], time_id, score_col, key_word, ndcg_score)

import multiprocessing, functools

def get_ndcg_single_thread(top_k, row, rel_cols, general_words, score_col):
    """Compute the NDCG tuple of one record for one score type.

    score_col comes last so that the other arguments can be bound with
    functools.partial and the function mapped over a list of score types.
    """
    return get_ndcg_for_relevant_record(
        rel_rec=row,
        rel_cols=rel_cols,
        score_col=score_col,
        top_k=top_k,
        general_words=general_words,
    )

def get_ndcg_from_threads(top_k, rel_cols, relevant_df, time_ids, score_cols, general_words, n_threads=1):
    """Compute NDCG for every selected relevance record and every score type.

    Only the rows of relevant_df whose "time" is in time_ids are evaluated.
    With n_threads == 1 everything runs in the current process. Otherwise the
    score types of each row are distributed over a multiprocessing.Pool with
    n_threads worker processes. The pool is created once and reused for all
    rows, and it is closed and joined even if a worker raises.

    Returns a DataFrame with the columns
    snapshot_id, date, time, score_id, key_word, ndcg.
    """
    print(len(relevant_df))
    filtered_relevant_df = relevant_df[relevant_df["time"].isin(time_ids)]
    print(len(filtered_relevant_df))
    rows = [row for _, row in filtered_relevant_df.iterrows()]
    ndcg_info_list = []
    if n_threads > 1:
        print("Calculating NDCG on %i threads" % n_threads)
    if n_threads == 1 or not rows:
        # serial path (also taken when there is nothing to evaluate, so no pool is spawned)
        for row in rows:
            for score_col in score_cols:
                ndcg_info_list.append(get_ndcg_single_thread(top_k, row, rel_cols, general_words, score_col))
    else:
        # one pool for the whole run instead of a fresh pool per row
        pool = multiprocessing.Pool(processes=n_threads)
        try:
            for row in rows:
                f_partial = functools.partial(get_ndcg_single_thread, top_k, row, rel_cols, general_words)
                ndcg_info_list += pool.map(f_partial, score_cols)
        finally:
            pool.close()
            pool.join()
    ndcg_df = pd.DataFrame(ndcg_info_list, columns=["snapshot_id","date","time","score_id","key_word","ndcg"])
    return ndcg_df

In [None]:
def get_predicted_words(score_col, key_word, snapshot_id, top_k, to_be_excluded):
    """Return the "word_2" entries of the toplist for key_word in one snapshot.

    score_col selects the source: "word_2_vec" uses the Word2Vec models,
    "jaccard" and "cosine" use the corresponding distance model, and any other
    value is used as a score column of pair_occs_df. top_k is not passed on in
    that last case.
    """
    if score_col == "word_2_vec":
        toplist = et.get_w2v_toplist(w2v_models, [key_word], [snapshot_id], top_k=top_k, excluded_words=to_be_excluded)
    elif score_col in ("jaccard", "cosine"):
        # pick the distance model only after the branch is known
        distances = jaccard_distances if score_col == "jaccard" else cosine_distances
        toplist = et.get_distance_toplist(distances, [key_word], [snapshot_id], top_k=top_k, excluded_words=to_be_excluded)
    else:
        toplist = et.get_toplist(pair_occs_df, [key_word], [snapshot_id], score_col=score_col, excluded_words=to_be_excluded)
    return list(toplist["word_2"])

## Setting general words

finals_df

#score = "rel_count_c2"
score = "norm_c2"
et.get_toplist(pair_occs_df, ["timea"], ["2017-06-08T07:00"], score_col=score).head(20)[["word_1","word_2",score]]

score = "norm_c5"
et.get_toplist(pair_occs_df, ["rafaelnadal"], ["2017-06-11T13:00"], score_col=score).head(20)[["word_1","word_2",score]]

et.get_distance_toplist(cosine_distances, ["wawrinka"], ["2017-06-11T13:00"], top_k=20).head(20)[["word_1","word_2","distance"]]

# TODO: Is "game" really a keyword?

In [None]:
general_words = {
    "game":MATCH_INFO_RELEVANCE,
    "play":MATCH_INFO_RELEVANCE, 
    "match":MATCH_INFO_RELEVANCE,
}
if RELEVANCE_SUBSET == "discriminative":
    general_words.update({
        "rg17":COMMON_WORD_RELEVANCE,
        "rg2017":COMMON_WORD_RELEVANCE,
        "rolandgarros":COMMON_WORD_RELEVANCE,
        "roland":COMMON_WORD_RELEVANCE,
        "garros":COMMON_WORD_RELEVANCE,
        "rolandgarros2017":COMMON_WORD_RELEVANCE,
        "frenchopen":COMMON_WORD_RELEVANCE,
        "french":COMMON_WORD_RELEVANCE,
        "open":COMMON_WORD_RELEVANCE,
        "clay":COMMON_WORD_RELEVANCE,
        "slam":COMMON_WORD_RELEVANCE,
        "set":COMMON_WORD_RELEVANCE,
        "round":COMMON_WORD_RELEVANCE      
    })
print(general_words)

## Setting score types

In [None]:
score_cols = ["word_2_vec", "jaccard", "cosine"]
#score_cols += ["rel_count_c%i_plus_ray" % i for i in [0,1,2,5,10]]
#score_cols += ["norm_c%i_plus_ray" % i for i in [0,1,2,5,10]]
score_cols += ["rel_count_c%i" % i for i in score_vals]
score_cols += ["norm_c%i" % i for i in score_vals]
print(score_cols)

## Setting time of days

In [None]:
#time_ids = ["%.2i:00" % t for t in TIME_HOUR_VALS]
time_ids = ["%.2i:00" % t for t in [4,7,10,13,16,19]]
time_ids

## Calculate NDCG in parallel

%%time
ndcg_df = get_ndcg_from_threads(20, ["key_relevant_words","opp_relevant_words","other_player_relevant"], relevant_df, time_ids, score_cols, general_words, n_threads=10)#len(time_ids))

In [None]:
%%time
ndcg_df = et.get_ndcg_from_threads(20, ["key_relevant_words","opp_relevant_words","other_player_relevant"], relevant_df, get_predicted_words, time_ids, score_cols, general_words, n_threads=8)

In [None]:
len(ndcg_df)

In [None]:
ndcg_df.head()

In [None]:
ndcg_df.to_csv("%s/finals_eval_ndcg.csv" % experiment_dir, index=False)

In [None]:
# Reload the saved NDCG results under the name the following cells use (ndcg_df),
# so the analysis below can start from the cached file. The previous target name,
# ndcg_dfs, was not read by any later cell.
ndcg_df = pd.read_csv("%s/finals_eval_ndcg.csv" % experiment_dir)

# Mean NDCG performance for score types

In [None]:
ndcg_df.groupby(by="score_id")["ndcg"].mean().sort_values(ascending=False)

In [None]:
ndcg_for_plots = ndcg_df[ndcg_df["score_id"].isin(["word_2_vec","cosine","jaccard","rel_count_c5","norm_c5"])]

In [None]:
paper_rc = {'lines.linewidth': 5,'lines.markersize': 20}              
sns.set_context("paper", rc = paper_rc, font_scale = 4.25)
#sns.set_style("whitegrid")
#sns.set(font="DejaVu Sans")

## i.) Compare co-occurrence scores for snapshots

g = sns.factorplot(data=ndcg_for_plots, x="snapshot_id", y="ndcg", hue="score_id", size=10, aspect=3)
g.set_xticklabels(rotation=90)

## ii.) Compare co-occurrence scores for date

In [None]:
g = sns.factorplot(data=ndcg_for_plots, x="date", y="ndcg", hue="score_id", size=10, aspect=3)

## iii.)  Compare co-occurrence scores for time of day

In [None]:
sns.factorplot(data=ndcg_for_plots, x="time", y="ndcg", hue="score_id", size=10, aspect=3)

## iv.) Differences between players' keywords

In [None]:
def show_player_perf(key_words, score_id="rel_count_c5", dates=("2017-06-08","2017-06-09","2017-06-10","2017-06-11")):
    """Plot NDCG per snapshot for the given key words, one hue per key word.

    Parameters:
        key_words: key words (name parts or accounts) to compare.
        score_id: score type whose NDCG values are plotted; the default keeps
            the previously hard-coded "rel_count_c5".
        dates: dates to include; the default keeps the previously hard-coded
            four days.

    The rows are taken from ndcg_for_plots, so score_id has to be one of the
    score types selected into that frame.
    """
    keep = (
        ndcg_for_plots["key_word"].isin(key_words)
        & (ndcg_for_plots["score_id"] == score_id)
        & ndcg_for_plots["date"].isin(list(dates))
    )
    score_filtered = ndcg_for_plots[keep]
    # NOTE(review): newer seaborn releases rename factorplot/size to catplot/height;
    # kept as-is to match the seaborn version the rest of this notebook uses.
    g = sns.factorplot(data=score_filtered, x="snapshot_id", y="ndcg", hue="key_word", size=10, aspect=3)
    g.set_xticklabels(rotation=90)

In [None]:
last_names = ["nadal","wawrinka","ostapenko","halep","murray","djokovic","cilic","thiem","pliskova","bacsinszky"]
first_names = ["rafael","stan","jelena","simona","andy","novak","marin","dominic","karolina","timea"]
account_names = [n.lower() for n in ["@RafaelNadal","@stanwawrinka","@OstapenkoFC","@Simona_Halep","@KaPliskova","@andy_murray","@DjokerNole", "@cilic_marin", "@ThiemDomi", "@KaPliskova", "@TimeaOfficial"]]

In [None]:
show_player_perf(last_names)
show_player_perf(first_names)
show_player_perf(account_names)