In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from stats.burstACS import ExtractBurstData # custom first passage time extraction function

# File locations used by this notebook:
#   inputFile  - CSV that is read into the data frame further below
#   outputFile - CSV that receives the table of K-S test outcomes
inputFile = "data/team-results.csv"
outputFile = "data/team-passage-times-ks.csv"

# custom Kolmogorov-Smirnov test function
def twoSidedKSTest(series1,series2,bins=None,p=0.05):
    """Two-sample Kolmogorov-Smirnov check based on sampled empirical CDFs.

    Parameters
    ----------
    series1, series2 : array-like
        The two samples to compare. Both must be non-empty, because the
        CDFs and the critical value divide by the sample sizes.
    bins : array-like, optional
        Thresholds at which both empirical CDFs are sampled. Each CDF value
        is the fraction of the sample lying strictly below the threshold.
        When omitted, the pooled distinct values of both samples are used;
        the empirical CDFs only change at those values, so the largest
        difference found there is the exact K-S statistic.
    p : float
        Significance level of the test.

    Returns
    -------
    bool
        True if the null hypothesis that both samples come from the same
        distribution cannot be rejected at level ``p``, False otherwise.
    """
    # accept lists / pandas objects as well as numpy arrays
    sample1=np.asarray(series1)
    sample2=np.asarray(series2)
    if bins is None:
        # no thresholds requested: sample the CDFs at every observed value
        bins=np.union1d(sample1,sample2)
    cdf1=np.array([np.sum(sample1<l) for l in bins])/len(sample1)
    cdf2=np.array([np.sum(sample2<l) for l in bins])/len(sample2)
    # K-S statistic: largest vertical distance between the two CDFs
    diff=np.max(np.abs(cdf1-cdf2))
    # large-sample critical value c(p)*sqrt((n+m)/(n*m))
    cAlpha=np.sqrt(-0.5*np.log(p/2))
    return diff<cAlpha*np.sqrt((len(sample1)+len(sample2))/(len(sample1)*len(sample2)))

Read the original data.

In [2]:
# Load the result series from the configured input CSV and preview the
# first rows (head() defaults to five); the preview shows one column per
# team with entries of 1 / -1.
df = pd.read_csv(inputFile)
df.head(5)

Unnamed: 0,ATL,BKN,BOS,CHA,CHI,CLE,DAL,DEN,DET,GSW,...,OKC,ORL,PHI,PHX,POR,SAC,SAS,TOR,UTA,WAS
0,-1,-1,-1,-1,1,-1,1,-1,-1,-1,...,-1,1,1,-1,-1,1,-1,1,1,-1
1,1,1,-1,1,1,-1,1,-1,-1,-1,...,1,-1,-1,-1,1,1,1,-1,-1,1
2,-1,-1,1,1,1,-1,1,-1,-1,1,...,1,1,-1,1,-1,1,1,-1,1,-1
3,1,-1,-1,-1,1,-1,1,-1,1,-1,...,1,1,1,-1,-1,1,1,-1,1,1
4,1,1,-1,-1,1,-1,-1,-1,1,1,...,-1,1,-1,1,1,1,1,-1,1,-1


Compare streak distributions using the K-S test. The original data is compared against:
* itself (win streaks against loss streaks),
* fully shuffled series,
* random series with 50-50 chance of win/loss,
* random series with win/loss probability adjusted to the original record.

In [3]:
# Seed for the surrogate (random and shuffled) series, so that a fresh
# "Restart Kernel -> Run All" reproduces the same table. Change the value
# to draw a different set of surrogates.
randomSeed=12345
# significance level used for every K-S comparison in this cell
ksLevel=0.05
# teams that are left out of the comparison
skippedTeams=("CHA","NOP")
# we would like to examine streaks up to 20 wins/losses long
# NOTE(review): np.arange(0,20) ends at 19 - confirm this covers the intended range
streakBins=np.arange(0,20)

def _extractStreaks(series):
    """Run ExtractBurstData on `series` with threshold 0 and sample period 1.

    Returns the pair produced by ExtractBurstData with returnBurst=True and
    returnInterBurst=True; this notebook unpacks it as
    (win streaks, lose streaks).
    """
    return ExtractBurstData(series,0,
                            samplePeriod=1,returnBurst=True,returnInterBurst=True)

def _compareStreaks(data,bins,p,seed,skip=()):
    """Run all K-S streak comparisons for every team column of `data`.

    For each column not listed in `skip`, the team's win/lose streaks are
    compared against each other, against a shuffled copy of the series,
    against one shared 50-50 random series, and against a random series
    whose win probability is estimated from the team's record.

    Returns a list of rows
    [team, winProb, WL, WshuffW, LshuffL, W50-50, L50-50, WrandP, LrandP].
    """
    # local generator: reproducible without touching numpy's global RNG state
    rng=np.random.default_rng(seed)
    nGames=data.shape[0]
    # produce a single random 50-50 win/loss series, shared by all teams
    rwinStreaks,rloseStreaks=_extractStreaks(rng.choice([1,-1],size=nGames))
    rows=[]
    for team in data.columns:
        if team in skip:
            continue
        # examine the original series
        winSeries=np.array(data[team].values.copy())
        winStreaks,loseStreaks=_extractStreaks(winSeries)
        wlSame=twoSidedKSTest(winStreaks,loseStreaks,bins=bins,p=p)
        # shuffled series (permutation returns a copy, the original stays intact)
        swinStreaks,sloseStreaks=_extractStreaks(rng.permutation(winSeries))
        wswSame=twoSidedKSTest(winStreaks,swinStreaks,bins=bins,p=p)
        lslSame=twoSidedKSTest(loseStreaks,sloseStreaks,bins=bins,p=p)
        # random series 50-50
        wrwSame=twoSidedKSTest(winStreaks,rwinStreaks,bins=bins,p=p)
        lrlSame=twoSidedKSTest(loseStreaks,rloseStreaks,bins=bins,p=p)
        # random series with p estimated from the data; for a series made of
        # 1 / -1 entries, (n + sum) / (2 n) is the share of wins
        winProb=(len(winSeries)+np.sum(winSeries))/(2*len(winSeries))
        srandomSeries=rng.choice([1,-1],size=nGames,p=[winProb,1-winProb])
        srwinStreaks,srloseStreaks=_extractStreaks(srandomSeries)
        wsrwSame=twoSidedKSTest(winStreaks,srwinStreaks,bins=bins,p=p)
        lsrlSame=twoSidedKSTest(loseStreaks,srloseStreaks,bins=bins,p=p)
        # -------------------
        rows.append([team,winProb,wlSame,wswSame,lslSame,wrwSame,lrlSame,wsrwSame,lsrlSame])
    return rows

# all loop temporaries live inside _compareStreaks, so nothing has to be deleted afterwards
rez=_compareStreaks(df,bins=streakBins,p=ksLevel,seed=randomSeed,skip=skippedTeams)

ddf=pd.DataFrame(rez,columns=["team","winProb","WL","WshuffW","LshuffL","Wrandom50-50W","Lrandom50-50L","WrandompW","LrandompL"])
ddf

Unnamed: 0,team,winProb,WL,WshuffW,LshuffL,Wrandom50-50W,Lrandom50-50L,WrandompW,LrandompL
0,ATL,0.478237,True,True,True,True,True,True,True
1,BKN,0.423286,False,True,True,True,True,True,True
2,BOS,0.511425,True,True,True,True,True,True,True
3,CHI,0.510881,True,True,True,True,True,True,True
4,CLE,0.502176,True,True,True,True,True,True,True
5,DAL,0.557127,False,True,True,True,True,True,True
6,DEN,0.472252,True,True,True,True,True,True,True
7,DET,0.519587,True,True,True,True,True,True,True
8,GSW,0.474973,True,True,True,True,True,True,True
9,HOU,0.572361,False,True,True,False,False,True,True


In [4]:
# Persist the K-S result table; floats are written with two decimals and
# the row index is kept as the first column (pandas' default, spelled out).
ddf.to_csv(outputFile, float_format="%.2f", index=True)