In [1]:
from dotenv import find_dotenv, load_dotenv
import os
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from tslearn.clustering import TimeSeriesKMeans
import time
from sklearn import metrics
from tslearn.clustering import TimeSeriesKMeans
from tslearn.clustering import silhouette_score
import numpy as np
from warnings import filterwarnings
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from tslearn.utils import to_time_series_dataset
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from src.data.fics_dataset import  FICSDatasetBuilder

filterwarnings("ignore")



In [2]:
# Load the project's .env file (if one is found) into the process environment
# so the lookups below also work on a fresh kernel. By default load_dotenv
# does not override variables that are already set.
load_dotenv(find_dotenv())

# Project configuration read from the environment.
# NOTE: int(os.getenv(...)) raises TypeError when the variable is missing.
PROJECT_PATH=os.getenv('PROJECT_PATH')
NUMBER_OF_GAMES_MINIMUM=int(os.getenv('NUMBER_OF_GAMES_MINIMUM'))
FICS_MIN_YEAR=int(os.getenv('FICS_MIN_YEAR'))
FICS_MAX_YEAR=int(os.getenv('FICS_MAX_YEAR'))
BUILD_FICS_GAME_TYPE=os.getenv('BUILD_FICS_GAME_TYPE')


In [3]:
def get_samples(players_dict,offset,num_samples):
    """Collect a fixed-length window of "Elo" values for every player.

    For each entry of `players_dict` the rows whose index lies in
    [offset, offset + num_samples) are selected and their "Elo" column is
    turned into a list. Players whose window does not contain exactly
    `num_samples` rows are left out. Any error raised while slicing a
    player's data is printed and that player is skipped.

    Returns a dict mapping player -> list of Elo values.
    """
    window_end = offset + num_samples
    collected = {}
    for name, history in players_dict.items():
        try:
            in_window = (history.index >= offset) & (history.index < window_end)
            elos = list(history[in_window]["Elo"])
        except Exception as err:
            print(err)
            continue

        # Keep only complete windows.
        if len(elos) == num_samples:
            collected[name] = elos

    return collected

def get_samples_ts(players_dict,offset,num_samples):
    """Collect a fixed-length window of the "Timestamp" column for every player.

    For each entry of `players_dict` the rows whose index lies in
    [offset, offset + num_samples) are selected and the resulting "Timestamp"
    slice is stored as-is (not converted to a list). Players whose window does
    not contain exactly `num_samples` rows are left out. Any error raised
    while slicing is printed and that player is skipped.

    The resulting dict is printed before it is returned.
    """
    window_end = offset + num_samples
    samples = {}
    for name, history in players_dict.items():
        try:
            in_window = (history.index >= offset) & (history.index < window_end)
            timestamps = history[in_window]["Timestamp"]
        except Exception as err:
            print(err)
            continue

        # Keep only complete windows.
        if len(timestamps) == num_samples:
            samples[name] = timestamps

    print(samples)
    return samples

def dp(dist_mat):
    """
    Dynamic-programming search for the cheapest path through `dist_mat`.

    A path's cost is the sum of the matrix entries it visits. Background on
    the algorithm:

    - http://en.wikipedia.org/wiki/Dynamic_time_warping
    - https://www.ee.columbia.edu/~dpwe/resources/matlab/dtw/dp.m

    Returns a tuple `(path, cost_mat)`: `path` is the list of `(row, col)`
    index pairs from the top-left to the bottom-right cell, and `cost_mat` is
    the accumulated-cost matrix with the same shape as `dist_mat`.
    """

    n_rows, n_cols = dist_mat.shape

    # Accumulated cost, padded with one extra leading row/column. The padding
    # is infinite everywhere except the origin so every path starts at (0, 0).
    cost_mat = np.zeros((n_rows + 1, n_cols + 1))
    cost_mat[1:, 0] = np.inf
    cost_mat[0, 1:] = np.inf

    # traceback_mat[r, c] records which predecessor was cheapest:
    # 0 = match (diagonal), 1 = insertion (from above), 2 = deletion (from left).
    traceback_mat = np.zeros((n_rows, n_cols))
    for row in range(n_rows):
        for col in range(n_cols):
            candidates = [
                cost_mat[row, col],
                cost_mat[row, col + 1],
                cost_mat[row + 1, col]]
            best = np.argmin(candidates)
            cost_mat[row + 1, col + 1] = dist_mat[row, col] + candidates[best]
            traceback_mat[row, col] = best

    # Walk back from the bottom-right corner; each traceback code maps to the
    # (row, col) decrement that leads to the chosen predecessor.
    backward_steps = ((1, 1), (1, 0), (0, 1))
    row, col = n_rows - 1, n_cols - 1
    path = [(row, col)]
    while row > 0 or col > 0:
        d_row, d_col = backward_steps[int(traceback_mat[row, col])]
        row -= d_row
        col -= d_col
        path.append((row, col))

    # Drop the infinite padding before handing the cost matrix back.
    path.reverse()
    return (path, cost_mat[1:, 1:])


def kmeans_silhoutte(X_train,metric,max_k=10):
    """Fit TimeSeriesKMeans for k = 2..max_k and plot the silhouette score per k.

    Parameters
    ----------
    X_train : time-series dataset passed to `TimeSeriesKMeans.fit_predict`
        and to tslearn's `silhouette_score`.
    metric : metric name forwarded to both the model and the score.
    max_k : largest number of clusters to try (inclusive).

    Returns
    -------
    list of silhouette scores, one per k in 2..max_k.

    Side effects: prints the time taken for every k and saves the plot under
    `{PROJECT_PATH}/reports/figures/fics/`.
    """
    k_values = range(2, max_k + 1)
    scores = []
    for k in k_values:
        start = time.time()
        model = TimeSeriesKMeans(n_clusters=k, metric=metric, n_jobs=-1, random_state=0)
        y_pred = model.fit_predict(X_train)
        scores.append(silhouette_score(X_train, y_pred, metric=metric))
        duration = time.time() - start
        print(f"K = {k}.", f"Took {duration:.2f} seconds to calculate.")

    plt.figure(figsize=(20,10))
    plt.grid()
    # The x axis must have one point per fitted k. A fixed range(2, 10) has a
    # different length than `scores` unless max_k == 9, which makes plot() fail.
    plt.plot(k_values, scores, marker='o', linestyle='--')
    plt.xlabel('number of clusters')
    plt.ylabel('silhoutte')
    plt.title("Silhouette analysis For Optimal K and Optimal Model")
    plt.savefig(f"{PROJECT_PATH}/reports/figures/fics/kmeans-Silhouette-plot-{metric}-{max_k}.png")

    return scores

def kmeans_calinski_harabasz(X_train,metric,max_k=10):
    """Fit TimeSeriesKMeans for k = 2..max_k and plot the Calinski-Harabasz score per k.

    Parameters
    ----------
    X_train : time-series dataset passed to `TimeSeriesKMeans.fit_predict`.
    metric : metric name forwarded to the clustering model. The
        Calinski-Harabasz score itself is computed by scikit-learn on the
        flattened series and does not take a metric argument.
    max_k : largest number of clusters to try (inclusive).

    Returns
    -------
    list of Calinski-Harabasz scores, one per k in 2..max_k.

    Side effects: prints the time taken for every k and saves the plot under
    `{PROJECT_PATH}/reports/figures/fics/`.
    """
    k_values = range(2, max_k + 1)

    # scikit-learn's scorer only accepts 2-D input, so each series is flattened
    # to one row. This is a no-op when X_train is already 2-D.
    X_flat = np.asarray(X_train).reshape(len(X_train), -1)

    scores = []
    for k in k_values:
        start = time.time()
        model = TimeSeriesKMeans(n_clusters=k, metric=metric, n_jobs=-1, random_state=0)
        y_pred = model.fit_predict(X_train)
        scores.append(metrics.calinski_harabasz_score(X_flat, y_pred))
        duration = time.time() - start
        print(f"K = {k}.", f"Took {duration:.2f} seconds to calculate.")

    plt.figure(figsize=(20,10))
    plt.grid()
    # The x axis must have one point per fitted k. A fixed range(2, 10) has a
    # different length than `scores` unless max_k == 9, which makes plot() fail.
    plt.plot(k_values, scores, marker='o', linestyle='--')
    plt.xlabel('number of clusters')
    plt.ylabel('calinski harabasz')
    plt.title("calinski harabasz For Optimal K and Optimal Model")
    plt.savefig(f"{PROJECT_PATH}/reports/figures/fics/kmeans-calinski-harabasz-plot-{metric}-{max_k}.png")

    return scores



def kmeans_wcsss(X_train,metric,model_name,max_k=10):
    """Fit TimeSeriesKMeans for k = 2..max_k and plot the model inertia per k.

    For every k the fitted model's `inertia_` is printed and recorded, along
    with the time the fit took. The resulting curve is saved under
    `{PROJECT_PATH}/reports/figures/fics/` using `model_name` in the file name.

    Returns the list of inertia values, one per k in 2..max_k.
    """
    cluster_counts = range(2, max_k + 1)
    inertias = []
    for k in cluster_counts:
        started_at = time.time()
        km = TimeSeriesKMeans(n_clusters=k, metric=metric, n_jobs=-1, random_state=0)
        km.fit_predict(X_train)
        print(km.inertia_)
        inertias.append(km.inertia_)
        elapsed = time.time() - started_at
        print(f"K = {k}.", f"Took {elapsed:.2f} seconds to calculate.")

    # The figure height is tied to max_k, as in the rest of this analysis.
    plt.figure(figsize=(20, max_k + 1))
    plt.grid()
    plt.plot(cluster_counts, inertias, marker='o', linestyle='--')
    plt.xlabel('number of clusters')
    plt.ylabel('WCSSS')
    plt.title("WCSSS analysis For Optimal K and Optimal Model")
    plt.savefig(f"{PROJECT_PATH}/reports/figures/fics/kmeans-wcsss-plot-{model_name}.png")

    return inertias

def kmeans_cluster(n_clusters,algo,X_train,metric):
    """Fit a TimeSeriesKMeans model, plot each cluster and print cluster sizes.

    Parameters
    ----------
    n_clusters : number of clusters requested from the model.
    algo : label used only in the saved figure file names.
    X_train : dataset passed to `fit_predict`; it is also indexed with a
        boolean mask and read via `.shape[1]`, so it must be array-like in
        that sense (presumably a numpy array of series — confirm with caller).
    metric : metric name forwarded to the model and used in file names.

    Returns
    -------
    The cluster label predicted for every series in `X_train`.

    Side effects: one figure per cluster is saved under
    `{PROJECT_PATH}/reports/figures/fics/` and shown; the share of
    observations per cluster is printed.
    """
    model= TimeSeriesKMeans(n_clusters=n_clusters,
                        metric=metric,
                        n_jobs=-1,
                        random_state=0)
    y_pred = model.fit_predict(X_train)
    # model.to_pickle(f"{PROJECT_PATH}/models/fics/{algo}.pkl")



    # Loops over 0..(number of distinct predicted labels - 1).
    # NOTE(review): if some cluster receives no members, the distinct-label
    # count is smaller than n_clusters and these ids may not match the labels.
    for yi in range(len(np.unique(y_pred))):
      # Member series in translucent black ...
      for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)


      # ... with the cluster centre on top in red.
      plt.plot(model.cluster_centers_[yi].ravel(), "r-")
      plt.xlim(0, X_train.shape[1])
      # Fixed y range; presumably the series are standardised upstream — TODO confirm.
      plt.ylim(-5, 5)
      plt.text(0.55, 0.85,'Cluster %d' % (yi + 1), transform=plt.gca().transAxes)
      plt.title("kmeans")
      plt.tight_layout()
      # Clusters are numbered from 1 in labels and file names.
      plt.savefig(f"{PROJECT_PATH}/reports/figures/fics/{algo}-clusters-{(yi + 1)}-{n_clusters}-{metric}.png")
      plt.show();


    # Percentage of observations assigned to each cluster.
    for i in range(len(np.unique(y_pred))):
      count = (sum(y_pred == i)/len(y_pred))*100
      print(f"The {count:.2f}% of the observations fall into Cluster {i + 1}")

    return y_pred



In [4]:
# Load the interim per-player data produced earlier in the pipeline.
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load pickles that this project generated itself.
with open(f"{PROJECT_PATH}/data/fics/interim/players_dict_v2.pkl", 'rb') as f:
    players_dict=pickle.load(f)

In [5]:
# One row per player, one column per position in the 150-row window starting
# at index 0. get_samples drops players that do not have exactly 150 rows there.
df=pd.DataFrame.from_dict(get_samples(players_dict,0,150)).T

In [6]:
# Show the player x sample-position matrix of Elo values.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
osvifr,1597,1603,1594,1601,1603,1610,1601,1591,1596,1602,...,1560,1566,1559,1564,1569,1559,1566,1570,1578,1576
Anmelus,1510,1481,1469,1458,1526,1576,1528,1567,1541,1498,...,1505,1499,1495,1491,1506,1504,1500,1496,1492,1488
Tohotmos,1582,1543,1536,1535,1541,1550,1540,1532,1542,1549,...,1549,1536,1524,1515,1528,1516,1509,1517,1515,1527
denij,1501,1482,1459,1471,1476,1481,1494,1500,1506,1511,...,1497,1500,1516,1490,1491,1500,1499,1497,1516,1524
diegoandrescas,1440,1435,1444,1453,1462,1453,1450,1445,1451,1457,...,1393,1405,1411,1418,1427,1436,1444,1436,1428,1421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PaulVerhoeven,1757,1747,1754,1760,1766,1771,1778,1785,1784,1791,...,1707,1700,1694,1704,1699,1711,1717,1717,1709,1702
davefromreading,1332,1368,1303,1344,1312,1335,1304,1324,1337,1350,...,1408,1410,1413,1413,1422,1412,1408,1396,1401,1408
schlechterZug,1867,1914,1774,1816,1738,1801,1842,1891,1883,1906,...,1956,1963,1970,1977,1982,1988,1994,2000,2006,1999
replo,1046,1093,1128,1094,1117,1113,1109,1136,1162,1190,...,1251,1229,1217,1198,1212,1220,1236,1251,1255,1243


In [7]:
def fix(df,players):
    """Reshape a one-row-per-game table into one row per tracked player per game.

    Parameters
    ----------
    df : games table with columns "White", "Black", "WhiteElo", "BlackElo",
        "FICSGamesDBGameNo" and either "UTCDate"/"UTCTime" or "Date"/"Time".
    players : collection of player names to keep.

    Returns
    -------
    DataFrame with columns "Player", "Elo", "Date", "Time",
    "FICSGamesDBGameNo", "Year", "Month", "Day", "Hour", "Minute", "Second",
    sorted by "FICSGamesDBGameNo". A game between two tracked players yields
    two rows (one per side). Rows containing missing values are dropped.

    Raises
    ------
    KeyError if a required column is missing.

    Dates are expected as "YYYY.MM.DD" and times as "HH:MM:SS".
    """
    # Input files differ in how they name the timestamp columns. Whichever
    # pair is present is normalised to "Date"/"Time", because the parsing
    # below reads those names.
    if "UTCDate" in df.columns and "UTCTime" in df.columns:
        date_col, time_col = "UTCDate", "UTCTime"
    else:
        date_col, time_col = "Date", "Time"

    white_tracked = df["White"].isin(players)
    black_tracked = df["Black"].isin(players)

    def _rows_for(mask, colour):
        # Rows selected by `mask`, seen from the `colour` ("White"/"Black") side.
        elo_col = f"{colour}Elo"
        picked = df.loc[mask, [colour, elo_col, date_col, time_col, "FICSGamesDBGameNo"]]
        return picked.rename(columns={colour: "Player", elo_col: "Elo",
                                      date_col: "Date", time_col: "Time"})

    long_df = pd.concat([
        _rows_for(white_tracked & black_tracked, "White"),
        _rows_for(white_tracked & ~black_tracked, "White"),
        _rows_for(~white_tracked & black_tracked, "Black"),
        _rows_for(white_tracked & black_tracked, "Black"),
    ]).dropna()

    def _field(column, separator, position):
        # Integer value of the `position`-th `separator`-delimited field.
        return long_df[column].apply(lambda raw: int(str(raw).split(separator)[position]))

    for position, name in enumerate(("Year", "Month", "Day")):
        long_df[name] = _field("Date", ".", position)
    for position, name in enumerate(("Hour", "Minute", "Second")):
        long_df[name] = _field("Time", ":", position)

    return long_df.sort_values(by="FICSGamesDBGameNo")

###############################
# Load the per-player game exports for every player in the sample matrix.
# NOTE(review): the directory below is an absolute, machine-specific path;
# presumably it should be derived from PROJECT_PATH — confirm before changing.
extra_player={}
skipped_players=[]
for player in df.index:
    try:
        path=f"/home/ariel/Documents/bgu/chess_improments_patterns/data/fics-build/proccessed/{player}/chess-games.csv"
        df_p=pd.read_csv(path)
        extra_player[player]=df_p
    except Exception as err:
        # Best-effort load: a player without a readable export is skipped, but
        # recorded so the gap is visible instead of silent.
        skipped_players.append((player, err))

if skipped_players:
    print(f"Skipped {len(skipped_players)} player(s) whose chess-games.csv could not be read")

extra_player_df=pd.concat([extra_player[p] for p in extra_player.keys()])

# Keep only the tracked players' side of each game and persist the result.
extra_player_df_fixed=fix(extra_player_df,list(extra_player.keys()))
extra_player_df_fixed.to_csv(f"{PROJECT_PATH}/data/fics/interim/hell_v2.csv")


# Number of distinct players present in the reshaped table.
len(set(extra_player_df_fixed["Player"]))

317

In [11]:
from scipy.stats import skew
from tsfresh.feature_extraction.feature_calculators import mean_abs_change
from scipy.stats import shapiro


def get_df(df):
    """Return a copy of `df` holding column-to-column differences.

    Columns are addressed by the integer labels 0..n-1. Column `i` of the
    result is `df[i] - df[i-1]`; column 0 is `df[0] - df[0]`. The input frame
    is not modified.
    """
    deltas = df.copy()
    width = len(deltas.columns)

    for previous, current in zip(range(width - 1), range(1, width)):
        deltas[current] = df[current] - df[previous]

    # The first column has no predecessor, so it is differenced with itself.
    deltas[0] = df[0] - df[0]
    return deltas

def find_features(df,features):
    """Append summary-feature columns to the differenced version of `df`.

    Parameters
    ----------
    df : frame with one series per row; passed to `get_df` to obtain the
        column-to-column differences.
    features : list of dicts with keys "name" (output column name) and "func"
        (callable applied to one row's values, returning a scalar).

    Returns
    -------
    The frame produced by `get_df(df)` plus one column per retained feature.

    Each feature is computed on the raw rows of `df`, not on the differences.
    A feature is kept only when the Shapiro-Wilk p-value of its values across
    rows is below 0.1; that p-value is printed for every kept feature.
    """
    df_dt=get_df(df)

    # Compute each feature from the row itself. Looking rows up by index label
    # rescans the frame for every row and returns the first match whenever
    # labels repeat.
    rows = df.values
    feature_values = [[feature["func"](row) for row in rows] for feature in features]

    for feature, values in zip(features, feature_values):
        # Run the test once and reuse the result for the check and the print.
        p_value = shapiro(values).pvalue
        if p_value < 0.1:
            print(p_value)
            df_dt[feature["name"]] = values

    return df_dt





In [10]:

def thread_function(players,id,num_threads):
    """Download data for this worker's share of `players`.

    Players are split by position: the worker with identifier `id` handles
    every position `i` where `i % num_threads == id`. A player is skipped
    when its directory already exists under the processed-data folder;
    otherwise the name is printed and the download is started.
    """
    processed_root = "/home/ariel/Documents/bgu/chess_improments_patterns/data/fics-build/proccessed/"
    for position, player in enumerate(players):
        if position % num_threads != id:
            continue  # belongs to another worker
        if os.path.exists(processed_root + player):
            continue  # already downloaded
        print(player)
        FICSDatasetBuilder(base_dir=PROJECT_PATH,game_type="player",years=years).download_per_player(player)

In [None]:
import concurrent.futures
import time


# Run the download on a pool of 10 worker threads. The same number is passed
# to thread_function as the shard count so each worker gets a disjoint share.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # One task per shard id.
    tasks = [
        executor.submit(thread_function, players, shard_id, 10)
        for shard_id in range(10)
    ]

    # Report tasks as they finish; future.result() re-raises any exception
    # raised inside the worker.
    for future in concurrent.futures.as_completed(tasks):
        result = future.result()
        print("Task result:", result)


In [None]:
import concurrent.futures
import logging

# [rest of code]

if __name__ == "__main__":
    # Timestamped log lines; named log_format so the builtin format() is not shadowed.
    log_format = "%(asctime)s: %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO,
                        datefmt="%H:%M:%S")

    num_workers = 3
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        # thread_function takes (players, id, num_threads), so map needs one
        # iterable per argument. list() drains the lazy result iterator so an
        # exception raised in a worker surfaces here instead of being dropped.
        list(executor.map(thread_function,
                          [players] * num_workers,
                          range(num_workers),
                          [num_workers] * num_workers))


In [7]:
from  tsfresh.feature_extraction.feature_calculators import kurtosis
# Candidate per-series summary features. Each entry pairs the output column
# name with a callable that receives one row of values and returns a scalar.
# The lambdas look up skew/kurtosis only when called, so those names just
# need to be imported before find_features runs.
all_features=[
    {"name":"mean","func":lambda c:np.mean(c)},
    {"name":"skew","func":lambda c:skew(c)},
    {"name":"min","func":lambda c:min(c)},
    {"name":"max","func":lambda c:max(c)},
    {"name":"var","func":lambda c:np.var(c)},
    {"name":"kurtosis","func":lambda c:kurtosis(c)},
    ]


# Displays the differenced frame plus every feature that find_features keeps;
# the p-values of the kept features are printed first.
find_features(df,all_features)


3.653154335190909e-35
0.0
3.662291676399284e-31
5.918049268088846e-34
0.0
0.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,146,147,148,149,mean,skew,min,max,var,kurtosis
osvifr,0,6,-9,7,2,7,-9,-10,5,6,...,7,4,8,-2,1585.913333,0.220103,1551,1628,318.945822,-0.840536
Anmelus,0,-29,-12,-11,68,50,-48,39,-26,-43,...,-4,-4,-4,-4,1522.046667,0.382622,1452,1630,1041.311156,0.043917
Tohotmos,0,-39,-7,-1,6,9,-10,-8,10,7,...,-7,8,-2,12,1537.946667,-0.226261,1471,1589,674.157156,-0.513487
denij,0,-19,-23,12,5,5,13,6,6,5,...,-1,-2,19,8,1496.960000,0.053727,1434,1579,872.198400,-0.267987
diegoandrescas,0,-5,9,9,9,-9,-3,-5,6,6,...,8,-8,-8,-7,1396.766667,0.573076,1350,1462,647.725556,-0.321189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PaulVerhoeven,0,-10,7,6,6,5,7,7,-1,7,...,6,0,-8,-7,1761.886667,0.560898,1694,1854,1524.633822,-0.621038
davefromreading,0,36,-65,41,-32,23,-31,20,13,13,...,-4,-12,5,7,1371.993333,-0.405881,1288,1440,1347.633289,-0.882601
schlechterZug,0,47,-140,42,-78,63,41,49,-8,23,...,6,6,6,-7,1933.193333,-0.869167,1738,2012,1999.982622,2.590800
replo,0,47,35,-34,23,-4,-4,27,26,28,...,16,15,4,-12,1234.400000,-1.795181,1046,1303,1693.786667,4.573385
