In [3]:
import chess.pgn
import pandas as pd
import re

In [22]:
import warnings
warnings.filterwarnings("ignore")

In [9]:
nrows_to_read = 20000000 ## кол-во строчек, которое мы хотим прочитать, так как файл весит 20 гигабайт
reader = pd.read_csv("lichess_db_standard_rated_2017-04.pgn",  delimiter=';', index_col=False, nrows = nrows_to_read)
df = reader
df.columns = ["coma"] ## столбец "coma" - все строки из датасета
f_df = df[df["coma"].str.contains('eval')] ## в датасете f_df хранятся только ходы партий
df = f_df ## для дальнейшего удобства создадим копию

In [23]:
import statistics

eval_regex = r'\[%eval\s+([\d.-]+)\]'

def calculate_mean_eval(row): ## функция подсчёта среднего %eval
    eval_matches = re.findall(eval_regex, row)
    eval_values = [float(eval_match) for eval_match in eval_matches]
    if len(eval_values) != 0:
        return sum(eval_values) / len(eval_values)
    else:
        return 0


df['mean_eval'] = df['coma'].apply(calculate_mean_eval)

def calculate_std_eval(row): ## функция подсчёта стандартного отклонения %eval
    eval_matches = re.findall(eval_regex, row)
    eval_values = [float(eval_match) for eval_match in eval_matches]
    if len(eval_values) != 0:
        return statistics.stdev(eval_values)
    else:
        return 0

df['std_eval'] = df['coma'].apply(calculate_std_eval)

df['Move Count'] = df["coma"].str.count(r'%eval') ## подсчёт количества ходов

In [24]:
## работа с %clk

time_pattern = r'\[%clk\s(.*?)\]'
df['Time'] = df['coma'].apply(lambda x: re.findall(time_pattern, x))

# Преобразование времени в числовой тип данных (в секундах)
df['Time'] = df['Time'].apply(lambda x: [sum(int(t) * 60**i for i, t in enumerate(reversed(time.split(':')))) for time in x])

# Вычисление среднего арифметического времени на ход
df['Avg Time'] = df['Time'].apply(lambda x: sum(x) / len(x) if len(x) != 0 else 0)

In [26]:
for index in df.index:
    rating_str = reader.loc[index-8][0]
    rating_only_re = re.sub(r"\D", "", rating_str)
    if len(rating_only_re) != 0:
        rating_int = int(rating_only_re)
    else:
        rating_int = None
    df.at[index, "Rating"] = rating_int

In [31]:
df["Time Control"] = df["Time"].str[0]
df["Time Spent Avg"] = df["Avg Time"]/df["Time Control"]
df.head()

Unnamed: 0,coma,mean_eval,std_eval,Move Count,Time,Avg Time,Rating,Time Control,Time Spent Avg
846,1. d4 { [%eval 0.27] [%clk 0:05:00] } 1... Nf6...,0.703636,2.02955,55,"[300, 300, 299, 299, 298, 299, 296, 299, 295, ...",250.8,2207.0,300.0,0.836
878,1. d4 { [%eval 0.12] [%clk 0:05:00] } 1... d5 ...,-4.186364,10.198569,67,"[300, 300, 298, 298, 297, 296, 293, 294, 291, ...",142.132353,1520.0,300.0,0.473775
894,1. e4 { [%eval 0.29] [%clk 0:10:00] } 1... e5 ...,-8.261429,16.274437,86,"[600, 600, 596, 599, 590, 566, 560, 556, 556, ...",373.581395,1403.0,600.0,0.622636
910,1. d4 { [%eval 0.22] [%clk 0:08:00] } 1... Nf6...,1.091224,1.605751,50,"[480, 480, 478, 477, 476, 475, 476, 473, 476, ...",398.27451,2275.0,480.0,0.829739
942,1. d4 { [%eval 0.14] [%clk 0:10:00] } 1... d5 ...,-4.571818,14.741802,100,"[600, 600, 598, 597, 596, 594, 593, 588, 590, ...",357.89,1811.0,600.0,0.596483


In [33]:
## Классифицируем рейтинги:

def replace_numbers(value):
    if value < 2000:
        return 0
    else:
        return 1
    
df["classification"] = df["Rating"].apply(replace_numbers)

In [43]:
df.dropna(inplace=True)
df.isna().sum()

coma              0
mean_eval         0
std_eval          0
Move Count        0
Time              0
Avg Time          0
Rating            0
Time Control      0
Time Spent Avg    0
classification    0
dtype: int64

In [55]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

X = df[['mean_eval', "std_eval", "Move Count", "Time Spent Avg"]]
y = df['classification']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 35)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Оценка качества модели
accuracy = accuracy_score(y_test, y_pred)

# Предсказанные вероятности принадлежности к положительному классу
y_pred_probs = model.predict_proba(X_test)[:, 1]

# Истинные метки классов
y_true = y_test 

# Вычисление ROC-AUC
roc_auc = roc_auc_score(y_true, y_pred_probs)

print("ROC-AUC Score:", roc_auc)

print("Accuracy:", accuracy)


ROC-AUC Score: 0.6329503725331015
Accuracy: 0.9134664401019541
