In [1]:
import pyspark
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from matplotlib.pyplot import figure
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

In [2]:
os.chdir('../Project/finding-elo/')
sc = pyspark.SparkContext()
spark = SparkSession(sc)


In [3]:
games = sc.textFile('data_uci.pgn')
games = games.map(lambda l:l.rstrip())

In [4]:
def myfunc(a):
    if "1/2" in a:
        return 0
    if "1-0" in a:
        return 1
    else:
        return -1

def eval_moves(moves):
    score = 0
    for i in range(len(moves)-1):
        score = score + moves[i+1] - moves[i]
    return score
        

In [5]:
result = games.filter(lambda l:"Result" in l).map(lambda l: myfunc(l)).zipWithIndex().filter(lambda vi: vi[1] < 25000).map(lambda l:(l[1],l[0]))
whiteelo = games.filter(lambda l: "WhiteElo" in l).map(lambda l:int(l.split()[1][1:5])).zipWithIndex().map(lambda l:(l[1],l[0]))
blackelo = games.filter(lambda l: "BlackElo" in l).map(lambda l:int(l.split()[1][1:5])).zipWithIndex().map(lambda l:(l[1],l[0]))
stockfish = sc.textFile('stockfish.csv').filter(lambda l: 'Event' not in l).map(lambda l:[0]+[int(x) for x in l.split(',')[1].split(" ") if x!=''])\
.map(lambda l: eval_moves(l)).zipWithIndex().filter(lambda vi: vi[1] < 25000).map(lambda l:(l[1],l[0]))



In [6]:

temp = whiteelo.join(blackelo).join(result).join(stockfish).map(lambda l:[l[1][0][0][0],l[1][0][0][1],l[1][0][0][0]-l[1][0][0][1],l[1][0][1],l[1][1]])
df = temp.toDF(['white_rating','black_rating','rating_diff','result','score'])
df.show()

+------------+------------+-----------+------+-----+
|white_rating|black_rating|rating_diff|result|score|
+------------+------------+-----------+------+-----+
|        2354|        2411|        -57|     0|   54|
|        2684|        2403|        281|     0| -185|
|        2101|        1767|        334|     1| 5193|
|        2269|        2469|       -200|    -1|  368|
|        1279|        1813|       -534|    -1| -829|
|        2600|        2327|        273|     1| 1750|
|        1786|        1622|        164|     1|  370|
|        2361|        2362|         -1|     1|  566|
|        2546|        2589|        -43|    -1| -394|
|        2084|        2218|       -134|     0|  108|
|        2512|        2536|        -24|     0|   49|
|        2093|        1799|        294|     1|  463|
|        2333|        2320|         13|     0|   11|
|        2265|        2247|         18|     0|    0|
|        2266|        2462|       -196|     0|   62|
|        2157|        2481|       -324|    -1|

In [7]:
df_temp = df.filter(df.result==-1)
vectorAssembler = VectorAssembler(inputCols = ['result','score'], outputCol = "features")
df_temp = vectorAssembler.transform(df)
white_df = df_temp.select(['features', 'white_rating'])
black_df = df_temp.select(['features','black_rating'])

splits = white_df.randomSplit([0.8, 0.2])
train_df = splits[0]
test_df = splits[1]
lr = LinearRegression(featuresCol = 'features', labelCol='white_rating', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model_white_m1 = lr.fit(train_df)
lr_predictions = lr_model_white_m1.transform(test_df)
lr_predictions.select("prediction","white_rating","features").show(5)

test_result = lr_model_white_m1.evaluate(test_df)
print("Mean Absolute Error on test data = %g" % test_result.meanAbsoluteError)

splits = black_df.randomSplit([0.8, 0.2])
train_df = splits[0]
test_df = splits[1]
lr = LinearRegression(featuresCol = 'features', labelCol='black_rating', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model_black_m1 = lr.fit(train_df)
lr_predictions = lr_model_black_m1.transform(test_df)
lr_predictions.select("prediction","black_rating","features").show(5)

test_result = lr_model_black_m1.evaluate(test_df)
print("Mean Absolute Error on test data = %g" % test_result.meanAbsoluteError)

+------------------+------------+---------+
|        prediction|white_rating| features|
+------------------+------------+---------+
|2241.4826983982853|        1610|(2,[],[])|
|2241.4826983982853|        1663|(2,[],[])|
|2241.4826983982853|        1768|(2,[],[])|
|2241.4826983982853|        1786|(2,[],[])|
|2241.4826983982853|        1803|(2,[],[])|
+------------------+------------+---------+
only showing top 5 rows

Mean Absolute Error on test data = 208.093
+------------------+------------+---------+
|        prediction|black_rating| features|
+------------------+------------+---------+
|2248.6165070516213|        1707|(2,[],[])|
|2248.6165070516213|        1729|(2,[],[])|
|2248.6165070516213|        1933|(2,[],[])|
|2248.6165070516213|        1946|(2,[],[])|
|2248.6165070516213|        1953|(2,[],[])|
+------------------+------------+---------+
only showing top 5 rows

Mean Absolute Error on test data = 210.153


In [8]:
df_temp = df.filter(df.result==0)
vectorAssembler = VectorAssembler(inputCols = ['result','score'], outputCol = "features")
df_temp = vectorAssembler.transform(df)
white_df = df_temp.select(['features', 'white_rating'])
black_df = df_temp.select(['features','black_rating'])

splits = white_df.randomSplit([0.8, 0.2])
train_df = splits[0]
test_df = splits[1]
lr = LinearRegression(featuresCol = 'features', labelCol='white_rating', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model_white_0 = lr.fit(train_df)
lr_predictions = lr_model_white_0.transform(test_df)
lr_predictions.select("prediction","white_rating","features").show(5)

test_result = lr_model_white_0.evaluate(test_df)
print("Mean Absolute Error on test data = %g" % test_result.meanAbsoluteError)

splits = black_df.randomSplit([0.8, 0.2])
train_df = splits[0]
test_df = splits[1]
lr = LinearRegression(featuresCol = 'features', labelCol='black_rating', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model_black_0 = lr.fit(train_df)
lr_predictions = lr_model_black_0.transform(test_df)
lr_predictions.select("prediction","black_rating","features").show(5)

test_result = lr_model_black_0.evaluate(test_df)
print("Mean Absolute Error on test data = %g" % test_result.meanAbsoluteError)

+-----------------+------------+---------+
|       prediction|white_rating| features|
+-----------------+------------+---------+
|2238.774254696699|        1690|(2,[],[])|
|2238.774254696699|        1725|(2,[],[])|
|2238.774254696699|        1768|(2,[],[])|
|2238.774254696699|        1824|(2,[],[])|
|2238.774254696699|        1865|(2,[],[])|
+-----------------+------------+---------+
only showing top 5 rows

Mean Absolute Error on test data = 207.024
+------------------+------------+---------+
|        prediction|black_rating| features|
+------------------+------------+---------+
|2248.9015819588612|        1755|(2,[],[])|
|2248.9015819588612|        1824|(2,[],[])|
|2248.9015819588612|        1880|(2,[],[])|
|2248.9015819588612|        1946|(2,[],[])|
|2248.9015819588612|        1948|(2,[],[])|
+------------------+------------+---------+
only showing top 5 rows

Mean Absolute Error on test data = 210.594


In [9]:
df_temp = df.filter(df.result==1)
vectorAssembler = VectorAssembler(inputCols = ['result','score'], outputCol = "features")
df_temp = vectorAssembler.transform(df)
white_df = df_temp.select(['features', 'white_rating'])
black_df = df_temp.select(['features','black_rating'])

splits = white_df.randomSplit([0.8, 0.2])
train_df = splits[0]
test_df = splits[1]
lr = LinearRegression(featuresCol = 'features', labelCol='white_rating', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model_white_1 = lr.fit(train_df)
lr_predictions = lr_model_white_1.transform(test_df)
lr_predictions.select("prediction","white_rating","features").show(5)

test_result = lr_model_white_1.evaluate(test_df)
print("Mean Absolute Error on test data = %g" % test_result.meanAbsoluteError)

splits = black_df.randomSplit([0.8, 0.2])
train_df = splits[0]
test_df = splits[1]
lr = LinearRegression(featuresCol = 'features', labelCol='black_rating', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model_black_1 = lr.fit(train_df)
lr_predictions = lr_model_black_1.transform(test_df)
lr_predictions.select("prediction","black_rating","features").show(5)

test_result = lr_model_black_1.evaluate(test_df)
print("Mean Absolute Error on test data = %g" % test_result.meanAbsoluteError)

+------------------+------------+---------+
|        prediction|white_rating| features|
+------------------+------------+---------+
|2238.8871724641476|        1725|(2,[],[])|
|2238.8871724641476|        1846|(2,[],[])|
|2238.8871724641476|        1872|(2,[],[])|
|2238.8871724641476|        1902|(2,[],[])|
|2238.8871724641476|        1966|(2,[],[])|
+------------------+------------+---------+
only showing top 5 rows

Mean Absolute Error on test data = 206.38
+------------------+------------+---------+
|        prediction|black_rating| features|
+------------------+------------+---------+
|2249.5503246218996|        1729|(2,[],[])|
|2249.5503246218996|        1779|(2,[],[])|
|2249.5503246218996|        1797|(2,[],[])|
|2249.5503246218996|        1812|(2,[],[])|
|2249.5503246218996|        1824|(2,[],[])|
+------------------+------------+---------+
only showing top 5 rows

Mean Absolute Error on test data = 213.579


In [10]:
games = sc.textFile('data_uci.pgn')
games = games.map(lambda l:l.rstrip())

result = games.filter(lambda l:"Result" in l).map(lambda l: myfunc(l)).zipWithIndex().filter(lambda vi: vi[1] >= 25000).map(lambda l:(l[1],l[0]))
stockfish = sc.textFile('stockfish.csv').filter(lambda l: 'Event' not in l).map(lambda l:[0]+[int(x) for x in l.split(',')[1].split(" ") if x!=''])\
.map(lambda l: eval_moves(l)).zipWithIndex().filter(lambda vi: vi[1] >= 25000).map(lambda l:(l[1],l[0]))

In [11]:
test_set = result.join(stockfish).map(lambda l:[l[0]+1,l[1][0],l[1][1]]).toDF(['Event','result','score'])
vectorAssembler = VectorAssembler(inputCols = ['result','score'], outputCol = "features")

test_set_m1 = vectorAssembler.transform(test_set.filter(test_set.result==-1))
test_set_0 = vectorAssembler.transform(test_set.filter(test_set.result==0))
test_set_1 = vectorAssembler.transform(test_set.filter(test_set.result==1))

test_set_white_m1 = lr_model_white_m1.transform(test_set_m1)
test_set_white_0 = lr_model_white_0.transform(test_set_0)
test_set_white_1 = lr_model_white_1.transform(test_set_1)


test_set_black_m1 = lr_model_black_m1.transform(test_set_m1)
test_set_black_0 = lr_model_black_0.transform(test_set_0)
test_set_black_1 = lr_model_black_1.transform(test_set_1)





black_m1 = test_set_black_m1.select('Event','prediction').toPandas()
black_1 = test_set_black_1.select('Event','prediction').toPandas()
black_0 = test_set_black_0.select('Event','prediction').toPandas()

black_m1.columns=['Event','BlackElo']
black_0.columns=['Event','BlackElo']
black_1.columns=['Event','BlackElo']

white_m1 = test_set_white_m1.select('Event','prediction').toPandas()
white_1 = test_set_white_1.select('Event','prediction').toPandas()
white_0 = test_set_white_0.select('Event','prediction').toPandas()


white_m1.columns=['Event','WhiteElo']
white_0.columns=['Event','WhiteElo']
white_1.columns = ['Event','WhiteElo']



In [12]:
black = black_m1.append(black_0).append(black_1)
white = white_m1.append(white_0).append(white_1)

In [13]:

white['BlackElo'] = black.BlackElo
white.sort_values(by='Event').set_index('Event').to_csv('Segmented_LR.csv')