In [1]:
import pyspark
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from matplotlib.pyplot import figure
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext


In [2]:
# Work from the project directory so the relative paths used in this notebook
# ('data_uci.pgn', 'stockfish.csv', 'LR.csv') resolve.
# NOTE(review): the path is relative, so executing this cell a second time
# resolves it against the already-changed directory -- confirm the intended
# launch directory or move to a configurable absolute path.
os.chdir('../Project/finding-elo/')

# getOrCreate() returns the context already running in this kernel when there
# is one; the bare SparkContext() constructor raises if the cell is executed
# again while a context is still alive.
sc = pyspark.SparkContext.getOrCreate()
spark = SparkSession(sc)
sqlContext = SQLContext(sc)

In [3]:
# Read the PGN file as an RDD with one element per line, stripping trailing
# whitespace from every line in the same pipeline.
games = sc.textFile('data_uci.pgn').map(lambda line: line.rstrip())

In [4]:
def myfunc(a):
    """Translate a line of text into a numeric game outcome.

    The markers are checked in order and the first one found in `a` wins:
    "1/2" gives 0, "1-0" gives 1. A line containing neither gives -1.
    """
    for marker, outcome in (("1/2", 0), ("1-0", 1)):
        if marker in a:
            return outcome
    return -1

def eval_moves(moves):
    """Return the accumulated change between consecutive entries of `moves`.

    Every adjacent pair contributes `later - earlier` to the total; a sequence
    with fewer than two entries yields 0.
    """
    total = 0
    # Walk the sequence as (earlier, later) pairs instead of by index.
    for earlier, later in zip(moves, moves[1:]):
        total = total + later - earlier
    return total
        

In [5]:
def _parse_elo(tag_line):
    """Return the integer rating from a tag line such as `[WhiteElo "2354"]`.

    The value is read from between the double quotes instead of from a fixed
    four-character slice, so ratings with fewer than four digits parse as well.
    """
    return int(tag_line.split('"')[1])


# Each RDD below is re-keyed as (position, value), where position is the
# zipWithIndex() ordinal of the matching line, so the RDDs can be joined by key.
# NOTE(review): 25000 presumably marks the end of the labelled games -- confirm.

# Game outcome encoded by myfunc, limited to positions below 25000.
result = (
    games.filter(lambda l: "Result" in l)
    .map(lambda l: myfunc(l))
    .zipWithIndex()
    .filter(lambda vi: vi[1] < 25000)
    .map(lambda l: (l[1], l[0]))
)

# Ratings taken from the WhiteElo / BlackElo tag lines (no position cut-off here).
whiteelo = (
    games.filter(lambda l: "WhiteElo" in l)
    .map(_parse_elo)
    .zipWithIndex()
    .map(lambda l: (l[1], l[0]))
)
blackelo = (
    games.filter(lambda l: "BlackElo" in l)
    .map(_parse_elo)
    .zipWithIndex()
    .map(lambda l: (l[1], l[0]))
)

# One score per stockfish.csv row: lines containing 'Event' are skipped, the
# second comma-separated field is split on spaces into integers, a leading 0 is
# prepended, and eval_moves reduces the list to a single number.
stockfish = (
    sc.textFile('stockfish.csv')
    .filter(lambda l: 'Event' not in l)
    .map(lambda l: [0] + [int(x) for x in l.split(',')[1].split(" ") if x != ''])
    .map(lambda l: eval_moves(l))
    .zipWithIndex()
    .filter(lambda vi: vi[1] < 25000)
    .map(lambda l: (l[1], l[0]))
)


In [None]:
def _to_training_row(joined):
    """Flatten one joined record into [white, black, white - black, result, score].

    After the three joins the value part of a record is nested as
    (((white, black), result), score).
    """
    ((white, black), outcome), score = joined[1]
    return [white, black, white - black, outcome, score]


temp = whiteelo.join(blackelo).join(result).join(stockfish).map(_to_training_row)
df = temp.toDF(['white_rating', 'black_rating', 'rating_diff', 'result', 'score'])


In [None]:
# Pack the two predictors into the single vector column used as model input.
vectorAssembler = VectorAssembler(inputCols=['result', 'score'], outputCol="features")
# Dropping any earlier 'features' column keeps this cell re-runnable:
# transform() rejects an output column that already exists, and drop() is a
# no-op when the column is absent.
df = vectorAssembler.transform(df.drop('features'))
white_df = df.select(['features', 'white_rating'])
black_df = df.select(['features', 'black_rating'])

# 80/20 train/test split with a fixed seed so the split is repeatable.
splits = white_df.randomSplit([0.8, 0.2], seed=42)
train_df = splits[0]
test_df = splits[1]

# Linear regression of white_rating on the assembled features.
lr = LinearRegression(featuresCol='features', labelCol='white_rating', maxIter=10)
lr_model_white = lr.fit(train_df)
lr_predictions = lr_model_white.transform(test_df)
lr_predictions.select("prediction", "white_rating", "features").show(5)


In [None]:
# Evaluate the white-rating model on its held-out split and report the MAE.
test_result = lr_model_white.evaluate(test_df)
white_mae = test_result.meanAbsoluteError
print("Mean Absolute Error on test data = %g" % white_mae)

In [None]:
# Fitted parameters of the white-rating model, rendered as strings first.
white_coefficients = str(lr_model_white.coefficients)
white_intercept = str(lr_model_white.intercept)
print("Coefficients: %s" % white_coefficients)
print("Intercept: %s" % white_intercept)


In [None]:
# 80/20 train/test split of the black-rating frame, with a fixed seed so the
# split is repeatable. train_df / test_df are overwritten here and refer to the
# black split from this point on.
splits = black_df.randomSplit([0.8, 0.2], seed=42)
train_df = splits[0]
test_df = splits[1]

# Linear regression of black_rating on the assembled features.
lr = LinearRegression(featuresCol='features', labelCol='black_rating', maxIter=10)
lr_model_black = lr.fit(train_df)
lr_predictions = lr_model_black.transform(test_df)
lr_predictions.select("prediction", "black_rating", "features").show(5)

In [None]:
# Evaluate the black-rating model on its held-out split and report the MAE.
test_result = lr_model_black.evaluate(test_df)
black_mae = test_result.meanAbsoluteError
print("Mean Absolute Error on test data = %g" % black_mae)

In [None]:
# Fitted parameters of the black-rating model, rendered as strings first.
black_coefficients = str(lr_model_black.coefficients)
black_intercept = str(lr_model_black.intercept)
print("Coefficients: %s" % black_coefficients)
print("Intercept: %s" % black_intercept)



In [None]:
# Rebuild the line-per-element RDD from the PGN file, with trailing whitespace
# stripped from each line.
games = sc.textFile('data_uci.pgn').map(lambda line: line.rstrip())

In [None]:
def _line_to_score(csv_line):
    """Turn one stockfish.csv row into a single score.

    The second comma-separated field is split on spaces, empty tokens are
    discarded, the rest are converted to int, a leading 0 is prepended and the
    list is reduced with eval_moves.
    """
    tokens = csv_line.split(',')[1].split(" ")
    evaluations = [0] + [int(tok) for tok in tokens if tok != '']
    return eval_moves(evaluations)


# Encoded outcome per game, keyed by position, for positions 25000 and above.
result = (
    games
    .filter(lambda line: "Result" in line)
    .map(myfunc)
    .zipWithIndex()
    .filter(lambda pair: pair[1] >= 25000)
    .map(lambda pair: (pair[1], pair[0]))
)

# Score per stockfish.csv row (rows containing 'Event' skipped), same key range.
stockfish = (
    sc.textFile('stockfish.csv')
    .filter(lambda line: 'Event' not in line)
    .map(_line_to_score)
    .zipWithIndex()
    .filter(lambda pair: pair[1] >= 25000)
    .map(lambda pair: (pair[1], pair[0]))
)

In [None]:
def _to_event_row(joined):
    """Turn (position, (result, score)) into [position + 1, result, score]."""
    position, (outcome, score) = joined
    return [position + 1, outcome, score]


# Join outcome and score by position and build the feature frame.
test_set = result.join(stockfish).map(_to_event_row).toDF(['Event', 'result', 'score'])
vectorAssembler = VectorAssembler(inputCols=['result', 'score'], outputCol="features")
test_set = vectorAssembler.transform(test_set)

# Apply both fitted models to the same feature frame.
test_set_white = lr_model_white.transform(test_set)
test_set_black = lr_model_black.transform(test_set)

# Collect each model's predictions to pandas and name the prediction column
# after the rating it estimates.
black = test_set_black.select('Event', 'prediction').toPandas()
black.columns = ['Event', 'BlackElo']

white = test_set_white.select('Event', 'prediction').toPandas()
white.columns = ['Event', 'WhiteElo']



In [None]:

# Pair the two predictions by Event id rather than by row position: `white`
# and `black` come from two separate toPandas() collections, so a positional
# column assignment is only right if both returned rows in the same order.
# Selecting the base columns first also keeps this cell re-runnable.
white = white[['Event', 'WhiteElo']].merge(black[['Event', 'BlackElo']], on='Event')
white.sort_values(by='Event').set_index('Event').to_csv('LR.csv')