In [1]:
import pyspark
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from matplotlib.pyplot import figure
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext


In [2]:
os.chdir('../Project/finding-elo/')
sc = pyspark.SparkContext()
spark = SparkSession(sc)
sqlContext = SQLContext(sc)

In [3]:
games = sc.textFile('data_uci.pgn')
games = games.map(lambda l:l.rstrip())

In [4]:
def myfunc(a):
    if "1/2" in a:
        return 0
    if "1-0" in a:
        return 1
    else:
        return -1

def eval_moves(moves):
    score = 0
    for i in range(len(moves)-1):
        score = score + moves[i+1] - moves[i]
    return score
        

In [5]:
result = games.filter(lambda l:"Result" in l).map(lambda l: myfunc(l)).zipWithIndex().filter(lambda vi: vi[1] < 25000).map(lambda l:(l[1],l[0]))
whiteelo = games.filter(lambda l: "WhiteElo" in l).map(lambda l:int(l.split()[1][1:5])).zipWithIndex().map(lambda l:(l[1],l[0]))
blackelo = games.filter(lambda l: "BlackElo" in l).map(lambda l:int(l.split()[1][1:5])).zipWithIndex().map(lambda l:(l[1],l[0]))
stockfish = sc.textFile('stockfish.csv').filter(lambda l: 'Event' not in l).map(lambda l:[0]+[int(x) for x in l.split(',')[1].split(" ") if x!=''])\
.map(lambda l: eval_moves(l)).zipWithIndex().filter(lambda vi: vi[1] < 25000).map(lambda l:(l[1],l[0]))


In [6]:
temp = whiteelo.join(blackelo).join(result).join(stockfish).map(lambda l:[l[1][0][0][0],l[1][0][0][1],l[1][0][0][0]-l[1][0][0][1],l[1][0][1],l[1][1]])
df = temp.toDF(['white_rating','black_rating','rating_diff','result','score'])


In [7]:
vectorAssembler = VectorAssembler(inputCols = ['rating_diff','result','score'], outputCol = "features")
df = vectorAssembler.transform(df)
white_df = df.select(['features', 'white_rating'])
black_df = df.select(['features','black_rating'])


splits = white_df.randomSplit([0.8, 0.2])
train_df = splits[0]
test_df = splits[1]

lr = LinearRegression(featuresCol = 'features', labelCol='white_rating', maxIter=10)
lr_model_white = lr.fit(train_df)
lr_predictions = lr_model_white.transform(test_df)
lr_predictions.select("prediction","white_rating","features").show(5)


+------------------+------------+--------------------+
|        prediction|white_rating|            features|
+------------------+------------+--------------------+
|1906.2080488812887|        1248|[-706.0,-1.0,-149...|
|1914.4021039104302|        1784|[-684.0,-1.0,-450.0]|
| 1946.628840963238|        1799|[-635.0,-1.0,-484...|
| 1946.104577511845|        1361|[-617.0,-1.0,-521.0]|
|1951.1054312514277|        1805|[-607.0,-1.0,-661.0]|
+------------------+------------+--------------------+
only showing top 5 rows



In [8]:
test_result = lr_model_white.evaluate(test_df)
print("Mean Absolute Error on test data = %g" % test_result.meanAbsoluteError)

Mean Absolute Error on test data = 195.023


In [9]:
print("Coefficients: %s" % str(lr_model_white.coefficients))
print("Intercept: %s" % str(lr_model_white.intercept))



Coefficients: [0.47096720746109827,8.918899342429652,-0.0020798690355112465]
Intercept: 2244.52663209


In [10]:
splits = black_df.randomSplit([0.8, 0.2])
train_df = splits[0]
test_df = splits[1]
lr = LinearRegression(featuresCol = 'features', labelCol='black_rating', maxIter=10)
lr_model_black = lr.fit(train_df)
lr_predictions = lr_model_black.transform(test_df)
lr_predictions.select("prediction","black_rating","features").show(5)

+------------------+------------+--------------------+
|        prediction|black_rating|            features|
+------------------+------------+--------------------+
| 2664.722921080523|        2562|[-814.0,-1.0,-195.0]|
| 2596.936504817829|        2468|[-684.0,-1.0,-450.0]|
|2585.5119826562764|        2547|[-663.0,-1.0,-228.0]|
| 2572.490484188143|        2533|[-638.0,-1.0,-285.0]|
| 2572.646866159907|        2618|[-637.0,-1.0,-663.0]|
+------------------+------------+--------------------+
only showing top 5 rows



In [11]:
test_result = lr_model_black.evaluate(test_df)
print("Mean Absolute Error on test data = %g" % test_result.meanAbsoluteError)

Mean Absolute Error on test data = 199.942


In [12]:
print("Coefficients: %s" % str(lr_model_black.coefficients))
print("Intercept: %s" % str(lr_model_black.intercept))



Coefficients: [-0.5249696788400996,7.330577332016597,-0.001802517594190855]
Intercept: 2244.37668891
