In [1]:
import pyspark
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from matplotlib.pyplot import figure
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator


In [2]:
# Remember the directory the kernel started in the first time this cell runs.
# Resolving the data directory against it keeps the cell re-runnable: a plain
# relative os.chdir would be applied again from the already-changed directory.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, '../Project/finding-elo/'))

# getOrCreate() reuses the running context/session instead of raising when the
# cell is executed a second time in the same kernel.
sc = pyspark.SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()
# Kept so existing references to `sqlContext` continue to work.
sqlContext = SQLContext(sc)

In [3]:
# Read the PGN file as an RDD of text lines with trailing whitespace removed.
games = sc.textFile('data_uci.pgn').map(lambda line: line.rstrip())

In [4]:
def myfunc(a):
    """Translate a text line holding a game result into a numeric outcome.

    Returns 0 if ``a`` contains "1/2", otherwise 1 if it contains "1-0",
    otherwise -1. The "1/2" check is evaluated first and therefore wins
    when both markers are present.
    """
    for marker, outcome in (("1/2", 0), ("1-0", 1)):
        if marker in a:
            return outcome
    return -1

def eval_moves(moves):
    """Add up the differences between each entry of ``moves`` and the one before it.

    Returns 0 when ``moves`` has fewer than two entries.
    """
    total = 0
    # Walk the sequence as (previous, current) pairs of neighbouring entries.
    for previous, current in zip(moves, moves[1:]):
        total = total + current - previous
    return total
        

In [5]:
# Number of leading games (by position in the files) kept for results and
# engine scores; the joins in the next step restrict the ratings to the same keys.
N_GAMES = 25000


def _parse_elo(line):
    """Return the integer rating from a PGN tag line such as ``[WhiteElo "2354"]``.

    The value is read from between the double quotes instead of a fixed
    four-character slice, so ratings with fewer or more than four digits are
    parsed correctly rather than raising or being truncated.
    """
    return int(line.split('"')[1])


def _keyed_by_position(rdd):
    """Return ``(position, value)`` pairs for ``rdd`` using ``zipWithIndex``."""
    return rdd.zipWithIndex().map(lambda vi: (vi[1], vi[0]))


# NOTE(review): the RDDs below are paired purely by position. This assumes every
# game contributes exactly one matching line to each of them, in the same
# order -- confirm against the data files.
result = (
    games.filter(lambda l: "Result" in l)
    .map(myfunc)
    .zipWithIndex()
    .filter(lambda vi: vi[1] < N_GAMES)
    .map(lambda vi: (vi[1], vi[0]))
)
whiteelo = _keyed_by_position(games.filter(lambda l: "WhiteElo" in l).map(_parse_elo))
blackelo = _keyed_by_position(games.filter(lambda l: "BlackElo" in l).map(_parse_elo))
stockfish = (
    sc.textFile('stockfish.csv')
    # Skip lines containing 'Event' (presumably the CSV header row -- verify).
    .filter(lambda l: 'Event' not in l)
    # Second CSV field is a space-separated list of integers; a leading 0 is
    # prepended so the first value is also counted as a change by eval_moves.
    .map(lambda l: [0] + [int(x) for x in l.split(',')[1].split(" ") if x != ''])
    .map(eval_moves)
    .zipWithIndex()
    .filter(lambda vi: vi[1] < N_GAMES)
    .map(lambda vi: (vi[1], vi[0]))
)


In [6]:
def _flatten_joined(record):
    """Flatten one ``(key, (((white, black), result), score))`` join record into a row."""
    _, (((white, black), outcome), move_score) = record
    return [white, black, white - black, outcome, move_score]


# Join the four keyed RDDs on the game position and flatten the nested tuples.
temp = (
    whiteelo.join(blackelo)
    .join(result)
    .join(stockfish)
    .map(_flatten_joined)
)
df = temp.toDF(['white_rating', 'black_rating', 'rating_diff', 'result', 'score'])


In [7]:
# Fixed seed so the random split and the forest use the same random draws on
# every run.
SEED = 42

vectorAssembler = VectorAssembler(inputCols=['rating_diff', 'result', 'score'], outputCol="features")
# Drop a 'features' column left over from a previous run of this cell (a no-op
# when it is absent); otherwise the assembler fails because its output column
# already exists.
df = vectorAssembler.transform(df.drop('features'))
white_df = df.select(['features', 'white_rating'])
black_df = df.select(['features', 'black_rating'])

# NOTE(review): 'rating_diff' is white_rating - black_rating, i.e. it is derived
# from the very labels being predicted. Confirm this feature is intended.
splits = white_df.randomSplit([0.8, 0.2], seed=SEED)
train_df = splits[0]
test_df = splits[1]

# Fit a random forest on the training split and predict white_rating on the
# test split.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol='white_rating',
    maxDepth=25,
    numTrees=25,
    seed=SEED,
)
rf_model_white = rf.fit(train_df)
rf_predictions = rf_model_white.transform(test_df)
rf_predictions.select("prediction", "white_rating", "features").show(5)


+------------------+------------+--------------------+
|        prediction|white_rating|            features|
+------------------+------------+--------------------+
|2012.5886047653905|        1748|[-814.0,-1.0,-195.0]|
|2012.5886047653905|        1884|[-663.0,-1.0,-228.0]|
|1975.8978447593793|        1805|[-607.0,-1.0,-661.0]|
|1946.0484863410352|        2010| [-587.0,1.0,2235.0]|
|1972.1565581739135|        1641|[-559.0,-1.0,-502.0]|
+------------------+------------+--------------------+
only showing top 5 rows



In [8]:
# Mean absolute error of `rf_predictions` measured against the 'white_rating' column.
evaluator = RegressionEvaluator(
    metricName="mae",
    predictionCol="prediction",
    labelCol="white_rating",
)
mae = evaluator.evaluate(rf_predictions)
print("MAE on test data = %g" % (mae,))

MAE on test data = 184.098


In [None]:
# Fixed seed so the random split and the forest use the same random draws on
# every run.
SEED = 42

# Split the black-rating frame 80/20 into training and test sets.
splits = black_df.randomSplit([0.8, 0.2], seed=SEED)
train_df = splits[0]
test_df = splits[1]

# Fit a random forest on the training split and predict black_rating on the
# test split. Note this overwrites train_df, test_df, rf and rf_predictions
# from the white-rating model.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol='black_rating',
    maxDepth=25,
    numTrees=25,
    seed=SEED,
)
rf_model_black = rf.fit(train_df)
rf_predictions = rf_model_black.transform(test_df)
rf_predictions.select("prediction", "black_rating", "features").show(5)



In [None]:
# Mean absolute error of `rf_predictions` measured against the 'black_rating' column.
evaluator = RegressionEvaluator(
    metricName="mae",
    predictionCol="prediction",
    labelCol="black_rating",
)
mae = evaluator.evaluate(rf_predictions)
print("MAE on test data = %g" % (mae,))