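Refresh the package index (the list of available package versions) in your Google Colab notebook's runtime. Note that `apt-get update` only refreshes this list; it does not upgrade the packages that are already installed.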

In [1]:
# Refresh the apt package index for this runtime.
!apt-get update -y

0% [Working]            Ign:1 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Conn                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Wait                                                                               Hit:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:6 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Get:8 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:9

Spark is written in the Scala programming language and requires the Java Virtual Machine (JVM) to run. Therefore, our first task is to download Java.

In [2]:
# Install the headless OpenJDK 8 JDK; -qq and the redirect to /dev/null keep the cell output quiet.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

Next, we will download and unzip Apache Spark with Hadoop 2.7 to install it.

In [3]:
# -nc (no-clobber) skips the download when the archive is already present, so
# re-running this cell does not fetch a second copy saved under a ".1" suffix.
!wget -q -nc https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz

In [4]:
# Unpack the downloaded Spark archive into the current working directory.
!tar xf spark-3.1.2-bin-hadoop2.7.tgz

Set up the environment variables for Java and Spark

In [5]:
import os

# Publish the Java and Spark locations through JAVA_HOME / SPARK_HOME.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-3.1.2-bin-hadoop2.7",
)

Then we need to install and import the 'findspark' library that will locate Spark on the system and import it as a regular library.

In [6]:
!pip install -q findspark

import findspark

findspark.init()

Now, import SparkSession from pyspark.sql and create a SparkSession, which will be the entry point to Spark.

In [7]:
from pyspark.sql import SparkSession

# Build the "datagrokr" session, or reuse it if one already exists; every
# later Spark call in this notebook goes through `spark`.
spark = SparkSession.builder.appName("datagrokr").getOrCreate()

Download all the files from the Google Drive folder into the content directory of the Colab notebook.

In [8]:
import gdown

# Google Drive folder that holds the chess dataset files.
url = "https://drive.google.com/drive/folders/1QgWPHV_l25Ui9L7et8mkZohAOG59UTkQ"
# Download every file in the folder; the returned list of local paths is the cell output.
gdown.download_folder(url, quiet=True, use_cookies=False)

['/content/chess/chess_schema.png',
 '/content/chess/chess_wc_history_game_info.csv',
 '/content/chess/chess_wc_history_moves.csv',
 '/content/chess/eco_codes.csv']

Create dataframes for each of the datasets

In [9]:
def head_view(dataframe):
  """Print `dataframe` with `show(truncate=False)`.

  The frame is first registered as the temporary view "tableHead" (replacing
  any earlier view of that name) and read back through a `SELECT *` query on
  the notebook-level `spark` session; the query result is what gets shown.
  """
  view_name = "tableHead"
  dataframe.createOrReplaceTempView(view_name)
  spark.sql("SELECT * FROM tableHead").show(truncate=False)

Chess WC History Game Info

In [10]:
# Game-level table: comma-separated CSV with a header row and inferred column types.
df_games = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load("/content/chess/chess_wc_history_game_info.csv")
)

Chess WC History Moves

In [11]:
# Move-level table: comma-separated CSV with a header row and inferred column types.
df_moves = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load("/content/chess/chess_wc_history_moves.csv")
)

World Championship knock-out events (those whose event name contains the substring `k.o` or `KO`) are not part of the main event. So, filter out the rows whose `event` column contains either of these substrings.

In [12]:
# Keep only the rows whose event name contains neither 'k.o' nor 'KO'.
# The two negated tests must be AND-ed: with OR a row is dropped only when its
# event contains *both* substrings, so an event carrying just one of them
# would slip through the filter.
df_games_notko = df_games.filter(
    ~df_games.event.contains('k.o') & ~df_games.event.contains('KO')
)



### 1️⃣ List of Winners of Each World champions Trophy



In [13]:
import pandas as pd

from pyspark.sql.functions import split

Clean names of players - *keep only the part of each name before the first comma (the value the code calls first_name)*

In [14]:
def get_first_name(column):
  """Return a Column holding the text before the first comma of `column`.

  `column` is the name of a column of the notebook-level `df_games_notko`
  DataFrame; its values are split on ',' and element 0 of the split is kept.
  """
  name_parts = split(df_games_notko[column], ',')
  return name_parts.getItem(0)

# Columns that hold player names; each one is overwritten with its cleaned value.
clean_names = ['white', 'black', 'winner', 'loser']
for column_name in clean_names:
  cleaned = get_first_name(column_name)
  df_games_notko = df_games_notko.withColumn(column_name, cleaned)

In [15]:
def get_winner(dataframe):
  """Return the player with the highest score over the games in `dataframe`.

  Rows are read with `dataframe.collect()` and must expose the keys 'white',
  'black' and 'result'. A win is worth 1 point and a draw 0.5 point to each
  player, so a drawn game no longer counts as much as a win.

  Returns None when `dataframe` has no rows. Ties are not broken explicitly.
  """
  # A later cell runs `from pyspark.sql.functions import max`, which rebinds
  # the global name `max`; take the builtin explicitly so this still works
  # when cells are re-run afterwards.
  import builtins

  # NOTE(review): 'draw' is the marker this notebook tests for; '1/2-1/2' is
  # the PGN spelling of a draw. Confirm which one the `result` column holds.
  draw_results = ('draw', '1/2-1/2')

  scores = {}
  for game in dataframe.collect():
    white, black = game['white'], game['black']
    scores.setdefault(white, 0.0)
    scores.setdefault(black, 0.0)

    result = game['result']
    if result in draw_results:
      scores[white] += 0.5
      scores[black] += 0.5
    elif result == '1-0':
      scores[white] += 1
    else:
      # Every other value is counted as a win for Black, as before.
      scores[black] += 1

  if not scores:
    return None
  return builtins.max(scores, key=scores.get)

In [16]:
# Distinct tournament names, taken from a pandas copy of the games table.
all_tournaments = df_games_notko.toPandas().tournament_name.unique()

# One {'winner', 'tournament_name'} record per tournament.
result_list = [
    {
        'winner': get_winner(
            df_games_notko.filter(df_games_notko.tournament_name == tour)
        ),
        'tournament_name': tour,
    }
    for tour in all_tournaments
]

pandas_result = pd.DataFrame(result_list)

In [17]:
# Convert the pandas table of (winner, tournament_name) records into a Spark DataFrame.
df1 = spark.createDataFrame(pandas_result)

### 2️⃣ List of Players with number of times they have won Tournament in descending order

In [18]:
# Count tournament wins per player and list the players with the most wins
# first, as the section heading asks ("in descending order").
df2 = df1.groupBy('winner').count().orderBy('count', ascending=False)

### 3️⃣ Most and Least Popular eco move in world championship history

In [19]:
# Occurrences of each ECO code, brought to pandas and ordered by ascending count.
eco_counts = df_games_notko.groupBy('eco').count().toPandas()
counts_ascending = eco_counts.sort_values('count')

top = counts_ascending.head(1).values[0]  # first row: the smallest count
end = counts_ascending.tail(1).values[0]  # last row: the largest count

# NOTE(review): the eco_name strings are fixed literals, not looked up from
# the data; confirm they match the ECO codes selected above.
result_list = [
    {'eco': end[0], 'eco_name': 'Double King Pawn Games', 'number_of_occurences': end[1]},
    {'eco': top[0], 'eco_name': 'Sicilian Defence', 'number_of_occurences': top[1]},
]

pandas_result = pd.DataFrame(result_list)

In [20]:
# Convert the two-row pandas table of ECO occurrence counts into a Spark DataFrame.
df3 = spark.createDataFrame(pandas_result)

### 4️⃣ Find the eco move with the most wins

In [21]:
# Decisive games only: rows whose result is not 'draw'.
df_games_notdraw = df_games_notko.filter(df_games_notko.result != 'draw')

win_counts = df_games_notdraw.groupBy('eco').count().toPandas()

# Sort descending so that the first row is the ECO code with the MOST decisive
# games; an ascending sort followed by head(1) selects the one with the fewest.
top = win_counts.sort_values('count', ascending=False).head(1).values[0]

# NOTE(review): eco_name is a fixed literal, not looked up from the data;
# confirm it matches the ECO code selected above.
result_list = []
result_list.append({'eco': top[0], 'eco_name': 'Sicilian Defence'})

pandas_result = pd.DataFrame(result_list)

In [22]:
# Convert the one-row pandas table for the selected ECO code into a Spark DataFrame.
df4 = spark.createDataFrame(pandas_result)

### 5️⃣ Longest and shortest game ever played in a world championship in terms of moves

In [23]:
from pyspark.sql.functions import max

In [24]:
def get_moves(game):
  """Return the highest `move_no` found in `df_moves` for the game id `game`.

  `df_moves` is the notebook-level moves DataFrame; it is filtered on
  `game_id == game` and the maximum of its `move_no` column is returned.
  """
  # Imported under an alias so this function does not depend on what the
  # global name `max` is currently bound to (Python builtin or Spark aggregate).
  from pyspark.sql.functions import max as spark_max

  game_moves = df_moves.filter(df_moves.game_id == game)
  return game_moves.agg(spark_max('move_no')).collect()[0][0]

In [None]:
# Highest move_no per game, computed in a single aggregation instead of
# launching one Spark job for every row of df_games.
moves_per_game = (
    df_moves
    .groupBy('game_id')
    .agg({'move_no': 'max'})
    .withColumnRenamed('max(move_no)', 'moves')
)

# The left join keeps every game of df_games; a game without any row in
# df_moves ends up with a missing `moves` value.
pandas_result = (
    df_games
    .select('game_id')
    .join(moves_per_game, on='game_id', how='left')
    .toPandas()
)

In [None]:
# idxmax/idxmin return one index label each, so `longest` and `shortest` are
# single rows with scalar fields. A pandas frame cannot be compared against a
# Spark column, which is why a scalar game_id is needed for the filters below.
longest = pandas_result.loc[pandas_result.moves.idxmax()]
shortest = pandas_result.loc[pandas_result.moves.idxmin()]

# first() hands back the matching game as a Row whose fields can be read by
# name; a Spark Column such as `df.event` has no collect() of its own.
long_game = df_games.filter(df_games.game_id == longest.game_id).first()
short_game = df_games.filter(df_games.game_id == shortest.game_id).first()

result_list = []
result_list.append({'game_id': longest.game_id, 'event': long_game['event'], 'tournament_name': long_game['tournament_name'], 'number_of_moves': int(longest.moves)})
result_list.append({'game_id': shortest.game_id, 'event': short_game['event'], 'tournament_name': short_game['tournament_name'], 'number_of_moves': int(shortest.moves)})

pandas_result = pd.DataFrame(result_list)

In [None]:
# Convert the pandas table describing the longest and shortest game into a Spark DataFrame.
df5 = spark.createDataFrame(pandas_result)

### 6️⃣ Shortest and Longest Draw game ever Played

In [None]:
# NOTE(review): assumes drawn games carry result == 'draw'; confirm against the data.
games_drawn = df_games.filter(df_games.result == 'draw')

# Highest move_no per game, computed in a single aggregation instead of
# launching one Spark job for every drawn game.
moves_per_game = (
    df_moves
    .groupBy('game_id')
    .agg({'move_no': 'max'})
    .withColumnRenamed('max(move_no)', 'moves')
)

# The left join keeps every drawn game; a game without any row in df_moves
# ends up with a missing `moves` value.
pandas_result = (
    games_drawn
    .select('game_id')
    .join(moves_per_game, on='game_id', how='left')
    .toPandas()
)

In [None]:
# idxmax/idxmin return one index label each, so `longest` and `shortest` are
# single rows with scalar fields. A pandas frame cannot be compared against a
# Spark column, which is why a scalar game_id is needed for the filters below.
longest = pandas_result.loc[pandas_result.moves.idxmax()]
shortest = pandas_result.loc[pandas_result.moves.idxmin()]

# first() hands back the matching game as a Row whose fields can be read by
# name; a Spark Column such as `df.event` has no collect() of its own.
long_game = df_games.filter(df_games.game_id == longest.game_id).first()
short_game = df_games.filter(df_games.game_id == shortest.game_id).first()

result_list = []
result_list.append({'game_id': longest.game_id, 'event': long_game['event'], 'tournament_name': long_game['tournament_name'], 'number_of_moves': int(longest.moves)})
result_list.append({'game_id': shortest.game_id, 'event': short_game['event'], 'tournament_name': short_game['tournament_name'], 'number_of_moves': int(shortest.moves)})

pandas_result = pd.DataFrame(result_list)

In [None]:
# Convert the pandas table describing the longest and shortest drawn game into a Spark DataFrame.
df6 = spark.createDataFrame(pandas_result)

### 7️⃣ Most and Least rated Player

In [None]:
def get_rating(player_name):
  """Placeholder for looking up the rating of `player_name`.

  Not implemented yet: the body is only `pass`, so every call returns None.
  """
  # Planned logic: take the maximum of all ratings, grouped by player_name.
  pass

In [None]:
# Every distinct name that appears in the `white` or the `black` column.
white_players = df_games.select('white').distinct().collect()
black_players = df_games.select('black').distinct().collect()

unique_players = {row.white for row in white_players} | {row.black for row in black_players}

result_list = []
for player in unique_players:
  rating = get_rating(player)
  # The loop variable is `player`; the name `player_name` is not defined in
  # this cell and raised a NameError.
  result_list.append({'player_name': player, 'elo': rating})

pandas_result = pd.DataFrame(result_list)

In [None]:
# Convert the pandas table of (player_name, elo) records into a Spark DataFrame.
df7 = spark.createDataFrame(pandas_result)

### 8️⃣ 3rd Last Player with most Loss

In [None]:
#TODO : incomplete
# Number of losses per player over the decisive games.
loose_counts = df_games_notdraw.groupBy('loser').count().toPandas()

# tail(3) holds the three highest loss counts in ascending order, so position 0
# is the third-highest. Position 2 is simply the player with the most losses,
# which would make the tail(3) pointless.
# NOTE(review): the heading "3rd Last Player with most Loss" is ambiguous;
# confirm that the third-highest loss count is what is wanted.
last_third = loose_counts.sort_values('count').tail(3).values[0]

In [None]:
# `last_third` is a plain [loser, count] row taken from a pandas frame, which
# has no collect() method. Wrap it in a one-row pandas table and convert that,
# as the other result cells do.
df8 = spark.createDataFrame(
    pd.DataFrame([{'loser': last_third[0], 'count': int(last_third[1])}])
)

### 9️⃣ How many times players with low rating won matches with their total win Count

### 1️⃣0️⃣ Move Sequence for Each Player in a Match

# Delete Section 🔽 { *using pandas* }

In [None]:
import pandas as pd

In [None]:
# Load the same two CSV files directly with pandas.
pd_df_games = pd.read_csv("/content/chess/chess_wc_history_game_info.csv")
pd_df_moves = pd.read_csv("/content/chess/chess_wc_history_moves.csv")

In [None]:
# Calculate the number of moves of one game: the largest move_no among the rows with this game_id.

pd_df_moves[pd_df_moves.game_id == '86e0b7f5-7b94-4ae3-97c8-317371622795'].move_no.max()