<p style="font-size:18px;">This notebook implements two classes: <b>LichessAPI</b> and <b>Wrangle</b>. <b>LichessAPI</b> uses the lichess.org API to retrieve the chess games of a specified username, producing a PGN file, two CSV files, and a DataFrame of the games information. <b>Wrangle</b> cleans the DataFrame read from the games-info CSV file so that it is ready for data analysis.</p>

I used my second Lichess account because it has fewer games, which makes testing the code faster.

## Imports

In [2]:
import pandas as pd
import numpy as np
from converter.pgn_data import PGNData
import warnings, pickle, sys, os, requests
from datetime import datetime
from dateutil import tz

warnings.simplefilter("ignore")
pd.options.display.max_columns = 100

In [3]:
# Lichess account whose games are downloaded and analysed
USERNAME = 'XxstardustxX'
# IANA time-zone key used for local timestamps. "Egypt/Cairo" is not a key in the
# tz database (the zone is "Africa/Cairo"), so dateutil's tz.gettz() would return None
# for it and datetime.astimezone(None) would silently use the machine's local zone.
TIME_ZONE = 'Africa/Cairo'

In [4]:
class LichessAPI:
    """
    A class that extracts chess games from Lichess.org using the Lichess API

    ...

    Attributes
    ----------
    username : str
        Lichess username
    url : str
        Lichess API endpoint used to export the user's games
    filename : str
        Name of the PGN file (timestamp + username, including the ".pgn" extension)
    filepath : str
        Path of the PGN file inside the "data" directory

    Methods
    -------
    to_pgn()
        Extracts the chess games and exports them into a PGN (Portable Game Notation) file.

    to_csv()
        Extracts the chess games and exports them into a PGN and two CSV (Comma-Separated Values) files,
        the first file is for games info and the second file is for moves info.

    to_dataframe()
        Extracts the chess games and exports them into a PGN and two CSV files and then reads the games info
        csv file into a dataframe.
    """

    def __init__(self, username):
        """
        Parameters
        ----------
        username : str
            Lichess username
        """
        self.username = username

        # Lichess API
        self.url = f"https://lichess.org/api/games/user/{self.username}?tags=true&clocks=true&evals=true&opening=true"
        # Timestamp taken once, at construction, so every file of this run shares it
        now = datetime.now().strftime("%Y-%m-%dT%H.%M.%S")
        # File name, including the .pgn extension
        self.filename = f"{now}-{self.username}.pgn"
        # Location of the PGN file inside the data directory
        self.filepath = os.path.join("data", self.filename)

    def to_pgn(self):
        """
        Extracts the chess games and exports them into a PGN (Portable Game Notation) file.

        The file is only created once the request has succeeded, so a failed
        download never leaves an empty PGN file behind.

        Raises
        ------
        Exception
            If the API answers with anything other than HTTP 200.
        """
        response = requests.get(self.url, stream=True)

        if response.status_code == 404:
            raise Exception(f"Username '{self.username}' doesn't exist.")
        if response.status_code != 200:
            # e.g. rate limiting or a server error: not a problem with the username
            raise Exception(
                f"Lichess API request for '{self.username}' failed with HTTP status {response.status_code}."
            )

        # Download the whole body before touching the disk
        payload = response.content

        os.makedirs("data", exist_ok=True)
        with open(self.filepath, "wb") as f:
            f.write(payload)

    def to_csv(self):
        """
        Extracts the chess games and exports them into a PGN and two CSV (Comma-Separated Values) files,
        the first file is for games info and the second file is for moves info.

        """
        # Call the to_pgn method if it's not called before
        if not os.path.isfile(self.filepath):
            self.to_pgn()

        # Converting the PGN file to two csv files, one for games info and the other for moves.
        # The converter is given a bare file name, so it runs from inside the data directory;
        # the working directory is always restored, even if the conversion fails.
        start_dir = os.getcwd()
        os.chdir("data")
        try:
            PGNData(self.filename).export()
        finally:
            os.chdir(start_dir)

    def to_dataframe(self):
        """
        Extracts the chess games and exports them into a PGN and two CSV files and then reads the games info
        csv file into a dataframe.

        Returns
        -------
        DataFrame
            A pandas DataFrame of the games info csv file

        """
        # Remove the .pgn extension from the filename
        stem = os.path.splitext(self.filename)[0]
        csv_path = os.path.join("data", f"{stem}_game_info.csv")

        # Call the to_csv method if it's not called before
        if not os.path.isfile(csv_path):
            self.to_csv()

        # Reading the games info csv file into a pandas DataFrame
        return pd.read_csv(csv_path)

In [5]:
# Build the API client for the configured account
my_lichess = LichessAPI(USERNAME)

# Download and convert the games if needed, then load the games-info table
df = my_lichess.to_dataframe()

INFO:pgn2data - log_time:initializing at...2023-03-03 20:29:04.606599
INFO:pgn2data - pgn_data class:Starting process..
INFO:pgn2data - process:Processing file:2023-03-03T22.28.56-XxstardustxX.pgn
INFO:pgn2data - pgn_data class:ending process..
INFO:pgn2data - log_time:time taken sec: 5.846999899999901 sec
INFO:pgn2data - log_time:time taken: 5.846999899999901 seconds, 
INFO:pgn2data - log_time:time started...2023-03-03 20:29:04.606599
INFO:pgn2data - log_time:time ended.....2023-03-03 20:29:10.465156


In [6]:
# Preview the first rows of the raw games-info table returned by to_dataframe()
df.head()

Unnamed: 0,game_id,game_order,event,site,date_played,round,white,black,result,white_elo,white_rating_diff,black_elo,black_rating_diff,white_title,black_title,winner,winner_elo,loser,loser_elo,winner_loser_elo_diff,eco,termination,time_control,utc_date,utc_time,variant,ply_count,date_created,file_name
0,04fd3d3c-fa23-41a6-a935-cdcf3bf747ff,1,Hourly Bullet Arena,https://lichess.org/tthH36pL,2022.01.14,?,XxStardustxX,pockelmann,0-1,2012,-34.0,1351,11.0,,,pockelmann,1351.0,XxStardustxX,2012.0,-661,?,Abandoned,60+0,2022.01.14,23:13:45,Standard,,2023-03-03T20:29:04+0000,2023-03-03T22.28.56-XxstardustxX.pgn
1,cae532e6-edb2-4ca3-bfb7-d02ea0e5bb18,2,Hourly Bullet Arena,https://lichess.org/EJxIgi25,2022.01.14,?,XxStardustxX,Der_Foerster,0-1,2047,-35.0,1420,11.0,,,Der_Foerster,1420.0,XxStardustxX,2047.0,-627,?,Abandoned,60+0,2022.01.14,23:12:47,Standard,,2023-03-03T20:29:04+0000,2023-03-03T22.28.56-XxstardustxX.pgn
2,c6914767-84b1-471b-a4e4-b721728cd571,3,Hourly Bullet Arena,https://lichess.org/bfjUu3HR,2022.01.14,?,thehottestsoup,XxStardustxX,0-1,1080,0.0,2047,0.0,,,XxStardustxX,2047.0,thehottestsoup,1080.0,967,C52,Normal,60+0,2022.01.14,23:10:53,Standard,,2023-03-03T20:29:04+0000,2023-03-03T22.28.56-XxstardustxX.pgn
3,729b2fc0-15c0-4ad6-996b-50819131588e,4,Hourly Bullet Arena,https://lichess.org/5x4d6aJh,2022.01.14,?,XxStardustxX,Faktotum,0-1,2063,-16.0,2098,5.0,,,Faktotum,2098.0,XxStardustxX,2063.0,35,D02,Normal,60+0,2022.01.14,23:08:43,Standard,,2023-03-03T20:29:04+0000,2023-03-03T22.28.56-XxstardustxX.pgn
4,fa4fdfd8-5191-4fda-8f52-91249c7c6d9b,5,Hourly Bullet Arena,https://lichess.org/Y0f9hha4,2022.01.14,?,elrincondelopez,XxStardustxX,1-0,2217,4.0,2075,-12.0,,,elrincondelopez,2217.0,XxStardustxX,2075.0,142,B00,Normal,60+0,2022.01.14,23:07:30,Standard,,2023-03-03T20:29:04+0000,2023-03-03T22.28.56-XxstardustxX.pgn


In [7]:
class Wrangle:
    """
    A class that cleans the chess games info DataFrame for analysis

    ...

    Attributes
    -----------
    df : DataFrame
        The DataFrame that contains the chess games info. Once ``wrangle()`` has
        run it holds the cleaned DataFrame.
    username : str
        Lichess username of the player the analysis is about

    Methods
    -------
    wrangle(time_zone="Africa/Cairo")
        Cleans the data for analysis and visualizations

    """

    def __init__(self, df, username):
        """
        Parameters
        ----------
        df : DataFrame
            The DataFrame that contains the chess games info

        username : String
            Lichess username
        """
        self.df = df
        self.username = username

    def __resolve_time_zone(self, time_zone):
        """
        Turns a time zone name into a tzinfo object.

        Raises
        ------
        ValueError
            If the name is unknown. Without this check ``tz.gettz`` returns None and
            ``datetime.astimezone(None)`` silently converts to the machine's local zone.
        """
        zone = tz.gettz(time_zone)
        if zone is None:
            raise ValueError(
                f"Unknown time zone '{time_zone}'; use an IANA name such as 'Africa/Cairo'."
            )
        return zone

    def __utc_to_localtime(self, utcdatetime, time_zone='Africa/Cairo'):
        """
        Converts the UTC datetime to a datetime in the requested time zone

        If argument 'time_zone' isn't passed in, the default timezone is "Africa/Cairo"

        Parameters
        ----------
        utcdatetime : str
            A utc datetime string in this format '%Y.%m.%d %H:%M:%S'

        time_zone : str
            IANA time zone name of the target zone

        Returns
        -------
        Datetime
            A datetime in the timezone specified

        Raises
        ------
        ValueError
            If 'time_zone' is not a known time zone name
        """
        to_zone = self.__resolve_time_zone(time_zone)
        utc = datetime.strptime(utcdatetime, '%Y.%m.%d %H:%M:%S')
        utc = utc.replace(tzinfo=tz.gettz('UTC'))
        return utc.astimezone(to_zone)

    def __chess_type(self, time_control):
        """
        This function takes the time control and return the chess game type

        Only the base time (the part before "+") is considered; the increment
        is ignored.

        Parameters
        ----------
        time_control : str
            the time control of the game, e.g. "60+0", or "-" when there is none

        Returns
        -------
        Str
            the chess game type (None for a negative base time)
        """
        # "-" indicates that the game had no time control
        if time_control == '-':
            return "Classical"

        # Base time in seconds; partition() also accepts a value without an increment
        seconds = int(str(time_control).partition("+")[0])

        if seconds < 0:
            return None
        if seconds < 30:
            return "UltraBullet"
        if seconds < 180:
            return "Bullet"
        if seconds < 600:
            return "Blitz"
        if seconds < 1800:
            return "Rapid"
        return "Classical"

    def wrangle(self, time_zone='Africa/Cairo'):
        """
        This method cleans the DataFrame for analysis and visualizations

        If argument 'time_zone' isn't passed in, the default timezone is "Africa/Cairo"

        The DataFrame passed to the constructor is not modified; the cleaning is
        done on a copy, which is stored in ``self.df`` and returned.

        Parameters
        ----------
        time_zone : str
            IANA time zone name used to localise the games' UTC timestamps

        Returns
        -------
        DataFrame
            A clean DataFrame that is ready for analysis and visualizations

        Raises
        ------
        ValueError
            If 'time_zone' is not a known time zone name
        """
        # Fail before doing any work if the zone cannot be resolved
        self.__resolve_time_zone(time_zone)

        # Read the openings csv file and index it by ECO code (first name wins on duplicates)
        openings = pd.read_csv("./data/Chess Opening Reference - Sheet1.csv")
        unique_openings = openings.drop_duplicates(subset='ECO Code')
        eco_to_name = dict(zip(unique_openings['ECO Code'], unique_openings['Name']))

        # Raw columns that are only needed to derive the analysis columns
        columns_to_drop = ['game_order','round','ply_count','file_name','date_played','utc_date','utc_time','date_created',
                           'white','black','winner','winner_elo','loser','loser_elo','winner_loser_elo_diff',
                           'event','white_elo','black_elo','white_rating_diff','black_rating_diff','white_title','black_title','eco']

        # Keep only games where BOTH ratings are known ("?" marks an unknown rating);
        # a single unknown rating cannot be converted to an integer.
        white_elo = pd.to_numeric(self.df['white_elo'], errors='coerce')
        black_elo = pd.to_numeric(self.df['black_elo'], errors='coerce')
        has_both_elos = white_elo.notna() & black_elo.notna()
        df = self.df.loc[has_both_elos].copy()
        white_elo = white_elo.loc[has_both_elos].astype(int)
        black_elo = black_elo.loc[has_both_elos].astype(int)

        # Games types
        event = df['event'].astype(str)
        is_casual = event.str.contains('Casual', regex=False, na=False)
        is_single_game = event.str.contains('game', regex=False, na=False)
        df['game_type'] = np.where(is_casual, 'Casual', 'Rated')
        df['chess_type'] = df['time_control'].apply(self.__chess_type)
        df['in_tournament'] = (~is_single_game).astype('int64')

        # Fill the nulls: missing rating changes count as 0, missing titles as "None"
        white_diff = pd.to_numeric(df['white_rating_diff'], errors='coerce').fillna(0).astype(int)
        black_diff = pd.to_numeric(df['black_rating_diff'], errors='coerce').fillna(0).astype(int)
        white_title = df['white_title'].astype(object).where(df['white_title'].notna(), "None")
        black_title = df['black_title'].astype(object).where(df['black_title'].notna(), "None")

        # Colors and Usernames (compared case-insensitively: the name typed by the
        # user may differ in letter case from the one stored in the games)
        me = str(self.username).lower()
        is_white = df['white'].astype(str).str.lower() == me
        df['color'] = np.where(is_white, 'white', 'black')
        df['opponent_username'] = df['black'].where(is_white, df['white'])
        df['opponent_title'] = black_title.where(is_white, white_title)

        # Elo and ratings
        df['my_elo'] = white_elo.where(is_white, black_elo).astype(int)
        df['opponent_elo'] = black_elo.where(is_white, white_elo).astype(int)
        df['elo_diff'] = df['my_elo'] - df['opponent_elo']
        my_rating_diff = white_diff.where(is_white, black_diff)
        df['rating_gained'] = my_rating_diff.clip(lower=0)
        df['rating_lost'] = (-my_rating_diff).clip(lower=0)

        # Games result and opening
        won = (df['winner'].astype(str).str.lower() == me).to_numpy()
        lost = (df['loser'].astype(str).str.lower() == me).to_numpy()
        df['result'] = np.select([won, lost], ['win', 'loss'], default='draw')
        opening_names = df['eco'].map(eco_to_name).astype(object)
        # "?" and codes missing from the reference file both become "Unknown"
        df['opening'] = opening_names.where(opening_names.notna(), "Unknown")

        # Date and time, expressed in the requested time zone
        utc_strings = df['utc_date'].astype(str) + " " + df['utc_time'].astype(str)
        local_times = pd.Series(
            [self.__utc_to_localtime(stamp, time_zone=time_zone) for stamp in utc_strings],
            index=df.index,
            dtype=object,
        )
        # ISO formatting keeps the date parse unambiguous on every platform
        df['date'] = pd.to_datetime(local_times.map(lambda moment: moment.strftime("%Y-%m-%d")),
                                    format="%Y-%m-%d")
        for column, fmt in (('year', "%Y"), ('month', "%b"), ('day', "%d"),
                            ('day_of_week', "%A"), ('hour', "%H")):
            df[column] = local_times.map(lambda moment, fmt=fmt: moment.strftime(fmt))

        # Drop the unnecessary columns
        df = df.drop(columns=columns_to_drop)

        # Reset the game_id column
        df['game_id'] = np.arange(0, df.shape[0])

        self.df = df
        return self.df

In [9]:
# Wrap the raw games table together with the account it belongs to
wrangler = Wrangle(df=df, username=USERNAME)

# Produce the analysis-ready table and report its dimensions
df_analysis = wrangler.wrangle()
analysis_shape = df_analysis.shape
print(f"Analysis DataFrame shape: {analysis_shape}")

Analysis DataFrame shape: (152, 24)


In [10]:
# Summarise the cleaned table: column names, non-null counts and dtypes
df_analysis.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 152 entries, 0 to 151
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   game_id            152 non-null    int32         
 1   site               152 non-null    object        
 2   result             152 non-null    object        
 3   termination        152 non-null    object        
 4   time_control       152 non-null    object        
 5   variant            152 non-null    object        
 6   game_type          152 non-null    object        
 7   chess_type         152 non-null    object        
 8   in_tournament      152 non-null    int64         
 9   color              152 non-null    object        
 10  opponent_username  152 non-null    object        
 11  opponent_title     152 non-null    object        
 12  my_elo             152 non-null    int32         
 13  opponent_elo       152 non-null    int32         
 14  elo_diff  