In [1]:
import numpy as np 
import pandas as pd 

In [38]:
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import MinMaxScaler 

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [4]:
data = pd.read_csv('../input/top-women-chess-players/top_women_chess_players_aug_2020.csv')

In [5]:
data

Unnamed: 0,Fide id,Name,Federation,Gender,Year_of_birth,Title,Standard_Rating,Rapid_rating,Blitz_rating,Inactive_flag
0,700070,"Polgar, Judit",HUN,F,1976.0,GM,2675,2646.0,2736.0,wi
1,8602980,"Hou, Yifan",CHN,F,1994.0,GM,2658,2621.0,2601.0,
2,5008123,"Koneru, Humpy",IND,F,1987.0,GM,2586,2483.0,2483.0,
3,4147103,"Goryachkina, Aleksandra",RUS,F,1998.0,GM,2582,2502.0,2441.0,
4,700088,"Polgar, Susan",HUN,F,1969.0,GM,2577,,,wi
...,...,...,...,...,...,...,...,...,...,...
8548,3302288,"Reinkens, Natalia",BOL,F,,,1801,,,wi
8549,343960,"Saffova, Michaela",CZE,F,1994.0,,1801,1791.0,1765.0,
8550,5038294,"Shetye, Siddhali",IND,F,1992.0,,1801,1884.0,1824.0,wi
8551,2072491,"Trakru, Priya",USA,F,2001.0,WFM,1801,,,wi


## Preprocessing the data


In [6]:
data.drop(['Fide id' , 'Name' , 'Gender'] ,axis = 1 , inplace = True)

In [7]:
data

Unnamed: 0,Federation,Year_of_birth,Title,Standard_Rating,Rapid_rating,Blitz_rating,Inactive_flag
0,HUN,1976.0,GM,2675,2646.0,2736.0,wi
1,CHN,1994.0,GM,2658,2621.0,2601.0,
2,IND,1987.0,GM,2586,2483.0,2483.0,
3,RUS,1998.0,GM,2582,2502.0,2441.0,
4,HUN,1969.0,GM,2577,,,wi
...,...,...,...,...,...,...,...
8548,BOL,,,1801,,,wi
8549,CZE,1994.0,,1801,1791.0,1765.0,
8550,IND,1992.0,,1801,1884.0,1824.0,wi
8551,USA,2001.0,WFM,1801,,,wi


## Missing values correction 

In [8]:
data.isnull().any()

Federation         False
Year_of_birth       True
Title               True
Standard_Rating    False
Rapid_rating        True
Blitz_rating        True
Inactive_flag       True
dtype: bool

In [9]:
data.isnull().sum()

Federation            0
Year_of_birth       292
Title              5435
Standard_Rating       0
Rapid_rating       4945
Blitz_rating       5081
Inactive_flag      2701
dtype: int64

In [10]:
data.dtypes

Federation          object
Year_of_birth      float64
Title               object
Standard_Rating      int64
Rapid_rating       float64
Blitz_rating       float64
Inactive_flag       object
dtype: object

In [11]:
numerical_features = ['Year_of_birth' , "Rapid_rating" , 'Blitz_rating']

In [12]:
for col in numerical_features:
    data[col] = data[col].fillna(data[col].mean())
    

In [13]:
data.isnull().sum()

Federation            0
Year_of_birth         0
Title              5435
Standard_Rating       0
Rapid_rating          0
Blitz_rating          0
Inactive_flag      2701
dtype: int64

In [14]:
data['Title'].unique()

array(['GM', 'IM', 'WGM', 'FM', 'WFM', 'WIM', nan, 'CM', 'WCM', 'WH'],
      dtype=object)

In [15]:
data['Inactive_flag'].unique()

array(['wi', nan], dtype=object)

In [16]:
data['Inactive_flag'] = data['Inactive_flag'].fillna("wa")

In [18]:
data.isnull().sum()

Federation            0
Year_of_birth         0
Title              5435
Standard_Rating       0
Rapid_rating          0
Blitz_rating          0
Inactive_flag         0
dtype: int64

In [20]:
title_dummies = pd.get_dummies(data['Title'])
title_dummies

Unnamed: 0,CM,FM,GM,IM,WCM,WFM,WGM,WH,WIM
0,0,0,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
8548,0,0,0,0,0,0,0,0,0
8549,0,0,0,0,0,0,0,0,0
8550,0,0,0,0,0,0,0,0,0
8551,0,0,0,0,0,1,0,0,0


In [21]:
data = pd.concat([data , title_dummies['GM']] , axis = 1)

In [22]:
data

Unnamed: 0,Federation,Year_of_birth,Title,Standard_Rating,Rapid_rating,Blitz_rating,Inactive_flag,GM
0,HUN,1976.000000,GM,2675,2646.000000,2736.000000,wi,1
1,CHN,1994.000000,GM,2658,2621.000000,2601.000000,wa,1
2,IND,1987.000000,GM,2586,2483.000000,2483.000000,wa,1
3,RUS,1998.000000,GM,2582,2502.000000,2441.000000,wa,1
4,HUN,1969.000000,GM,2577,1931.680155,1925.155242,wi,1
...,...,...,...,...,...,...,...,...
8548,BOL,1985.291732,,1801,1931.680155,1925.155242,wi,0
8549,CZE,1994.000000,,1801,1791.000000,1765.000000,wa,0
8550,IND,1992.000000,,1801,1884.000000,1824.000000,wi,0
8551,USA,2001.000000,WFM,1801,1931.680155,1925.155242,wi,0


In [23]:
data.drop('Title',axis = 1 , inplace = True)

In [24]:
data

Unnamed: 0,Federation,Year_of_birth,Standard_Rating,Rapid_rating,Blitz_rating,Inactive_flag,GM
0,HUN,1976.000000,2675,2646.000000,2736.000000,wi,1
1,CHN,1994.000000,2658,2621.000000,2601.000000,wa,1
2,IND,1987.000000,2586,2483.000000,2483.000000,wa,1
3,RUS,1998.000000,2582,2502.000000,2441.000000,wa,1
4,HUN,1969.000000,2577,1931.680155,1925.155242,wi,1
...,...,...,...,...,...,...,...
8548,BOL,1985.291732,1801,1931.680155,1925.155242,wi,0
8549,CZE,1994.000000,1801,1791.000000,1765.000000,wa,0
8550,IND,1992.000000,1801,1884.000000,1824.000000,wi,0
8551,USA,2001.000000,1801,1931.680155,1925.155242,wi,0


In [25]:
data.isnull().sum()

Federation         0
Year_of_birth      0
Standard_Rating    0
Rapid_rating       0
Blitz_rating       0
Inactive_flag      0
GM                 0
dtype: int64

## Encoding the data

In [26]:
data['Inactive_flag'].unique()

array(['wi', 'wa'], dtype=object)

In [29]:
le = LabelEncoder()
data['Inactive_flag'] = le.fit_transform(data['Inactive_flag'])

In [30]:
data

Unnamed: 0,Federation,Year_of_birth,Standard_Rating,Rapid_rating,Blitz_rating,Inactive_flag,GM
0,HUN,1976.000000,2675,2646.000000,2736.000000,1,1
1,CHN,1994.000000,2658,2621.000000,2601.000000,0,1
2,IND,1987.000000,2586,2483.000000,2483.000000,0,1
3,RUS,1998.000000,2582,2502.000000,2441.000000,0,1
4,HUN,1969.000000,2577,1931.680155,1925.155242,1,1
...,...,...,...,...,...,...,...
8548,BOL,1985.291732,1801,1931.680155,1925.155242,1,0
8549,CZE,1994.000000,1801,1791.000000,1765.000000,0,0
8550,IND,1992.000000,1801,1884.000000,1824.000000,1,0
8551,USA,2001.000000,1801,1931.680155,1925.155242,1,0


In [31]:
data['Federation'].unique()

array(['HUN', 'CHN', 'IND', 'RUS', 'UKR', 'LTU', 'GEO', 'KAZ', 'IRI',
       'GER', 'SWE', 'BUL', 'TUR', 'GRE', 'AZE', 'FRA', 'ROU', 'USA',
       'MGL', 'POL', 'BLR', 'QAT', 'ESP', 'ENG', 'INA', 'ARM', 'CZE',
       'PER', 'SRB', 'NED', 'SCO', 'UZB', 'ITA', 'CUB', 'VIE', 'ECU',
       'AUS', 'ARG', 'CRO', 'SVK', 'SGP', 'ISR', 'LUX', 'SLO', 'EST',
       'CAN', 'LAT', 'AUT', 'SUI', 'MNC', 'MDA', 'BRA', 'BEL', 'COL',
       'PHI', 'PAR', 'BRU', 'MEX', 'BIH', 'MAS', 'NOR', 'MNE', 'TKM',
       'IRL', 'VEN', 'EGY', 'IRQ', 'FIN', 'BOL', 'DEN', 'MKD', 'KGZ',
       'ESA', 'CHI', 'RSA', 'FID', 'UAE', 'LBN', 'MYA', 'ISL', 'BAN',
       'POR', 'KSA', 'NAM', 'URU', 'ALG', 'WLS', 'PUR', 'ALB', 'KOR',
       'TJK', 'SRI', 'JAM', 'ANG', 'NGR', 'BAR', 'BER', 'ZIM', 'BOT',
       'JPN', 'DOM', 'CRC', 'SYR', 'GUA', 'SEY', 'JOR', 'NZL', 'MAR',
       'MAC', 'TTO', 'NCA', 'ZAM', 'PAN', 'THA', 'GCI', 'AHO', 'HKG',
       'MLT', 'HON', 'LBA', 'SUR', 'UGA', 'CPV', 'MAD'], dtype=object)

In [32]:
data.drop('Federation' , axis = 1 , inplace= True)

In [33]:
data.isnull().sum()

Year_of_birth      0
Standard_Rating    0
Rapid_rating       0
Blitz_rating       0
Inactive_flag      0
GM                 0
dtype: int64

In [34]:
data

Unnamed: 0,Year_of_birth,Standard_Rating,Rapid_rating,Blitz_rating,Inactive_flag,GM
0,1976.000000,2675,2646.000000,2736.000000,1,1
1,1994.000000,2658,2621.000000,2601.000000,0,1
2,1987.000000,2586,2483.000000,2483.000000,0,1
3,1998.000000,2582,2502.000000,2441.000000,0,1
4,1969.000000,2577,1931.680155,1925.155242,1,1
...,...,...,...,...,...,...
8548,1985.291732,1801,1931.680155,1925.155242,1,0
8549,1994.000000,1801,1791.000000,1765.000000,0,0
8550,1992.000000,1801,1884.000000,1824.000000,1,0
8551,2001.000000,1801,1931.680155,1925.155242,1,0


In [35]:
y = data['GM']
X = data.drop('GM' , axis = 1)

In [36]:
y

0       1
1       1
2       1
3       1
4       1
       ..
8548    0
8549    0
8550    0
8551    0
8552    0
Name: GM, Length: 8553, dtype: uint8

In [37]:
X

Unnamed: 0,Year_of_birth,Standard_Rating,Rapid_rating,Blitz_rating,Inactive_flag
0,1976.000000,2675,2646.000000,2736.000000,1
1,1994.000000,2658,2621.000000,2601.000000,0
2,1987.000000,2586,2483.000000,2483.000000,0
3,1998.000000,2582,2502.000000,2441.000000,0
4,1969.000000,2577,1931.680155,1925.155242,1
...,...,...,...,...,...
8548,1985.291732,1801,1931.680155,1925.155242,1
8549,1994.000000,1801,1791.000000,1765.000000,0
8550,1992.000000,1801,1884.000000,1824.000000,1
8551,2001.000000,1801,1931.680155,1925.155242,1


In [39]:
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X) , columns = X.columns) 

In [40]:
X

Unnamed: 0,Year_of_birth,Standard_Rating,Rapid_rating,Blitz_rating,Inactive_flag
0,0.622222,1.000000,1.000000,1.000000,1.0
1,0.822222,0.980549,0.982419,0.914394,0.0
2,0.744444,0.898169,0.885373,0.839569,0.0
3,0.866667,0.893593,0.898734,0.812936,0.0
4,0.544444,0.887872,0.497665,0.485831,1.0
...,...,...,...,...,...
8548,0.725464,0.000000,0.497665,0.485831,1.0
8549,0.822222,0.000000,0.398734,0.384274,0.0
8550,0.800000,0.000000,0.464135,0.421687,1.0
8551,0.900000,0.000000,0.497665,0.485831,1.0


## Training the model 

In [42]:
X_train , X_test , y_train , y_test = train_test_split(X , y , train_size = 0.8)

In [43]:
model = LogisticRegression() 
model.fit(X_train , y_train)

LogisticRegression()

In [45]:
print(f"Model Accuracy : {model.score(X_test , y_test)}") # we got a high accuracy but lets see f1 score as well

Model Accuracy : 0.9970777323202805


In [49]:
y_pred = model.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [50]:
print(f"Model F1 Score : {f1_score(y_test , y_pred)}" ) # our model is not doing a good job 

Model F1 Score : 0.0


In [51]:
y_test.sum() # -- > number of +ve example

5

In [52]:
len(y) # --> total number of training example

8553

In [53]:
# precentage of GM 
y_test.sum() / len(y)

0.00058459020226821