In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [5]:
# Read the all-star records from the comma-separated text file; row 0 supplies the column names
player_allstar_data = pd.read_csv(filepath_or_buffer='player_allstar.txt', header=0)
print(player_allstar_data)

           ilkid  year firstname      lastname conference leag  gp  minutes  \
0     AbdulKa01   1978     Karem  Abdul-Jabbar       west    N   1       28   
1     AbdulKa01   1969     Karem  Abdul-Jabbar       east    N   1       18   
2     AbdulKa01   1988    Kareem  Abdul-Jabbar       west    N   1       13   
3     AbdulKa01   1987    Kareem  Abdul-Jabbar       west    N   1       14   
4     AbdulKa01   1986    Kareem  Abdul-Jabbar       west    N   1       27   
...          ...   ...       ...           ...        ...  ...  ..      ...   
1457  YardlGe01   1956    George       Yardley       west    N   1       25   
1458  YardlGe01   1955    George       Yardley       west    N   1       19   
1459  YardlGe01   1954    George       Yardley       west    N   1       22   
1460  YardlGe01   1959    George       Yardley       east    N   1       16   
1461  ZasloMa01   1951       Max     Zaslofsky       east    N   1       25   

       pts  dreb  ...  stl  blk  turnover  pf   fga

In [6]:
# Summarise column dtypes and non-null counts.
# DataFrame.info() writes the report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
player_allstar_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 23 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ilkid       1462 non-null   object 
 1   year        1462 non-null   int64  
 2   firstname   1462 non-null   object 
 3   lastname    1462 non-null   object 
 4   conference  1462 non-null   object 
 5   leag        1462 non-null   object 
 6   gp          1462 non-null   int64  
 7   minutes     1462 non-null   int64  
 8   pts         1414 non-null   float64
 9   dreb        323 non-null    float64
 10  oreb        323 non-null    float64
 11  reb         1414 non-null   float64
 12  asts        1414 non-null   float64
 13  stl         228 non-null    float64
 14  blk         228 non-null    float64
 15  turnover    323 non-null    float64
 16  pf          370 non-null    float64
 17  fga         1414 non-null   float64
 18  fgm         1414 non-null   float64
 19  fta         1414 non-null  

In [7]:
# Flag duplicate rows: this yields a boolean Series with one entry per row,
# True where the row repeats an earlier row (not a single True/False for the whole frame)
player_allstar_data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1457    False
1458    False
1459    False
1460    False
1461    False
Length: 1462, dtype: bool

In [8]:
# Count the duplicate rows: summing the boolean mask counts its True entries (0 for this file)
player_allstar_data.duplicated().sum() 

0

In [9]:
# Remove incomplete records.
# axis='index' means rows (not columns) are removed, and how='any' removes a row
# as soon as a single one of its columns is null.
# NOTE(review): several columns are sparsely populated (stl and blk have only 228
# non-null values), so this keeps 228 of the 1462 rows — confirm that is intended.
df_records_dropped = player_allstar_data.dropna(
    axis='index',
    how='any',
)

# Re-check dtypes and non-null counts on the filtered frame
df_records_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 228 entries, 19 to 1447
Data columns (total 23 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ilkid       228 non-null    object 
 1   year        228 non-null    int64  
 2   firstname   228 non-null    object 
 3   lastname    228 non-null    object 
 4   conference  228 non-null    object 
 5   leag        228 non-null    object 
 6   gp          228 non-null    int64  
 7   minutes     228 non-null    int64  
 8   pts         228 non-null    float64
 9   dreb        228 non-null    float64
 10  oreb        228 non-null    float64
 11  reb         228 non-null    float64
 12  asts        228 non-null    float64
 13  stl         228 non-null    float64
 14  blk         228 non-null    float64
 15  turnover    228 non-null    float64
 16  pf          228 non-null    float64
 17  fga         228 non-null    float64
 18  fgm         228 non-null    float64
 19  fta         228 non-null   

In [10]:
# Show the records that remain in df_records_dropped (plain-text rendering of the frame)
print(df_records_dropped)

           ilkid  year firstname  lastname conference leag  gp  minutes   pts  \
19    ABDURSH01   2001   Shareef   A-Rahim       east    N   1       21   9.0   
26    AllenRa02   2001       Ray     Allen       east    N   1       25  15.0   
27    AllenRa02   2000       Ray     Allen       east    N   1       19  15.0   
28    AllenRa02   1999       Ray     Allen       east    N   1       17  14.0   
29    AllenRa02   2003       Ray     Allen       west    N   1       23  16.0   
...          ...   ...       ...       ...        ...  ...  ..      ...   ...   
1380  WebbeCh01   1996     Chris    Webber       east    N   1       14   2.0   
1438  WilliCh02   1972     Chuck  Williams       west    A   1       16   5.0   
1441  WilliJa01   1997    Jayson  Williams       east    N   1       19   4.0   
1446  WiseWi01    1972    Willie      Wise       west    A   1       37  26.0   
1447  WiseWi01    1973    Willie      Wise       west    A   1       25   8.0   

      dreb  ...  stl  blk  

In [12]:
# Average each player's numeric stats across all of their all-star rows.
# Rows are grouped by first name, last name and ilkid; reset_index() turns those
# three keys back into ordinary columns.
# numeric_only=True restricts the mean to numeric columns: the remaining text
# columns (conference, leag) cannot be averaged, and pandas >= 2.0 raises a
# TypeError on them instead of silently dropping them as older versions did.
# NOTE(review): firstname is part of the key, so inconsistent spellings under one
# ilkid (the raw file has both "Karem" and "Kareem" for AbdulKa01) would end up in
# separate groups — confirm whether grouping by ilkid alone is what is wanted.
df_grouped = (
    df_records_dropped
    .groupby(['firstname', 'lastname', 'ilkid'])
    .mean(numeric_only=True)
    .reset_index()
)
print(df_grouped)

    firstname   lastname       ilkid    year   gp  minutes   pts  dreb  oreb  \
0       Allan    Houston  HoustAl01   1999.5  1.0     16.5   8.0   1.5   0.0   
1       Allen    Iverson  IversAl01   2001.0  1.0     28.8  18.8   1.6   1.2   
2      Alonzo   Mourning  MournAl01   2000.0  1.0     21.5  14.0   3.5   1.5   
3      Andrei  Kirilenko  KirilAn01   2003.0  1.0     12.0   2.0   1.0   0.0   
4    Anfernee   Hardaway  HardaAn01   1997.0  1.0     12.0   6.0   0.0   0.0   
..        ...        ...         ...     ...  ...      ...   ...   ...   ...   
104    Warren     Jabali  JabalWa01   1972.5  1.0     27.5  11.0   1.5   1.5   
105    Willie       Wise  WiseWi01    1972.5  1.0     31.0  17.0   2.5   4.0   
106       Yao       Ming  MingYa01    2002.5  1.0     17.5   9.0   2.0   1.0   
107     Zelmo      Beaty  BeatyZe01   1972.0  1.0     15.0   6.0   4.0   0.0   
108  Zydrunas  Ilgauskas  IlgauZy01   2002.0  1.0      4.0   0.0   0.0   0.0   

     reb  ...  stl  blk  turnover   pf 

In [13]:
# Write the per-player averages out as comma-separated text, leaving out the row index
df_grouped.to_csv(path_or_buf='player_allstar_grouped.txt', index=False)