## Importing Libraries

In [16]:
%matplotlib inline 

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


## Loading csv file and reading with pandas

In [17]:
# (encoding = "ISO-8859-1") used to avoid encoding errors.

df = pd.read_csv("playerstatspergame.csv", encoding = "ISO-8859-1" )

In [18]:
df.head() # lets see the top 5 rows in df

Unnamed: 0,Player,username,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Álex Abrines,abrinal01,25,OKC,31,2,19.0,1.8,5.1,0.357,...,0.923,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3
1,Quincy Acy,acyqu01,28,PHO,10,0,12.3,0.4,1.8,0.222,...,0.7,0.3,2.2,2.5,0.8,0.1,0.4,0.4,2.4,1.7
2,Jaylen Adams,adamsja01,22,ATL,34,1,12.6,1.1,3.2,0.345,...,0.778,0.3,1.4,1.8,1.9,0.4,0.1,0.8,1.3,3.2
3,Steven Adams,adamsst01,25,OKC,80,80,33.4,6.0,10.1,0.595,...,0.5,4.9,4.6,9.5,1.6,1.5,1.0,1.7,2.6,13.9
4,Bam Adebayo,adebaba01,21,MIA,82,28,23.3,3.4,5.9,0.576,...,0.735,2.0,5.3,7.3,2.2,0.9,0.8,1.5,2.5,8.9


## Cleaning data


In [19]:
df.columns # The columns we have in our df

Index(['Player', 'username', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

In [20]:
# dropping come solumns that we won't use

df = df.drop(['username', 'GS', 'FG', 'FGA', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'STL', 'BLK', 'TOV', 'PF'], axis=1)

In [21]:
df.describe() # to see the summary of statistics pertaining to the DataFrame(df) columns

Unnamed: 0,Age,G,MP,FG%,TRB,AST,PTS
count,708.0,708.0,708.0,702.0,708.0,708.0,708.0
mean,26.142655,42.882768,19.265254,0.437301,3.557345,1.834605,8.384322
std,4.141178,26.282043,9.023905,0.109997,2.488335,1.642428,5.8282
min,19.0,1.0,0.7,0.0,0.0,0.0,0.0
25%,23.0,19.0,12.2,0.4,1.8,0.8,4.0
50%,26.0,44.0,19.1,0.434,3.0,1.3,7.0
75%,29.0,68.0,27.1,0.485,4.625,2.4,11.5
max,42.0,82.0,36.9,1.0,15.6,10.7,36.1


In [22]:
df.info() #information about the dataframe including the data types of each column and memory usage of the entire data.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 708 entries, 0 to 707
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  708 non-null    object 
 1   Age     708 non-null    int64  
 2   Tm      708 non-null    object 
 3   G       708 non-null    int64  
 4   MP      708 non-null    float64
 5   FG%     702 non-null    float64
 6   TRB     708 non-null    float64
 7   AST     708 non-null    float64
 8   PTS     708 non-null    float64
dtypes: float64(5), int64(2), object(2)
memory usage: 49.9+ KB


In [11]:
df[['Player', 'Age', 'Tm', 'G', 'MP', 'FG%', 'TRB', 'AST', 'PTS']].isnull().sum()  #finding whih columns has null values

Player    0
Age       0
Tm        0
G         0
MP        0
FG%       6
TRB       0
AST       0
PTS       0
dtype: int64

In [10]:
# getting rid of rows that has null values
df.dropna(how='any', inplace=True)
len(df)

702

In [11]:
# to see if we have any duplicate entries ( which are most probably because of changing team during mid-season)
number_of_duplicated_names= len(df)-df['Player'].nunique()
number_of_duplicated_names


176

In [17]:
# getting the means of the stats for duplicated names and then dropping them

df = df.groupby(['Player']).agg({'Age' : 'mean', 'G' : 'mean', 'MP' : 'mean', 'FG%' : 'mean' ,
                                  'TRB' : 'mean', 'AST' : 'mean', 'PTS' : 'mean'
                                    })
df.describe()

Unnamed: 0,Age,G,MP,FG%,TRB,AST,PTS
count,526.0,526.0,526.0,526.0,526.0,526.0,526.0
mean,25.904943,46.847592,19.44436,0.442154,3.629167,1.933001,8.662009
std,4.195064,25.7186,9.032682,0.106705,2.527942,1.772416,6.090713
min,19.0,1.0,0.7,0.0,0.0,0.0,0.0
25%,23.0,25.0,12.325,0.40125,1.866667,0.8,4.0125
50%,25.0,49.0,18.9375,0.438,3.1,1.3,7.0
75%,29.0,71.0,27.2,0.49075,4.7,2.5,11.675
max,42.0,82.0,36.9,1.0,15.6,10.7,36.1
