# Descriptive Statistics - Measures of Central Tendency and Variability

Perform the following operations on any open-source dataset:
1. Provide summary statistics (mean, median, minimum, maximum, standard deviation) for a dataset (age, income etc.) with numeric variables grouped by one of the qualitative (categorical) variable. For example, if your categorical variable is age groups and quantitative variable is income, then provide summary statistics of income grouped by the age groups. Create a list that contains a numeric value for each response to the categorical variable.

3. Write a Python program to display some basic statistical details like percentile, mean, standard deviation etc. of the species of ‘Iris-setosa’, ‘Iris-versicolor’ and ‘Iris- verginica’ of iris.csv dataset.

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

## Part I
Using nba.csv, a dataset of NBA players that includes variables indicating their performance and basic information.

In [58]:
data = pd.read_csv("nba.csv")

In [59]:
data.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [60]:
grouped_data = data.groupby("Team")
grouped_data.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
442,Trevor Booker,Utah Jazz,33.0,PF,28.0,6-8,228.0,Clemson,4775000.0
443,Trey Burke,Utah Jazz,3.0,PG,23.0,6-1,191.0,Michigan,2658240.0
444,Alec Burks,Utah Jazz,10.0,SG,24.0,6-6,214.0,Colorado,9463484.0
445,Dante Exum,Utah Jazz,11.0,PG,20.0,6-6,190.0,,3777720.0


In [61]:
summary_stats = grouped_data.describe(include="number")
print(summary_stats)

                       Number                                                 \
                        count       mean        std  min    25%   50%    75%   
Team                                                                           
Atlanta Hawks            15.0  19.000000  11.476684  0.0  11.50  17.0  25.50   
Boston Celtics           15.0  31.866667  30.300558  0.0   9.50  28.0  42.50   
Brooklyn Nets            15.0  18.266667  14.104035  0.0   8.00  15.0  27.00   
Charlotte Hornets        15.0  17.133333  16.672761  0.0   4.00  12.0  27.50   
Chicago Bulls            15.0  19.200000  17.193022  0.0   5.50  16.0  28.00   
Cleveland Cavaliers      15.0  14.466667  13.809245  0.0   4.50  12.0  21.50   
Dallas Mavericks         15.0  20.000000  16.252472  1.0   6.00  21.0  30.50   
Denver Nuggets           15.0  15.266667  19.655849  0.0   4.00   9.0  18.00   
Detroit Pistons          15.0  17.266667  15.303906  0.0   5.50  13.0  23.50   
Golden State Warriors    15.0  20.866667

## Part II
Using iris.csv, a dataset that includes features of 3 different species of the iris flower.

In [62]:
data = pd.read_csv("iris.csv")

In [63]:
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [64]:
data.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [65]:
data.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [66]:
numcols = [col for col in data.columns if (data[col].dtype=="float" or data[col].dtype=="int")]
numcols

['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

In [67]:
numeric = data._get_numeric_data()
numeric

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


### Mean

In [68]:
for i in numcols:
    print(i, ": ", data[i].mean())

sepal_length :  5.843333333333334
sepal_width :  3.0540000000000003
petal_length :  3.758666666666666
petal_width :  1.1986666666666668


In [69]:
for i in numcols:
    sum = 0
    for j in range(150):
        sum += data[i][j]
    print(i ,": ", sum/150)

sepal_length :  5.843333333333335
sepal_width :  3.0540000000000007
petal_length :  3.7586666666666693
petal_width :  1.1986666666666672


### Median

In [70]:

for i in numcols:
    print(i ,": ", data[i].median())

sepal_length :  5.8
sepal_width :  3.0
petal_length :  4.35
petal_width :  1.3


In [71]:
for i in numcols:
    sorted_list = sorted(list(data[i]))
    print(i,": ", sorted_list[len(sorted_list)//2])

sepal_length :  5.8
sepal_width :  3.0
petal_length :  4.4
petal_width :  1.3


### Mode

In [72]:
for i in range(5):
    print(data.columns[i], ": ", data.mode().values[0][i])

sepal_length :  5.0
sepal_width :  3.0
petal_length :  1.5
petal_width :  0.2
species :  Iris-setosa


### Percentile

In [74]:
for i in numcols:
    tile = np.percentile(data[i], 25)
    print(i, "25 tile : " ,tile)
    tile = np.percentile(data[i], 50)
    print(i, "50 tile : " ,tile)
    tile = np.percentile(data[i], 75)
    print(i, "75 tile : " ,tile)
    print()

sepal_length 25 tile :  5.1
sepal_length 50 tile :  5.8
sepal_length 75 tile :  6.4

sepal_width 25 tile :  2.8
sepal_width 50 tile :  3.0
sepal_width 75 tile :  3.3

petal_length 25 tile :  1.6
petal_length 50 tile :  4.35
petal_length 75 tile :  5.1

petal_width 25 tile :  0.3
petal_width 50 tile :  1.3
petal_width 75 tile :  1.8



### Variance

In [75]:
for i in numcols:
    variance = np.var(data[i])
    print(i, ": ",variance)

sepal_length :  0.6811222222222223
sepal_width :  0.18675066666666668
petal_length :  3.092424888888889
petal_width :  0.5785315555555555


### Standard Deviation

In [78]:
numeric.std()

sepal_length    0.828066
sepal_width     0.433594
petal_length    1.764420
petal_width     0.763161
dtype: float64

In [79]:
for i in numcols:
    sd = np.std(data[i])
    print(i,": ",sd)

sepal_length :  0.8253012917851409
sepal_width :  0.4321465800705435
petal_length :  1.7585291834055212
petal_width :  0.7606126185881716


### Grouping

In [80]:
grouped_data = data.groupby("species")

In [81]:
grouped_data.mean()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,5.006,3.418,1.464,0.244
Iris-versicolor,5.936,2.77,4.26,1.326
Iris-virginica,6.588,2.974,5.552,2.026


In [82]:
grouped_data.median()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,5.0,3.4,1.5,0.2
Iris-versicolor,5.9,2.8,4.35,1.3
Iris-virginica,6.5,3.0,5.55,2.0


In [84]:
grouped_data.min()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,4.3,2.3,1.0,0.1
Iris-versicolor,4.9,2.0,3.0,1.0
Iris-virginica,4.9,2.2,4.5,1.4


In [85]:
grouped_data.max()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,5.8,4.4,1.9,0.6
Iris-versicolor,7.0,3.4,5.1,1.8
Iris-virginica,7.9,3.8,6.9,2.5


In [86]:
grouped_data.std()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,0.35249,0.381024,0.173511,0.10721
Iris-versicolor,0.516171,0.313798,0.469911,0.197753
Iris-virginica,0.63588,0.322497,0.551895,0.27465
