# Medidas de posição e dispersão

### Base de dados

In [1]:
import numpy as np
import statistics
from scipy import stats
import math
import pandas as pd

In [67]:
dados = np.array([150,151,152,152,153,154,155,155,155,155,156,156,
                  157,158,159,158,160,160,160,160,160,161,161,161,161,
                  162,163,163,164,164,164,165,166,167,168,168,169,
                  170,172,173])

### Média aritmética simples

In [29]:
dados.sum() / len(dados)

160.45

In [30]:
dados.mean()

160.45

In [31]:
statistics.mean(dados)

160

### Moda

In [32]:
statistics.mode(dados)

160

In [33]:
stats.mode(dados)

ModeResult(mode=array([160]), count=array([5]))

### Mediana

In [10]:
dados_impar = [150,151,152,152,153,154,155,155,155]

#### Cálculo ímpar

In [35]:
posicao = len(dados_impar) / 2
posicao

4.5

In [36]:
posicao = math.ceil(posicao)
posicao

5

In [37]:
dados_impar[posicao - 1]

153

#### Cálculo par

In [38]:
posicao = len(dados) / 2
posicao = math.ceil(posicao)
posicao

20

In [40]:
dados[posicao - 1], dados[posicao]

(160, 160)

In [39]:
(dados[posicao - 1] + dados[posicao]) / 2

160.0

#### Biblioteca

In [41]:
np.median(dados_impar)

153.0

In [42]:
np.median(dados)

160.0

In [43]:
statistics.median(dados_impar)

153

In [44]:
statistics.median(dados)

160.0

### Média aritmética ponderada

In [46]:
notas = np.array([9,8,7,3])
pesos = np.array([1,2,3,4])

In [47]:
media_ponderada = (notas * pesos).sum() / pesos.sum()
media_ponderada

5.8

In [48]:
np.average(notas, weights=pesos)

5.8

### Média aritmética, moda, mediana com distribuição de frequência(dados agrupados)

In [40]:
dados = {'inferior': [150,154,158,162,166,170],
         'superior': [154,158,162,166,170,174],
         'fi'      : [5,9,11,7,5,3]}

In [41]:
df = pd.DataFrame(dados)
df

Unnamed: 0,inferior,superior,fi
0,150,154,5
1,154,158,9
2,158,162,11
3,162,166,7
4,166,170,5
5,170,174,3


In [42]:
df['xi'] = (df['superior'] + df['inferior']) / 2
df

Unnamed: 0,inferior,superior,fi,xi
0,150,154,5,152.0
1,154,158,9,156.0
2,158,162,11,160.0
3,162,166,7,164.0
4,166,170,5,168.0
5,170,174,3,172.0


In [43]:
df['fixi'] = df['fi'] * df['xi']
df

Unnamed: 0,inferior,superior,fi,xi,fixi
0,150,154,5,152.0,760.0
1,154,158,9,156.0,1404.0
2,158,162,11,160.0,1760.0
3,162,166,7,164.0,1148.0
4,166,170,5,168.0,840.0
5,170,174,3,172.0,516.0


In [44]:
df['Fi'] = 0
df

Unnamed: 0,inferior,superior,fi,xi,fixi,Fi
0,150,154,5,152.0,760.0,0
1,154,158,9,156.0,1404.0,0
2,158,162,11,160.0,1760.0,0
3,162,166,7,164.0,1148.0,0
4,166,170,5,168.0,840.0,0
5,170,174,3,172.0,516.0,0


In [45]:
frequencia_acumulada = []
somatorio = 0
for linha in df.iterrows():
    somatorio += linha[1][2]
    frequencia_acumulada.append(somatorio)

In [46]:
frequencia_acumulada

[5.0, 14.0, 25.0, 32.0, 37.0, 40.0]

In [47]:
df['Fi'] = frequencia_acumulada
df

Unnamed: 0,inferior,superior,fi,xi,fixi,Fi
0,150,154,5,152.0,760.0,5.0
1,154,158,9,156.0,1404.0,14.0
2,158,162,11,160.0,1760.0,25.0
3,162,166,7,164.0,1148.0,32.0
4,166,170,5,168.0,840.0,37.0
5,170,174,3,172.0,516.0,40.0


### Média

In [48]:
df['fixi'].sum() / df['fi'].sum()

160.7

### Moda

In [49]:
df['fi'].max()

11

In [50]:
df[df['fi'] == df['fi'].max()]

Unnamed: 0,inferior,superior,fi,xi,fixi,Fi
2,158,162,11,160.0,1760.0,25.0


In [51]:
df[df['fi'] == df['fi'].max()]['xi'].values[0]

160.0

### Mediana

In [52]:
df

Unnamed: 0,inferior,superior,fi,xi,fixi,Fi
0,150,154,5,152.0,760.0,5.0
1,154,158,9,156.0,1404.0,14.0
2,158,162,11,160.0,1760.0,25.0
3,162,166,7,164.0,1148.0,32.0
4,166,170,5,168.0,840.0,37.0
5,170,174,3,172.0,516.0,40.0


In [53]:
fi_2 = df['fi'].sum() / 2
fi_2

20.0

In [54]:
limite_inferior, frequencia_classe, id_frequencia_anterior = 0,0,0
for linha in df.iterrows():
    limite_inferior = linha[1][0]
    frequencia_classe = linha[1][2]
    id_frequencia_anterior = linha[0]
    if linha[1][5] >= fi_2:
        id_frequencia_anterior -=1
        break

In [55]:
limite_inferior, frequencia_classe, id_frequencia_anterior 

(158.0, 11.0, 1)

In [56]:
Fi_anterior = df.iloc[[id_frequencia_anterior]]['Fi'].values[0]
Fi_anterior

14.0

In [57]:
mediana = limite_inferior + ((fi_2 - Fi_anterior) * 4) / frequencia_classe  
mediana

160.1818181818182

### Média Geométrica, harmônica e quadrática

#### Média geométrica

In [58]:
from scipy.stats.mstats import gmean

In [68]:
gmean(dados)

160.34592323257766

#### Média harmônica

In [None]:
from scipy.stats.mstats import hmean

In [None]:
hmean(dados)

#### Média quadrática

In [60]:
def quadratic_mean(dados):
    return math.sqrt(sum(n * n for n in dados) / len(dados))

In [69]:
quadratic_mean(dados)

160.5545078781658

### Quartis

In [83]:
dados_impar

[150, 151, 152, 152, 153, 154, 155, 155, 155]

#### Cálculo manual

In [84]:
np.median(dados_impar)

153.0

In [85]:
posicao_mediana = len(dados_impar) / 2
posicao_mediana = math.floor(posicao_mediana)
posicao_mediana

4

In [86]:
esquerda = dados_impar[0:posicao_mediana]
esquerda

[150, 151, 152, 152]

In [87]:
np.median(esquerda)

151.5

In [88]:
direita = dados_impar[posicao_mediana + 1:]
direita

[154, 155, 155, 155]

In [89]:
np.median(direita)

155.0

### Bibliotecas

In [90]:
np.quantile(dados_impar, 0.5)

153.0

In [91]:
np.quantile(dados_impar, 0.75)

155.0

In [92]:
np.quantile(dados_impar, 0.25)

152.0

In [93]:
np.quantile(dados, 0.25)

155.75

In [94]:
np.quantile(dados, 0.5)

160.0

In [95]:
np.quantile(dados, 0.75)

164.0

### scipy

In [96]:
stats.scoreatpercentile(dados, 25)

155.75

In [97]:
stats.scoreatpercentile(dados, 50)

160.0

In [98]:
stats.scoreatpercentile(dados, 75)

164.0

In [100]:
df_dados = pd.DataFrame(dados)
df_dados

Unnamed: 0,0
0,150
1,151
2,152
3,152
4,153
5,154
6,155
7,155
8,155
9,155


In [103]:
df_dados.quantile([0.25,0.5,0.75])

Unnamed: 0,0
0.25,155.75
0.5,160.0
0.75,164.0


In [104]:
df_dados.describe()

Unnamed: 0,0
count,40.0
mean,160.45
std,5.865807
min,150.0
25%,155.75
50%,160.0
75%,164.0
max,173.0


### Quartis com distribuição de frequência (dados agrupados)

In [105]:
df

Unnamed: 0,inferior,superior,fi,xi,fixi,Fi
0,150,154,5,152.0,760.0,5.0
1,154,158,9,156.0,1404.0,14.0
2,158,162,11,160.0,1760.0,25.0
3,162,166,7,164.0,1148.0,32.0
4,166,170,5,168.0,840.0,37.0
5,170,174,3,172.0,516.0,40.0


In [106]:
def get_quartil(df, q1 = True):
    if q1 == True:
        fi_4 = df['fi'].sum() / 4
    else:
        fi_4 = (3 * df['fi'].sum()) / 4
    
    limite_inferior, frequencia_classe, id_frequencia_anterior = 0,0,0
    for linha in df.iterrows():
        limite_inferior = linha[1][0]
        frequencia_classe = linha[1][2]
        id_frequencia_anterior = linha[0]
        if linha[1][5] >= fi_4:
            id_frequencia_anterior -=1
            break
    Fi_anterior = df.iloc[[id_frequencia_anterior]]['Fi'].values[0]
    q = limite_inferior + ((fi_4 - Fi_anterior) * 4) / frequencia_classe
    return q

In [108]:
get_quartil(df)

156.22222222222223

In [109]:
get_quartil(df, False)

164.85714285714286

### Percentis

In [110]:
dados

array([150, 151, 152, 152, 153, 154, 155, 155, 155, 155, 156, 156, 157,
       158, 159, 158, 160, 160, 160, 160, 160, 161, 161, 161, 161, 162,
       163, 163, 164, 164, 164, 165, 166, 167, 168, 168, 169, 170, 172,
       173])

In [111]:
np.median(dados)

160.0

In [112]:
np.quantile(dados, 0.05)

151.95

In [115]:
np.percentile(dados, 5), np.percentile(dados, 10),np.percentile(dados, 90)

(151.95, 152.9, 168.1)

In [116]:
stats.scoreatpercentile(dados, 5), stats.scoreatpercentile(dados, 10), stats.scoreatpercentile(dados, 90)

(151.95000000000002, 152.89999999999998, 168.1)

In [118]:
df_dados.head()

Unnamed: 0,0
0,150
1,151
2,152
3,152
4,153


In [120]:
df_dados.quantile([0.05,.1,0.9])

Unnamed: 0,0
0.05,151.95
0.1,152.9
0.9,168.1


### Medidas de dispersão

In [65]:
dados

array([150, 151, 152, 152, 153, 154, 155, 155, 155, 155, 156, 156, 157,
       158, 159, 158, 160, 160, 160, 160, 160, 161, 161, 161, 161, 162,
       163, 163, 164, 164, 164, 165, 166, 167, 168, 168, 169, 170, 172,
       173])

#### Amplitude total e diferença interquartil

In [5]:
dados.max() - dados.min()

23

In [6]:
q1 = np.quantile(dados, .25)
q3 = np.quantile(dados, .75)
q1, q3

(155.75, 164.0)

In [7]:
diferenca_interquartil = q3 - q1
diferenca_interquartil

8.25

In [8]:
inferior = q1 - (1.5 * diferenca_interquartil)
inferior

143.375

In [9]:
superior = q3 + (1.5 * diferenca_interquartil)
superior

176.375

#### Variância e Desvio Padrão

In [11]:
dados_impar

[150, 151, 152, 152, 153, 154, 155, 155, 155]

In [13]:
media = np.mean(dados_impar)
media

153.0

In [15]:
desvio = abs(dados_impar - media)
desvio

array([3., 2., 1., 1., 0., 1., 2., 2., 2.])

In [16]:
desvio = desvio ** 2
desvio

array([9., 4., 1., 1., 0., 1., 4., 4., 4.])

In [17]:
soma_desvio = desvio.sum()

In [18]:
v = soma_desvio / len(dados_impar)
v

3.111111111111111

In [19]:
variancia = np.var(dados_impar)
variancia

3.111111111111111

In [21]:
np.var(dados)

33.5475

In [22]:
statistics.variance(dados)

34

In [23]:
from scipy import ndimage
ndimage.variance(dados)

33.5475

In [30]:
dp = math.sqrt(variancia)
dp

1.7638342073763937

In [26]:
np.std(dados_impar)

1.7638342073763937

In [27]:
np.std(dados)

5.792020372892347

In [29]:
statistics.stdev(dados)

5.830951894845301

#### Coeficiente de variação

In [31]:
cv = (dp/media) * 100
cv

1.1528328152786886

In [32]:
dp_dados = np.std(dados)

In [33]:
cv_dados = (dp_dados/np.mean(dados)) * 100
cv_dados

3.609860001802647

In [37]:
stats.variation(dados_impar) * 100

1.1528328152786886

In [39]:
stats.variation(dados) * 100

3.609860001802647

In [66]:
df

Unnamed: 0,inferior,superior,fi,xi,fixi,Fi
0,150,154,5,152.0,760.0,5.0
1,154,158,9,156.0,1404.0,14.0
2,158,162,11,160.0,1760.0,25.0
3,162,166,7,164.0,1148.0,32.0
4,166,170,5,168.0,840.0,37.0
5,170,174,3,172.0,516.0,40.0


In [70]:
df['xi2'] = df['xi'] ** 2
df

Unnamed: 0,inferior,superior,fi,xi,fixi,Fi,xi2
0,150,154,5,152.0,760.0,5.0,23104.0
1,154,158,9,156.0,1404.0,14.0,24336.0
2,158,162,11,160.0,1760.0,25.0,25600.0
3,162,166,7,164.0,1148.0,32.0,26896.0
4,166,170,5,168.0,840.0,37.0,28224.0
5,170,174,3,172.0,516.0,40.0,29584.0


In [71]:
df['fi.xi2'] = df['fi'] * df['xi2']
df

Unnamed: 0,inferior,superior,fi,xi,fixi,Fi,xi2,fi.xi2
0,150,154,5,152.0,760.0,5.0,23104.0,115520.0
1,154,158,9,156.0,1404.0,14.0,24336.0,219024.0
2,158,162,11,160.0,1760.0,25.0,25600.0,281600.0
3,162,166,7,164.0,1148.0,32.0,26896.0,188272.0
4,166,170,5,168.0,840.0,37.0,28224.0,141120.0
5,170,174,3,172.0,516.0,40.0,29584.0,88752.0


In [73]:
df.columns

Index(['inferior', 'superior', 'fi', 'xi', 'fixi', 'Fi', 'xi2', 'fi.xi2'], dtype='object')

In [74]:
colunas_ordenadas = ['inferior', 'superior', 'fi', 'xi', 'fixi', 'xi2', 'fi.xi2', 'Fi']

In [75]:
df = df[colunas_ordenadas]
df

Unnamed: 0,inferior,superior,fi,xi,fixi,xi2,fi.xi2,Fi
0,150,154,5,152.0,760.0,23104.0,115520.0,5.0
1,154,158,9,156.0,1404.0,24336.0,219024.0,14.0
2,158,162,11,160.0,1760.0,25600.0,281600.0,25.0
3,162,166,7,164.0,1148.0,26896.0,188272.0,32.0
4,166,170,5,168.0,840.0,28224.0,141120.0,37.0
5,170,174,3,172.0,516.0,29584.0,88752.0,40.0


In [76]:
dp = math.sqrt(df['fi.xi2'].sum() / df['fi'].sum() - math.pow(df['fixi'].sum() / df['fi'].sum(), 2))

In [77]:
dp

5.719265687131764

### Testes com algoritmos de classificação

In [2]:
df = pd.read_csv('../../dados/credit_data.csv')

In [3]:
df.dropna(inplace=True)
df.shape

(1997, 5)

In [4]:
df.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [5]:
X = df.iloc[:,1:4].values
y = df.iloc[:,-1].values

In [6]:
X.shape

(1997, 3)

In [7]:
y.shape

(1997,)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes  import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [14]:
resultados_naive_bayes = []
resultados_logistica = []
resultados_forest = []
for i in range(30):
    X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size=0.2, stratify=y, random_state=i)
    
    naive_bayes = GaussianNB()
    naive_bayes.fit(X_treino, y_treino)
    resultados_naive_bayes.append(accuracy_score(y_teste, naive_bayes.predict(X_teste)))
    
    logistica = LogisticRegression()
    logistica.fit(X_treino, y_treino)
    resultados_logistica.append(accuracy_score(y_teste, logistica.predict(X_teste)))
    
    forest = RandomForestClassifier()
    forest.fit(X_treino, y_treino)
    resultados_forest.append(accuracy_score(y_teste, forest.predict(X_teste)))

In [15]:
print(resultados_forest)

[0.975, 0.9875, 0.9875, 0.9925, 0.98, 0.9825, 0.9875, 0.98, 0.9775, 0.985, 0.9825, 0.9775, 0.975, 0.985, 0.98, 0.9725, 0.985, 0.9825, 0.9925, 0.985, 0.98, 0.9775, 0.9825, 0.985, 0.985, 0.995, 0.9925, 0.985, 0.985, 0.99]


In [16]:
print(resultados_logistica)

[0.9325, 0.91, 0.9125, 0.9225, 0.9075, 0.89, 0.91, 0.9075, 0.8775, 0.915, 0.9175, 0.9, 0.925, 0.9175, 0.9025, 0.9125, 0.9525, 0.91, 0.9225, 0.9075, 0.925, 0.905, 0.9075, 0.945, 0.9225, 0.9275, 0.9225, 0.9175, 0.91, 0.9]


In [17]:
print(resultados_naive_bayes)

[0.925, 0.925, 0.9325, 0.925, 0.92, 0.905, 0.9175, 0.9175, 0.9125, 0.9325, 0.9225, 0.9125, 0.935, 0.9175, 0.925, 0.9175, 0.9275, 0.92, 0.9325, 0.92, 0.93, 0.905, 0.9175, 0.9325, 0.9425, 0.9375, 0.94, 0.92, 0.935, 0.925]


In [18]:
resultados_naive_bayes = np.array(resultados_naive_bayes)
resultados_logistica = np.array(resultados_logistica)
resultados_forest = np.array(resultados_forest)

### Média

In [19]:
resultados_naive_bayes.mean()

0.92425

In [20]:
resultados_logistica.mean()

0.9145

In [21]:
resultados_forest.mean()

0.9836666666666666

### Moda

In [24]:
stats.mode(resultados_naive_bayes)

ModeResult(mode=array([0.9175]), count=array([5]))

In [25]:
stats.mode(resultados_logistica)

ModeResult(mode=array([0.9075]), count=array([4]))

In [26]:
stats.mode(resultados_forest)

ModeResult(mode=array([0.985]), count=array([8]))

### Mediana

In [27]:
np.median(resultados_forest)

0.985

In [28]:
np.median(resultados_logistica)

0.9125

In [29]:
np.median(resultados_naive_bayes)

0.925

### Variância

In [34]:
np.set_printoptions(suppress=True)

In [35]:
np.var(resultados_forest)

3.0305555555555615e-05

In [36]:
np.var(resultados_naive_bayes)

8.756250000000001e-05

In [37]:
np.var(resultados_logistica)

0.00020933333333333337

In [38]:
np.min([0.00020933333333333337,8.756250000000001e-05,3.0305555555555615e-05])

3.0305555555555615e-05

In [39]:
np.max([0.00020933333333333337,8.756250000000001e-05,3.0305555555555615e-05])

0.00020933333333333337

### Desvio Padrão

In [41]:
np.std(resultados_forest)

0.005505048188304587

In [42]:
np.std(resultados_logistica)

0.014468356276140472

In [43]:
np.std(resultados_naive_bayes)

0.00935748363610645

### Coeficiente de variação

In [44]:
stats.variation(resultados_naive_bayes) * 100

1.0124407504578252

In [45]:
stats.variation(resultados_forest) * 100

0.5596456985738313

In [46]:
stats.variation(resultados_logistica) * 100

1.5821056616884057

### Validação Cruzada

In [47]:
from sklearn.model_selection import cross_val_score, KFold

In [48]:
resultados_naive_bayes_cv = []
resultados_logistica_cv = []
resultados_forest_cv = []

for i in range(30):
    kfold = KFold(n_splits=10, shuffle=True, random_state=i)
    
    naive_bayes = GaussianNB()
    scores = cross_val_score(naive_bayes, X, y, cv=kfold)
    resultados_naive_bayes_cv.append(scores.mean())
    
    logistica = LogisticRegression()
    scores = cross_val_score(logistica, X, y, cv=kfold)
    resultados_logistica_cv.append(scores.mean())
    
    forest = RandomForestClassifier()
    scores = cross_val_score(forest, X, y, cv=kfold)
    resultados_forest_cv.append(scores.mean())

In [49]:
scores, 10 * 30

(array([0.99      , 0.98      , 0.985     , 0.99      , 0.98      ,
        0.985     , 0.99      , 0.98492462, 0.98492462, 1.        ]),
 300)

In [50]:
scores.mean()

0.9869849246231157

In [57]:
stats.variation(resultados_forest_cv) * 100, stats.variation(resultados_logistica_cv) * 100, stats.variation(resultados_naive_bayes_cv) * 100

(0.12464506440504582, 0.38801026116292653, 0.08641071566366061)

In [60]:
stats.variation(resultados_forest) * 100, stats.variation(resultados_logistica) * 100, stats.variation(resultados_naive_bayes) * 100

(0.5596456985738313, 1.5821056616884057, 1.0124407504578252)

### Seleção de atributos utilizando variância

In [61]:
base_selecao = {'a': np.random.rand(20),
                'b': np.array([0.5] * 20),
                'classe': np.random.randint(0,2, size = 20)}

In [62]:
df = pd.DataFrame(base_selecao)
df

Unnamed: 0,a,b,classe
0,0.805022,0.5,0
1,0.395185,0.5,1
2,0.886563,0.5,0
3,0.40788,0.5,1
4,0.203137,0.5,0
5,0.690275,0.5,0
6,0.86078,0.5,0
7,0.41372,0.5,0
8,0.863417,0.5,0
9,0.30746,0.5,1


In [64]:
df.describe()

Unnamed: 0,a,b,classe
count,20.0,20.0,20.0
mean,0.516692,0.5,0.3
std,0.256757,0.0,0.470162
min,0.045514,0.5,0.0
25%,0.317694,0.5,0.0
50%,0.41094,0.5,0.0
75%,0.77511,0.5,1.0
max,0.886563,0.5,1.0


In [66]:
np.var(df['a']), np.var(df['b'])

(0.06262815209959895, 0.0)

In [67]:
X = df.iloc[:,0:2].values

In [68]:
from sklearn.feature_selection import VarianceThreshold

In [69]:
selecao = VarianceThreshold(threshold=0.05)
X_novo = selecao.fit_transform(X)
X_novo

array([[0.80502234],
       [0.3951853 ],
       [0.88656347],
       [0.40788002],
       [0.20313672],
       [0.69027512],
       [0.86078046],
       [0.41372011],
       [0.86341686],
       [0.30746011],
       [0.38970412],
       [0.3056195 ],
       [0.3211058 ],
       [0.27076052],
       [0.76513907],
       [0.83974986],
       [0.48263505],
       [0.0455143 ],
       [0.40815892],
       [0.6720039 ]])

In [70]:
selecao.variances_

array([0.06262815, 0.        ])

In [75]:
indices = np.where(selecao.variances_ > 0.05)
indices

(array([0], dtype=int64),)