In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
import math
import scipy as stats
import scipy.stats

In [2]:
from stat1 import *

In [3]:
%matplotlib inline

In [4]:
class Sampling_Distibution(Probability_Distribution):
    """
    Stratified Sampling: use when each group has small variation
        within itself but there is a wide variation between the 
        groups.
        
    Cluster Sampling: use when there is considerable variation
        within each group but the groups are essentially similar
        to each other.
    """
    
    def __init__(self):
        pass
    
    
    def standard_error(self, sigma, n):
        """
        To  get sampling distribution standard deviation
        for infinite population or from finite sample with
        replacements
        
        sigma: population standard deviation
        n: sample size
        """
        return sigma/(math.sqrt(n))
    
    
    def probability(self, mu, sigma, n, N=0, X1=-math.inf, X2=math.inf, infinite=True):
        """
        mu = population mean
        sigma = population standard deviation
        infinte = bool, if true then find probaility for infinite 
        population, otherwise for finite population
        """
        if infinite is True:
            std_error = self.standard_error(sigma, n)
        elif infinite is False:
            std_error = self.standard_error_finite(sigma, N, n)
        print("std error:", std_error)
        
        return self.normal_distribution(mu, std_error, X1, X2)
    
    
    """
    CENTRAL LIMIT THEOREM:
    
    1. The mean of the sampling distribution of the mean
        will be equal to the population mean.
    2. As the sample size increases, the sampling distribution
        of the mean will approach normality.
        
    Significance:
        it permits use sample statics to make inferences about
        population parameterswithout knowing anything about
        the shape of the frequency distribution of that population
        other than what we can get from the sample.
    """
        
    def finite_population_multiplier(self, N, n):
        """
        N: Size of the population
        n: Size of sample
        
        Note: When sampling fraction(n/N) is less than 0.05, the
            finite population multiplier need not be used.
        """
        return math.sqrt((N-n)/(N-1))
    
    
    def standard_error_finite(self, sigma, N, n):
        """
        To  get sampling distribution standard deviation
        for finite sample without replacements
        
        sigma: population standard deviation
        N: Size of the population
        n: Size of sample
        """
        return self.standard_error(sigma, n) * self.finite_population_multiplier(N, n)
    

In [5]:
class Estimation(object):
    
    def __init__(self):
        pass

In [20]:
sd = Sampling_Distibution()

In [26]:
n = 15
sigma = 4
N = 70
mu = 18

x1 = np.array([-math.inf, 20])
x2 = np.array([15.5, math.inf])

res = []
for i in range(len(x1)):
    res.append(sd.probability(mu, sigma, n, N, x1[i], x2[i], infinite=False))
    
res

std error: 0.9220854330774994
-inf -2.71
a2
z string: 2.71
first, last: 2.7 1
std error: 0.9220854330774994
2.17 inf
a3
z string: 2.17
first, last: 2.1 7


[0.00336, 0.015]

In [23]:
sd.probability(mu, sigma, n,N, x1, x2, infinite=False)

std error: 0.17192507941463023
-1.74 2.33
a7
z string: 2.33
first, last: 2.3 3
z string: 1.74
first, last: 1.7 4
probabilities 1 & 2 0.4901 0.45907


0.94917

In [25]:
(5249/(4*265))**2

24.521182805268783

In [9]:
mu = 8.2
sigma = 2.1
N = 80
n = np.array([16,25,49])

x1 = 21
x2 = 23.5

print(sd.standard_error_finite(sigma, N, n))
print("-------------------------------")
print(sd.probability(mu, sigma, n, x1, x2, infinite=False))
print("-------------------------------")
print(sd.standard_error(sigma, n))

0.5340080120766217
-------------------------------
-1.87 2.81
a7
z string: 2.81
first, last: 2.8 1
z string: 1.87
first, last: 1.8 7
probabilities 1 & 2 0.49751999999999996 0.46926
0.96678
-------------------------------
0.64


In [14]:
n = np.array([35])
mu = 64
sigma = math.sqrt(17.6)
x1 = np.array([72,64,64,74,100])
x2 = np.array([math.inf, 72,64,math.inf,100])

prob = []
for i in range(len(n)):
    pr = []
    for j in range(len(x1)):
        try:
            pr.append(sd.probability(mu,sigma,n[i],x1[j],x2[j]))
        except:
            pr.append("Can't be determined")
        print("------------------------------")
    prob.append(pr)
prob

11.28 inf
a3
z string: 11.28
first, last: 11.2 8
------------------------------
0.0 11.28
a5
z string: 11.28
first, last: 11.2 8
------------------------------
0.0 0.0
------------------------------
14.1 inf
a3
z string: 14.1
first, last: 14.0 1
------------------------------
50.77 50.77
a6
z string: 50.77
first, last: 50.7 7
z string: 50.77
first, last: 50.7 7
probabilities 1 & 2 0.0 0.0
------------------------------


[[0.0, 0.0, None, 0.0, 0.0]]

In [14]:
sd.probability(150, 16,16, -math.inf, 160)

-inf 2.5
a1
z string: 2.5
first, last: 2.5 0


0.99379

In [25]:
sd.standard_error(12,60)

1.5491933384829668

In [10]:
mu = 8.2
sigma = 2.1
N = 80
n = np.array([16,25,49])

res = []
for i in n:
    res.append(sd.standard_error_finite(sigma, N, i))
    
res

[0.4725369183889301, 0.350442757925283, 0.1879267295948145]

In [17]:
a1 = np.arange(0.00, 4.10, 0.01)
a2 = np.arange(0.00, 4.10, 0.01)
# a1<a2

In [47]:
#file = 'cumulative_from_mean_0toZ.csv'
file = 'complementary_cumulative.csv'
df_file = pd.read_csv(file)
df_file.index = df_file['z']
df_file.drop('z', axis=1, inplace=True)

In [48]:
df_file.head()

Unnamed: 0_level_0,0,0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09
z,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,0.5,0.49601,0.49202,0.48803,0.48405,0.48006,0.47608,0.4721,0.46812,0.46414
0.1,0.46017,0.4562,0.45224,0.44828,0.44433,0.44038,0.4364,0.43251,0.42858,0.42465
0.2,0.42074,0.41683,0.41294,0.40905,0.40517,0.40129,0.39743,0.39358,0.38974,0.38591
0.3,0.38209,0.37828,0.37448,0.3707,0.36693,0.36317,0.35942,0.35569,0.35197,0.34827
0.4,0.34458,0.3409,0.33724,0.3336,0.32997,0.32636,0.32276,0.31918,0.31561,0.31207


In [49]:
dfr = df_file.round(2)
dfr1 = dfr[dfr == 0.05]

In [51]:
dfr1

Unnamed: 0_level_0,0,0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09
z,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,,,,,,,,,,
0.1,,,,,,,,,,
0.2,,,,,,,,,,
0.3,,,,,,,,,,
0.4,,,,,,,,,,
0.5,,,,,,,,,,
0.6,,,,,,,,,,
0.7,,,,,,,,,,
0.8,,,,,,,,,,
0.9,,,,,,,,,,


In [45]:
dfr.head()

Unnamed: 0_level_0,0,0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09
z,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,0.0,0.0,0.01,0.01,0.02,0.02,0.02,0.03,0.03,0.04
0.1,0.04,0.04,0.05,0.05,0.06,0.06,0.06,0.07,0.07,0.08
0.2,0.08,0.08,0.09,0.09,0.09,0.1,0.1,0.11,0.11,0.11
0.3,0.12,0.12,0.13,0.13,0.13,0.14,0.14,0.14,0.15,0.15
0.4,0.16,0.16,0.16,0.17,0.17,0.17,0.18,0.18,0.18,0.19


In [95]:
l1 = []
for i in dfr.columns:
    sd = dfr[dfr[i]==0.05].index.tolist()
    if len(sd)>0:
        t = (sd[0], i)
        l1.append(t)

l1

[(1.6, '0'),
 (1.6, '0.01'),
 (1.6, '0.02'),
 (1.6, '0.03'),
 (1.6, '0.04'),
 (1.6, '0.05'),
 (1.6, '0.06'),
 (1.6, '0.07'),
 (1.6, '0.08'),
 (1.6, '0.09')]

In [53]:
def standard_deviation_from_mean(probability):
    n = len(str(probability))-2
    
    file = 'complementary_cumulative.csv'
    df_file = pd.read_csv(file)
    df_file.index = df_file['z']
    df_file.drop('z', axis=1, inplace=True)
    
    dfr = df_file.round(n)
    
    l1 = []
    sd0 = set([])
    for i in dfr.columns:
        sd = dfr[dfr[i]==probability].index.tolist()
        print(sd)
        if len(sd)>0:
            for j in sd:
                #print("value: ", j, float(i))
                t = j + float(i)
                if i not in l1:
                    l1.append(round(t, 2))
            
              
    return np.array(sorted(l1))

In [54]:
standard_deviation_from_mean(0.05)

array([1.6 , 1.61, 1.62, 1.63, 1.64, 1.65, 1.66, 1.67, 1.68, 1.69])

In [62]:
#dfr[dfr == 0.02]
sd.standard_deviation_from_mean(0.082, 'complementary_cumulative.csv')

array([1.39])

In [63]:
cmf, cml=0.00000, 0.49998
cf, cl=0.50000, 0.99998
ccf, ccl = 0.50000, 0.00002
gzf, gzl =5.00000e-1, 3.07102e-1358

In [64]:
sorted([cmf, cml, cf, cl, ccf, ccl, gzf, gzl])

[0.0, 0.0, 2e-05, 0.49998, 0.5, 0.5, 0.5, 0.99998]

In [68]:
5.00000E-1

0.5

In [67]:
3.07102E-1358

0.0