In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
import math
import scipy as stats
import scipy.stats

In [2]:
from stat1 import *

In [3]:
%matplotlib inline

In [4]:
class Sampling_Distibution(Probability_Distribution):
    """
    Helpers for the sampling distribution of the mean: standard error
    (with and without the finite population multiplier) and the
    probability that a sample mean falls in a given range.

    Stratified Sampling: use when each group has small variation
        within itself but there is a wide variation between the
        groups.

    Cluster Sampling: use when there is considerable variation
        within each group but the groups are essentially similar
        to each other.
    """

    def __init__(self):
        pass

    def standard_error(self, sigma, n):
        """
        Standard deviation of the sampling distribution of the mean
        for an infinite population, or for a finite population
        sampled with replacement.

        sigma: population standard deviation
        n: sample size

        Returns sigma / sqrt(n).
        """
        return sigma / math.sqrt(n)

    def probability(self, mu, sigma, n, N=0, X1=-math.inf, X2=math.inf, infinite=True):
        """
        Probability that the sample mean lies between X1 and X2.

        mu: population mean
        sigma: population standard deviation
        n: sample size
        N: population size (only used when ``infinite`` is falsy)
        X1, X2: lower and upper bounds of the interval
        infinite: if truthy, use the infinite-population standard
            error; otherwise apply the finite population multiplier

        The computed standard error is printed, then the result of
        ``self.normal_distribution(mu, std_error, X1, X2)`` is returned
        (``normal_distribution`` is not defined in this class;
        presumably inherited from Probability_Distribution).
        """
        # Plain truthiness instead of `is True` / `is False`: identity
        # checks left `std_error` unbound for values such as numpy.bool_ or 1.
        if infinite:
            std_error = self.standard_error(sigma, n)
        else:
            std_error = self.standard_error_finite(sigma, N, n)
        print("std error:", std_error)

        return self.normal_distribution(mu, std_error, X1, X2)

    # CENTRAL LIMIT THEOREM:
    #
    # 1. The mean of the sampling distribution of the mean
    #    will be equal to the population mean.
    # 2. As the sample size increases, the sampling distribution
    #    of the mean will approach normality.
    #
    # Significance:
    #    it permits us to use sample statistics to make inferences
    #    about population parameters without knowing anything about
    #    the shape of the frequency distribution of that population
    #    other than what we can get from the sample.

    def finite_population_multiplier(self, N, n):
        """
        Finite population multiplier sqrt((N - n) / (N - 1)).

        N: Size of the population
        n: Size of sample

        Raises ValueError when N <= 1 or n > N, where the multiplier is
        undefined.

        Note: When sampling fraction (n/N) is less than 0.05, the
            finite population multiplier need not be used.
        """
        # Without this guard N=0 (the default used by `probability`)
        # silently yields sqrt(n), and N=1 divides by zero.
        if N <= 1 or n > N:
            raise ValueError(
                "finite population multiplier needs N > 1 and n <= N "
                "(got N=%r, n=%r)" % (N, n)
            )
        return math.sqrt((N - n) / (N - 1))

    # Original note: statisticians recommend that in estimation n be
    # large enough ("at least n/N > 0.5") to use the normal distribution
    # as a substitute for the binomial distribution.
    # NOTE(review): the usual textbook rule is that both n*p and n*q be
    # at least 5 -- confirm which was intended.

    def standard_error_finite(self, sigma, N, n):
        """
        Standard deviation of the sampling distribution of the mean
        for a finite population sampled without replacement.

        sigma: population standard deviation
        N: Size of the population
        n: Size of sample

        Use this when n/N > 0.05; otherwise use standard_error.
        """
        return self.standard_error(sigma, n) * self.finite_population_multiplier(N, n)
    

In [108]:
class Estimation(Sampling_Distibution):
    """
    Point and interval estimation built on the sampling-distribution
    helpers of Sampling_Distibution.
    """

    def __init__(self, x):
        """
        x = np.array of samples
        """
        self.sample = x

    def point_estimate_mean(self):
        """Sample mean of ``self.sample``."""
        return (self.sample.sum()) / len(self.sample)

    def point_estimate_variance(self):
        """Sample variance of ``self.sample`` (divides by n - 1)."""
        deviations = self.sample - self.point_estimate_mean()
        return (deviations ** 2).sum() / (len(self.sample) - 1)

    def point_estimate_deviation(self):
        """Sample standard deviation: square root of point_estimate_variance."""
        return math.sqrt(self.point_estimate_variance())

    def interval_estimate(self, mu, sigma, n, N=0, infinite=True):
        """
        Intervals of one, two and three standard errors around ``mu``.

        mu: centre of the intervals
        sigma: population standard deviation
        n: sample size
        N: population size (only used when ``infinite`` is falsy)
        infinite: if truthy use standard_error, otherwise
            standard_error_finite

        Returns a list of three (lower, upper) tuples.

        68-95-99.7 rule:

        In statistics, the 68-95-99.7 rule, also known as the empirical rule,
        is a shorthand used to remember the percentage of values that lie
        within a band around the mean in a normal distribution with a width
        of two, four and six standard deviations, respectively; more accurately,
        68.27%, 95.45% and 99.73% of the values lie within one, two and three
        standard deviations of the mean, respectively.

        In mathematical notation, these facts can be expressed as follows, where
        X is an observation from a normally distributed random variable, mu is
        the mean of the distribution, and sigma is its standard deviation:

        Pr(mu - sigma <= X <= mu + sigma) ~ 0.6827
        Pr(mu - 2*sigma <= X <= mu + 2*sigma) ~ 0.9545
        Pr(mu - 3*sigma <= X <= mu + 3*sigma) ~ 0.9973
        """
        # The standard error does not depend on the multiplier, so compute
        # it once. Truthiness (not `is True`/`is False`) avoids an unbound
        # variable for values such as numpy.bool_.
        if infinite:
            std_err = self.standard_error(sigma, n)
        else:
            std_err = self.standard_error_finite(sigma, N, n)

        return [(mu - k * std_err, mu + k * std_err) for k in range(1, 4)]

    def standard_error_from_mean(self, confidence_lavel):
        """
        Number of standard errors from the mean for a confidence level.

        confidence_lavel = percentage of confidence (e.g. 90 for 90%)

        Looks up the table returned by
        ``self.standard_normal_table('cumulative_from_mean_0toZ.csv')``
        (not defined in this class; presumably inherited) and returns
        row label + column label, rounded to 2 decimals, of the largest
        entry in the last row that still has an entry strictly below
        ``confidence_lavel / 200``.

        Raises ValueError when no table entry is below that threshold.
        """
        file = 'cumulative_from_mean_0toZ.csv'
        standard_table = self.standard_normal_table(file)

        # Percentage -> fraction (/100) -> one-sided area from the mean (/2).
        below = standard_table[standard_table < confidence_lavel / 200]
        # Drop rows with no qualifying entry before idxmax: idxmax over
        # all-NaN rows is deprecated in recent pandas.
        below = below.dropna(how='all')
        if below.empty:
            raise ValueError(
                "no table entry below confidence level %r" % (confidence_lavel,)
            )

        max_columns = below.idxmax(axis=1)
        index = max_columns.index[-1]
        column = max_columns.iloc[-1]

        return round(index + float(column), 2)

    def confidence_interval_by_std_error(self, confidence, mean, std_error):
        """
        Interval mean -/+ confidence * std_error.

        confidence = number of standard errors on each side of the mean
            (confidence_interval passes the value returned by
            standard_error_from_mean here, not a percentage)
        mean = sample mean or population mean
        std_error = standard error of the estimate

        Returns a (lower, upper) tuple.
        """
        margin = confidence * std_error
        return (mean - margin, mean + margin)

    def confidence_interval(self, confidence_lavel, x_bar, sigma, n, N=0, infinite=True):
        """
        Confidence interval for the mean.

        confidence_lavel = percentage of confidence
        x_bar: sample mean
        sigma: standard deviation
        n: sample size
        N: population size (only used when ``infinite`` is falsy)
        infinite: if truthy use standard_error, otherwise
            standard_error_finite

        Returns a (lower, upper) tuple.
        """
        conf = self.standard_error_from_mean(confidence_lavel)

        if infinite:
            std_err = self.standard_error(sigma, n)
        else:
            std_err = self.standard_error_finite(sigma, N, n)

        return self.confidence_interval_by_std_error(conf, x_bar, std_err)

    # Original note: statisticians recommend that in estimation n be
    # large enough ("at least n/N > 0.5") to use the normal distribution
    # as a substitute for the binomial distribution.
    # NOTE(review): the usual textbook rule is that both n*p and n*q be
    # at least 5 -- confirm which was intended.

    #========================================================
    # INTERVAL ESTIMATES OF THE PROPORTION FROM LARGE SAMPLES
    #========================================================

    def estimate_mean_of_proportion(self, p_bar):
        """
        p_bar = sample proportion in favour

        Returns p_bar unchanged.
        """
        return p_bar

    def estimate_standard_error_of_proportion(self, p_bar, q_bar, n):
        """
        Estimated standard error of the proportion, sqrt(p_bar * q_bar / n).

        p_bar = sample proportion in favour
        q_bar = sample proportion not in favour
        n = sample size
        """
        return math.sqrt((p_bar * q_bar) / n)

    def confidence_interval_of_proportion(self, confidence, p_bar, q_bar, n):
        """
        Confidence interval for a proportion.

        confidence = percentage of confidence (passed to
            standard_error_from_mean)
        p_bar = sample proportion in favour
        q_bar = sample proportion not in favour
        n = sample size

        Returns a (lower, upper) tuple.
        """
        z_value = self.standard_error_from_mean(confidence)
        std_err = self.estimate_standard_error_of_proportion(p_bar, q_bar, n)
        return self.confidence_interval_by_std_error(z_value, p_bar, std_err)
    

In [109]:
x = np.array([3, 9, 15, 21, 42, 30, 6, 9, 6, 15, 21, 24, 32, 9, 12])

In [110]:
est = Estimation(x)

In [64]:
# Confidence levels 60, 70, 92 and 96 scaled by 0.01 (i.e. as fractions).
c = np.array([60,70,92,96])*0.01

# Collect est.confidence(level) for each level.
# NOTE(review): the Estimation class defined above has no `confidence`
# method; presumably it is inherited from stat1 or comes from an earlier
# version of the class still in the kernel -- confirm before re-running.
cf = []
for i in c:
    cf.append(est.confidence(i))
    
cf

[0.84, 1.03, 1.75, 2.05]

In [68]:
n = np.array([2,3 ,5,1])
#c = 95.5 * 0.01

# NOTE(review): this divides by n, not sqrt(n); if a standard error of the
# mean was intended, standard_error() above uses sigma/sqrt(n) -- confirm.
sigma = 5/n
x_bar = np.array([25,15,38,20])

#ci = []
# Build one (lower, upper) tuple per entry of n / x_bar.
# NOTE(review): the lower bound uses 1.96 and the upper bound 1.67, so the
# interval is not symmetric about x_bar -- looks like a typo, confirm.
ie = []
for i in range(len(n)):
    ie.append((x_bar[i] - (1.96 * sigma[i]), x_bar[i] + (1.67 * sigma[i])))
    
ie

[(20.1, 29.175),
 (11.733333333333334, 17.78333333333333),
 (36.04, 39.67),
 (10.2, 28.35)]

In [135]:
est.point_estimate_mean()

16.933333333333334

In [136]:
est.point_estimate_variance()

125.92380952380952

In [137]:
est.point_estimate_deviation()

11.221577853573423

In [97]:
# Worked example: confidence interval using the finite-population
# standard error (infinite=False).
sigma = 41000
#muh = 30.3
mu = 250000
N = 12368
n = 750
#cd = np.array([95,99])
cd = 90

# a: finite-population standard error; b: interval around mu at level cd.
a = est.standard_error_finite(sigma,N, n)
b = est.confidence_interval(cd, mu, sigma, n, N, infinite=False)
# NOTE(review): `confidence` is not defined on Estimation in this notebook;
# presumably inherited or left over from an earlier class version -- confirm.
c = est.confidence(cd)
# The string below is a disabled loop variant for an array of confidence
# levels; it is a bare string expression and is never executed.
"""
b = []
c = []
for i in cd:
    b.append(est.confidence_interval(i, mu, sigma, n, N, infinite=True))
    c.append(est.confidence(i))
"""
    
a,b,c

(1451.0645447337886, (247620.25414663658, 252379.74585336342), 1.64)

In [112]:
# Worked example: confidence interval for a proportion at level cd (percent).
n = 150
p = 0.42
q = 1-p
cd = 99

# a: estimated standard error of the proportion, sqrt(p*q/n)
# b: (lower, upper) interval around p
# c: number of standard errors looked up for cd
a = est.estimate_standard_error_of_proportion(p, q, n)
b = est.confidence_interval_of_proportion(cd, p,q,n)
c = est.standard_error_from_mean(cd)
a,b,c

(0.040298883359219766, (0.3164318697668052, 0.5235681302331948), 2.57)

In [126]:
np.array((892.5345509276654, 895.4654490723346))*0.12

array([107.10414611, 107.45585389])

In [139]:
est.probability_of_normal_distribution(-0.45, 0.45)

a7
z string: 0.45
first, last: 0.4 5
z string: 0.45
first, last: 0.4 5
probabilities 1 & 2 0.17364000000000002 0.17364000000000002


0.34728000000000003

In [10]:
# Finite-population standard error for several sample sizes.
mu = 8.2
sigma = 2.1
N = 80
n = np.array([16,25,49])

# NOTE(review): `sd` is not defined in any cell shown in this notebook;
# presumably an instance created in a deleted or later-run cell -- confirm,
# otherwise this fails on Restart & Run All.
res = []
for i in n:
    res.append(sd.standard_error_finite(sigma, N, i))
    
res

[0.4725369183889301, 0.350442757925283, 0.1879267295948145]

In [17]:
a1 = np.arange(0.00, 4.10, 0.01)
a2 = np.arange(0.00, 4.10, 0.01)
# a1<a2

In [6]:
# Load the z table and index it by its 'z' column, so rows are labelled by
# the 'z' values and the remaining columns hold the table entries.
file = 'cumulative_from_mean_0toZ.csv'
#file = 'complementary_cumulative.csv'
df_file = pd.read_csv(file)
df_file.index = df_file['z']
# In-place drop: re-running this cell alone would fail because 'z' is gone.
df_file.drop('z', axis=1, inplace=True)

In [7]:
df_file.head()

Unnamed: 0_level_0,0,0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09
z,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,0.0,0.00399,0.00798,0.01197,0.01595,0.01994,0.02392,0.0279,0.03188,0.03586
0.1,0.03983,0.0438,0.04776,0.05172,0.05567,0.05962,0.06356,0.06749,0.07142,0.07535
0.2,0.07926,0.08317,0.08706,0.09095,0.09483,0.09871,0.10257,0.10642,0.11026,0.11409
0.3,0.11791,0.12172,0.12552,0.1293,0.13307,0.13683,0.14058,0.14431,0.14803,0.15173
0.4,0.15542,0.1591,0.16276,0.1664,0.17003,0.17364,0.17724,0.18082,0.18439,0.18793


In [12]:
df1 = df_file[df_file < 0.90/2]

In [13]:
df1

Unnamed: 0_level_0,0,0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09
z,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,0.0,0.00399,0.00798,0.01197,0.01595,0.01994,0.02392,0.0279,0.03188,0.03586
0.1,0.03983,0.0438,0.04776,0.05172,0.05567,0.05962,0.06356,0.06749,0.07142,0.07535
0.2,0.07926,0.08317,0.08706,0.09095,0.09483,0.09871,0.10257,0.10642,0.11026,0.11409
0.3,0.11791,0.12172,0.12552,0.1293,0.13307,0.13683,0.14058,0.14431,0.14803,0.15173
0.4,0.15542,0.1591,0.16276,0.1664,0.17003,0.17364,0.17724,0.18082,0.18439,0.18793
0.5,0.19146,0.19497,0.19847,0.20194,0.2054,0.20884,0.21226,0.21566,0.21904,0.2224
0.6,0.22575,0.22907,0.23237,0.23565,0.23891,0.24215,0.24537,0.24857,0.25175,0.2549
0.7,0.25804,0.26115,0.26424,0.2673,0.27035,0.27337,0.27637,0.27935,0.2823,0.28524
0.8,0.28814,0.29103,0.29389,0.29673,0.29955,0.30234,0.30511,0.30785,0.31057,0.31327
0.9,0.31594,0.31859,0.32121,0.32381,0.32639,0.32894,0.33147,0.33398,0.33646,0.33891


In [35]:
def z_number(percentage, file='cumulative_from_mean_0toZ.csv'):
    """
    Look up a z value in a z table stored as CSV.

    percentage: threshold compared against the table entries
    file: path of the CSV table; it must have a 'z' column holding the
        row labels, and the remaining column names must parse as floats

    Returns row label + column label (rounded to 2 decimals) of the
    largest entry in the last row that still has an entry strictly
    below ``percentage``.

    Raises ValueError when no table entry is below ``percentage``.
    """
    table = pd.read_csv(file, index_col='z')

    # Entries not strictly below the threshold become NaN; rows with no
    # qualifying entry are dropped so idxmax never sees an all-NaN row
    # (deprecated in recent pandas).
    below = table[table < percentage].dropna(how='all')
    if below.empty:
        raise ValueError("no table entry below %r" % (percentage,))

    best_columns = below.idxmax(axis=1)
    row_label = best_columns.index[-1]
    column_label = best_columns.iloc[-1]

    return round(row_label + float(column_label), 2)

In [39]:
z_number(0.5)

4.06

In [14]:
df1.columns

Index(['0', '0.01', '0.02', '0.03', '0.04', '0.05', '0.06', '0.07', '0.08',
       '0.09'],
      dtype='object')

In [16]:
df1['0.04'].max()

0.4495

In [21]:
df2 = df1.idxmax(axis=1)

In [25]:
df2.last_valid_index()

1.6

In [27]:
df2.index

Float64Index([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2,
              1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5,
              2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8,
              3.9, 4.0],
             dtype='float64', name='z')

In [32]:
df2.loc[1.6]

'0.04'

In [53]:
def standard_deviation_from_mean(probability, file='complementary_cumulative.csv'):
    """
    Find every z value whose table entry rounds to ``probability``.

    probability: value to look for; the table is rounded to as many
        decimals as ``probability`` has characters after its first two
        (e.g. 0.05 -> 2 decimals)
    file: path of the CSV table; it must have a 'z' column holding the
        row labels, and the remaining column names must parse as floats

    Returns a sorted numpy array of z values (row label + column label,
    rounded to 2 decimals); empty when nothing matches.
    """
    decimals = len(str(probability)) - 2

    table = pd.read_csv(file, index_col='z').round(decimals)

    # Each (row, column) cell maps to one z value, so no de-duplication is
    # needed; the original check compared a column name against the floats
    # already collected and therefore never filtered anything.
    z_values = [
        round(row_label + float(column), 2)
        for column in table.columns
        for row_label in table.index[table[column] == probability]
    ]

    return np.array(sorted(z_values))

In [54]:
standard_deviation_from_mean(0.05)

array([1.6 , 1.61, 1.62, 1.63, 1.64, 1.65, 1.66, 1.67, 1.68, 1.69])

In [62]:
#dfr[dfr == 0.02]
sd.standard_deviation_from_mean(0.082, 'complementary_cumulative.csv')

array([1.39])

In [11]:
g = np.arange(0,20,3)
g

array([ 0,  3,  6,  9, 12, 15, 18])

In [9]:
g-g.sum()

array([-63, -60, -57, -54, -51, -48, -45])

In [13]:
((g-g.sum())**2).sum()

20664

In [67]:
3.07102E-1358

0.0