In [1]:
import numpy as np
import scipy.stats
import collections

In [2]:
# Read the whole transactions file and keep one string per line in `file`.
# The context manager closes the handle as soon as the read finishes (or
# fails); a bare open() would keep it open for the lifetime of the kernel.
with open('/usr/local/data/transactions.txt', 'r') as f:
    file = f.read().strip().split('\n')

In [4]:
# Split every raw line into its comma-separated fields.
# Rows that are already lists are passed through unchanged, so re-running
# this cell is harmless instead of failing with
# "'list' object has no attribute 'split'".
file = [row.split(',') if isinstance(row, str) else row for row in file]

In [5]:
class Transaction(object):
    """Plain value holder for one transaction record.

    The constructor stores its four arguments on the instance exactly as
    they were passed; it performs no conversion or validation. Every
    argument defaults to ``None``.
    """

    def __init__(self, num=None, ID=None, volume=None, segment=None):
        # Copy each constructor argument onto the attribute of the same name.
        self.num, self.ID = num, ID
        self.volume, self.segment = volume, segment

In [6]:
def _struct_list_(file):
    """Turn pre-split rows into a list of ``Transaction`` objects.

    Each row is read positionally: field 0 is converted with ``int`` and
    becomes ``num``, field 1 is stored unchanged as ``ID``, field 2 is
    converted with ``float`` and becomes ``volume``, and field 3 is stored
    unchanged as ``segment``.

    Returns a new list with one ``Transaction`` per row, in row order.
    """
    return [
        Transaction(num=int(row[0]), ID=row[1], volume=float(row[2]), segment=row[3])
        for row in file
    ]

In [22]:
class StructList(object):
    """Per-segment statistics over a list of transaction records.

    Each record must expose ``ID``, ``volume`` and ``segment`` attributes.
    Only the first ``length`` records of ``struct_list`` are ever inspected.

    The confidence interval and the hypothesis test use normal quantiles:
    the average volume is assumed to be normally distributed, and the
    Student distribution is assumed to be well approximated by the normal
    one on a large sample.
    """

    def __init__(self, struct_list=None, length=None):
        """Store the records and how many of them to inspect.

        struct_list -- sequence of records; ``None`` is treated as empty.
        length      -- number of leading records to use; ``None`` means
                       "all of them" (``len(struct_list)``).
        """
        # Resolve the defaults here so a default-constructed object is
        # usable instead of failing later inside ``range(None)``.
        self.struct_list = [] if struct_list is None else struct_list
        self.length = len(self.struct_list) if length is None else length

    def _segment_records_(self, segment):
        """Return the inspected records whose ``segment`` equals ``segment``."""
        records = self.struct_list
        return [records[i] for i in range(self.length) if records[i].segment == segment]

    def _clients_number_(self, segment):
        """Return the number of distinct client IDs with a transaction in ``segment``."""
        return len({record.ID for record in self._segment_records_(segment)})

    def _transactions_number_(self, segment):
        """Return how many inspected transactions belong to ``segment``."""
        return len(self._segment_records_(segment))

    def _avg_volume_(self, segment):
        """Return the mean transaction volume in ``segment``.

        Raises ZeroDivisionError when the segment has no transactions.
        """
        volumes = [record.volume for record in self._segment_records_(segment)]
        if not volumes:
            # Same exception type as the bare division would raise, but with
            # a message that names the offending segment.
            raise ZeroDivisionError(
                "cannot average volumes: no transactions in segment {!r}".format(segment)
            )
        return sum(volumes) / len(volumes)

    def _std_volume_(self, segment):
        """Return the sample standard deviation (n - 1 denominator) of volumes in ``segment``.

        Raises ZeroDivisionError when the segment has fewer than two
        transactions, because the n - 1 denominator would be zero.
        """
        avg = self._avg_volume_(segment)  # also rejects an empty segment
        volumes = [record.volume for record in self._segment_records_(segment)]
        number = len(volumes)
        if number < 2:
            raise ZeroDivisionError(
                "sample standard deviation needs at least two transactions "
                "in segment {!r}, got {}".format(segment, number)
            )
        squared_deviations = sum((volume - avg) ** 2 for volume in volumes)
        return (squared_deviations / (number - 1)) ** 0.5

    def _avg_interval_(self, segment, rate):
        """Return a two-sided confidence interval for the mean volume in ``segment``.

        rate -- confidence level as a fraction (e.g. 0.9).

        Returns a NumPy array ``(lower, upper)`` built from the normal
        approximation ``avg -/+ z * std / sqrt(n)``.
        """
        avg = self._avg_volume_(segment)
        std = self._std_volume_(segment)
        number = self._transactions_number_(segment)

        # Two-sided interval: the remaining (1 - rate) mass is split evenly
        # between both tails, so the upper quantile sits at rate + (1 - rate)/2.
        half_width = std / (number ** 0.5) * scipy.stats.norm.ppf(rate + 0.5 * (1 - rate))

        return np.array((avg - half_width, avg + half_width))

    def _avg_equity_hypothesis_(self, level, segment_a='R', segment_b='AF'):
        """Test the hypothesis that two segments have equal mean volume.

        level     -- significance level as a fraction (e.g. 0.1).
        segment_a -- first segment label (defaults to 'R').
        segment_b -- second segment label (defaults to 'AF').

        Returns the string 'not rejected' when the absolute difference of
        the two means is below the two-sided normal critical value, and
        'rejected' otherwise.
        """
        std_a = self._std_volume_(segment=segment_a)
        std_b = self._std_volume_(segment=segment_b)

        num_a = self._transactions_number_(segment=segment_a)
        num_b = self._transactions_number_(segment=segment_b)

        # Standard error of the difference between the two sample means.
        std_error = (std_a ** 2 / num_a + std_b ** 2 / num_b) ** 0.5

        avg_a = self._avg_volume_(segment=segment_a)
        avg_b = self._avg_volume_(segment=segment_b)

        observed = abs(avg_a - avg_b)
        threshold = std_error * scipy.stats.norm.ppf(1 - level / 2.0)

        return 'not rejected' if observed < threshold else 'rejected'

In [23]:
# Build one Transaction object per parsed row held in `file`.
struct_list = _struct_list_(file)

In [24]:
# Wrap the transactions for per-segment statistics; the explicit length
# makes the statistics cover the whole list.
sl = StructList(struct_list, len(struct_list))

In [25]:
# Number of distinct client IDs with at least one transaction in each segment.
r_number = sl._clients_number_(segment='R')
af_number = sl._clients_number_(segment='AF')

In [26]:
# Mean volume of a single transaction, computed separately per segment.
r_avg = sl._avg_volume_(segment='R')
af_avg = sl._avg_volume_(segment='AF')

In [27]:
# Report the distinct-client count next to its segment label.
for segm, num in (('R', r_number), ('AF', af_number)):
    print("{} clients made transactions in segment '{}'".format(num, segm))

2 clients made transactions in segment 'R'
2 clients made transactions in segment 'AF'


In [28]:
# Report the mean transaction volume per segment, rounded to two decimals.
for segm, avg in (('R', r_avg), ('AF', af_avg)):
    print("Average volume of a single transaction in segment '{}' is {:.2f}".format(segm, avg))

Average volume of a single transaction in segment 'R' is 2486.67
Average volume of a single transaction in segment 'AF' is 35299.97


In [29]:
# Confidence level (as a fraction) passed to _avg_interval_ below.
rate = 0.9
# Significance level (as a fraction) passed to _avg_equity_hypothesis_ below.
level = 0.1

In [30]:
# Confidence intervals for the mean transaction volume per segment;
# each result is a two-element array (lower bound, upper bound).
r_interval = sl._avg_interval_(segment='R', rate=rate)
af_interval = sl._avg_interval_(segment='AF', rate=rate)

In [31]:
# Report each segment's interval; the two array elements fill the last two
# placeholders (lower bound first, then upper bound).
for segm, interval in (('R', r_interval), ('AF', af_interval)):
    print("{}% confidence interval for average volume of a single transaction in segment {} is [{:.2f}, {:.2f}]".format(rate*100, segm, *interval))

90.0% confidence interval for average volume of a single transaction in segment R is [2485.17, 2488.17]
90.0% confidence interval for average volume of a single transaction in segment AF is [35247.92, 35352.02]


In [32]:
# Test whether segments 'R' and 'AF' have equal mean transaction volume;
# the result is the string 'rejected' or 'not rejected'.
answer = sl._avg_equity_hypothesis_(level=level)

In [33]:
# Report the test outcome; the significance level is shown as a percentage.
print("On a significance level of {}% null hypothesis for equality of average volumes of single transactions in segments 'R' and 'AF' is {}".format(level*100, answer))

On a significance level of 10.0% null hypothesis for equality of average volumes of single transactions in segments 'R' and 'AF' is rejected
