# Mann Whitney U-Test

In [1]:
import math
class MannWhitney:
    """Mann-Whitney U test for two samples stored as columns of a CSV file.

    The file must start with a header row.  The first column is sample A
    (group index 0) and the second column is sample B (group index 1).
    Empty cells are ignored, so the two samples may differ in size.

    Attributes:
        csv: path of the CSV file to read.
        grouped_data: one list per CSV column of ``[cell, group index]``
            pairs (empty cells included).
        merged_data: the non-empty records of both samples ordered by
            numeric value; after ``U_Test`` each record is
            ``[cell, group index, rank, points]``.
        value_rank: maps every cell string to the ordinal ranks it occupied
            before tied ranks were averaged.
        U_0: points of each sample A record.
        U_1: points of each sample B record.
        U: the smaller of ``sum(U_0)`` and ``sum(U_1)``.
    """

    def __init__(self, csv):
        self.csv = csv
        self.grouped_data = []
        self.merged_data = []
        self.value_rank = {}
        self.U_0 = []
        self.U_1 = []
        self.U = 0

    def U_Test(self):
        """Run the test and return the Z-value as ``"Z-value: <float>"``.

        Z uses the normal approximation without a tie correction of the
        variance: ``(U - n_A*n_B/2) / sqrt(n_A*n_B*(n_A+n_B+1)/12)``.

        Raises:
            OSError: the file cannot be opened.
            IndexError: the file is empty (no header row).
            ValueError: a non-empty cell is not a number.
            ZeroDivisionError: one of the two samples has no values.
        """
        # Start from a clean slate so that calling U_Test() again does not
        # append to the results of the previous run.
        self.grouped_data = []
        self.merged_data = []
        self.value_rank = {}
        self.U_0 = []
        self.U_1 = []
        self.U = 0

        # Process CSV (remove \n, split by commas, trim each cell)
        with open(self.csv, 'r') as f:
            fr = [[cell.strip() for cell in row.rstrip().split(',')] for row in f]

        # Group data and identify by their sample sets (row 0 is the header)
        self.grouped_data = [[] for _ in fr[0]]
        for row in fr[1:]:
            for n, group in enumerate(self.grouped_data):
                # A short row (e.g. a trailing blank line) is a missing cell.
                cell = row[n] if n < len(row) else ''
                group.append([cell, n])

        # Combine the two sample sets.  Only the first two columns take part:
        # the test compares exactly two samples, and pooling further columns
        # would distort both the ranks and the points.
        self.merged_data = [
            record
            for group in self.grouped_data[:2]
            for record in group
            if record[0] != ''
        ]
        # Order numerically; the cells are strings and a plain sort would put
        # '100' before '9'.  The group index only breaks ties.
        self.merged_data.sort(key=lambda record: (float(record[0]), record[1]))

        # Rank samples and store the ranks of each value (self.value_rank)
        ranks_by_number = {}
        for rank, record in enumerate(self.merged_data, start=1):
            record.append(rank)
            self.value_rank.setdefault(record[0], []).append(rank)
            # Keyed by number so that '5' and '5.0' count as the same value.
            ranks_by_number.setdefault(float(record[0]), []).append(rank)

        # For values that occur > 1, replace the rank by the average rank
        for record in self.merged_data:
            tied_ranks = ranks_by_number[float(record[0])]
            if len(tied_ranks) > 1:
                record[2] = sum(tied_ranks) / len(tied_ranks)

        # Assign points to each value: 1 for every record of the other sample
        # that ranks higher, 0.5 for every record of the other sample that
        # ties, so that sum(U_0) + sum(U_1) == n_A * n_B.
        for record in self.merged_data:
            counter = 0
            for other in self.merged_data:
                if record[1] == other[1]:
                    continue
                if record[2] < other[2]:
                    counter = counter + 1
                elif record[2] == other[2]:
                    counter = counter + 0.5
            record.append(counter)

        # Tabulate scores for each set.  Pick smaller U.
        self.U_0 = [record[3] for record in self.merged_data if record[1] == 0]
        self.U_1 = [record[3] for record in self.merged_data if record[1] == 1]
        self.U = min(sum(self.U_0), sum(self.U_1))

        # Calculate and return Z-value
        n_A = len(self.U_0)
        n_B = len(self.U_1)
        # The divisor of the variance is the constant 12, not n_A + n_B.
        Z = (self.U - (n_A * n_B / 2)) / math.sqrt(n_A * n_B * (n_A + n_B + 1) / 12)
        return f"Z-value: {Z}"

In [2]:
# Run the test on the sample file; U_Test() returns the Z-value as a formatted string.
csv1 = MannWhitney("mann_whitney.csv")
csv1.U_Test()

'Z-value: -2.8823067684915684'

In [3]:
# Each record is [value, group index, rank, points]; tied values share an averaged rank.
csv1.merged_data

[['12', 1, 1, 6],
 ['14', 1, 2, 6],
 ['19', 1, 4.0, 6],
 ['19', 1, 4.0, 6],
 ['19', 1, 4.0, 6],
 ['20', 1, 6, 6],
 ['28', 0, 7, 0],
 ['31', 0, 8, 0],
 ['32', 0, 9, 0],
 ['35', 0, 10.5, 0],
 ['35', 0, 10.5, 0],
 ['36', 0, 12, 0]]

## Step by Step Outputs

### Process CSV (remove \n, split by commas)

In [4]:
# Read every line, drop trailing whitespace (including the newline) and
# split the line into its comma-separated cells.
with open("mann_whitney.csv", 'r') as f:
    fr = [line.rstrip().split(',') for line in f.readlines()]

In [5]:
# Inspect the parsed rows: one list of cell strings per CSV line.
fr

[['A', 'B'],
 ['28', '12'],
 ['31', '19'],
 ['36', '19'],
 ['35', '14'],
 ['32', '20'],
 ['35', '19']]

### Group data and identify by their sample sets

In [6]:
# One bucket per CSV column; each bucket collects [cell, column index] pairs.
grouped_data = [[] for _ in fr[0]]
# Row 0 holds the column names, so the data starts at row 1.
for row in fr[1:]:
    for n, bucket in enumerate(grouped_data):
        bucket.append([row[n], n])
grouped_data

[[['28', 0], ['31', 0], ['36', 0], ['35', 0], ['32', 0], ['35', 0]],
 [['12', 1], ['19', 1], ['19', 1], ['14', 1], ['20', 1], ['19', 1]]]

### Combine Sample Sets and Rank Samples

In [7]:
# Pool the records of all sample sets, leaving out empty cells.
merged_data = [record for group in grouped_data for record in group if record[0] != '']
# Order the pool numerically: the cells are strings, and a plain sort would
# compare them character by character and put '100' before '9'.  The group
# index only breaks ties between equal values.
merged_data.sort(key=lambda record: (float(record[0]), record[1]))

In [8]:
# Inspect the pooled records, each a [value, group index] pair, in sorted order.
merged_data

[['12', 1],
 ['14', 1],
 ['19', 1],
 ['19', 1],
 ['19', 1],
 ['20', 1],
 ['28', 0],
 ['31', 0],
 ['32', 0],
 ['35', 0],
 ['35', 0],
 ['36', 0]]

In [9]:
# Append the 1-based position in the sorted pool to every record as its rank.
for rank, record in enumerate(merged_data, start=1):
    record.append(rank)
merged_data

[['12', 1, 1],
 ['14', 1, 2],
 ['19', 1, 3],
 ['19', 1, 4],
 ['19', 1, 5],
 ['20', 1, 6],
 ['28', 0, 7],
 ['31', 0, 8],
 ['32', 0, 9],
 ['35', 0, 10],
 ['35', 0, 11],
 ['36', 0, 12]]

### Store ranks of occurrences of each value in a dictionary

In [10]:
# Map every value to the list of ranks it occupies in the sorted pool.
value_rank = {}
for record in merged_data:
    value_rank.setdefault(record[0], []).append(record[2])
value_rank

{'12': [1],
 '14': [2],
 '19': [3, 4, 5],
 '20': [6],
 '28': [7],
 '31': [8],
 '32': [9],
 '35': [10, 11],
 '36': [12]}

### Replace the ranks of values that occur more than once with the average of their ranks (using the dictionary above)

In [11]:
# Ranks collected for the value of the record at index 3.
value_rank[merged_data[3][0]]

[3, 4, 5]

In [12]:
# A value that occupies several ranks gets the mean of those ranks;
# values that occur once keep their integer rank.
for record in merged_data:
    tied_ranks = value_rank[record[0]]
    if len(tied_ranks) > 1:
        record[2] = sum(tied_ranks) / len(tied_ranks)
merged_data

[['12', 1, 1],
 ['14', 1, 2],
 ['19', 1, 4.0],
 ['19', 1, 4.0],
 ['19', 1, 4.0],
 ['20', 1, 6],
 ['28', 0, 7],
 ['31', 0, 8],
 ['32', 0, 9],
 ['35', 0, 10.5],
 ['35', 0, 10.5],
 ['36', 0, 12]]

### Assign points to each value.  Tabulate scores for each set.  Pick smaller U.

In [13]:
# Score every record against the records of the other sample: 1 point for each
# one that ranks higher and half a point for each one that ties (same averaged
# rank).  Counting ties as 0.5 keeps the two totals summing to n_A * n_B.
for record in merged_data:
    counter = 0
    for other in merged_data:
        if record[1] == other[1]:
            continue
        if record[2] < other[2]:
            counter = counter + 1
        elif record[2] == other[2]:
            counter = counter + 0.5
    record.append(counter)
merged_data

[['12', 1, 1, 6],
 ['14', 1, 2, 6],
 ['19', 1, 4.0, 6],
 ['19', 1, 4.0, 6],
 ['19', 1, 4.0, 6],
 ['20', 1, 6, 6],
 ['28', 0, 7, 0],
 ['31', 0, 8, 0],
 ['32', 0, 9, 0],
 ['35', 0, 10.5, 0],
 ['35', 0, 10.5, 0],
 ['36', 0, 12, 0]]

In [14]:
# Collect the points of each sample (group 0 and group 1) ...
U_0 = [record[3] for record in merged_data if record[1] == 0]
U_1 = [record[3] for record in merged_data if record[1] == 1]
# ... and keep the smaller of the two totals as U.
total_0 = sum(U_0)
total_1 = sum(U_1)
U = 0 + (total_0 if total_0 < total_1 else total_1)
U

0

### Calculate Z-value

In [15]:
import math

# Sample sizes: records tagged 0 belong to sample A, records tagged 1 to sample B.
n_A = sum(1 for record in merged_data if record[1] == 0)
n_B = sum(1 for record in merged_data if record[1] == 1)
# Normal approximation of U: mean n_A*n_B/2 and standard deviation
# sqrt(n_A*n_B*(n_A+n_B+1)/12).  The divisor is the constant 12; dividing by
# n_A + n_B only gives the same number when the two samples total 12 values,
# as they do in this example.
z = (U - (n_A * n_B / 2)) / math.sqrt(n_A * n_B * (n_A + n_B + 1) / 12)
z

-2.8823067684915684