# EECS 498 - Assignment 1 - Collocations.py

### <span style="color:red">Major Task:</span> write a Python program Collocations.py that identifies collocations in text.

### <span style="color:red">Result:</span> built a program that implements the chi-square and the pointwise mutual information (PMI) measures of association for the identification of bigram collocations.

### 1) Read in and process data

In [39]:
import string
import math
import numpy as np
import sys
import operator
from __future__ import division

# create lists for storage
unigram_dict = {}
bigram_dict = {}
prev_word = "."

# save input file & measure type
input_file = 'Collocations'
#input_file = sys.argv[1]
#measure_type = sys.argv[2]

# read in file line-by-line, word-by-word
with open(input_file) as f:
    for line in f:
        splitLine = line.split()
        for word in splitLine:
            
            # store unigram (no tokens of only punctuation)
            if word not in string.punctuation:
                if word in unigram_dict:
                    unigram_dict[word] += 1
                else:
                    unigram_dict[word] = 1
            
                # store bigram (no tokens of only punctuation)
                if prev_word not in string.punctuation:
                    bigram = prev_word + ' ' + word
                    if bigram in bigram_dict:
                        bigram_dict[bigram] += 1
                    else:
                        bigram_dict[bigram] = 1
            
            # set new prev_word
            prev_word = word
            
# discard bigrams that occur less than 5 times
bigram_dict = { bigram:count for bigram, count in bigram_dict.items() if count >=5 }

In [40]:
print "unicount =", len(unigram_dict), "bicount:", len(bigram_dict)

unicount = 30595 bicount: 10542


### 2) Create bigram confusion matrices

#### Step 1: Loop through bigrams, save 1st/2nd word counts in dictionaries

In [41]:
# create dictionaries with "key-value" pairs of "word-count"
first_word_dict = {}
second_word_dict = {}

# loop through every bigram in the dictionary
for bigram,value in bigram_dict.items():
    
    # extract each bigram, word1, & word2
    word1 = bigram.split()[0]
    word2 = bigram.split()[1]
    
    # store first words
    if word1 in first_word_dict:
        first_word_dict[word1] += 1
    else:
        first_word_dict[word1] = 1
    
    # store second words
    if word2 in second_word_dict:
        second_word_dict[word2] += 1
    else:
        second_word_dict[word2] = 1

#### Step 2: Retrieve and store confusion matrix counts for each bigram

In [42]:
# create dictionary with "key-value" pairs of "bigram-matrix(list)"
confusion_matrices = {}

for bigram,value in bigram_dict.items():
    
    # extract each word1 & word2
    word1 = bigram.split()[0]
    word2 = bigram.split()[1]
    
    # calculate values for confusion matrix
    bigramCount = bigram_dict[bigram] # 1st value: bigram occurences
    secondWordCount = second_word_dict[word2] - bigram_dict[bigram] # 2nd value: word2 occurences (non-bigram)
    firstWordCount = first_word_dict[word1]-bigram_dict[bigram] # 3rd value: word1 occurences (non-bigram)
    neitherCount = len(bigram_dict) - bigramCount - secondWordCount - firstWordCount # 4th value: non word1 or word2
    
    # store values as confusion matrix
    matrix = [bigramCount, secondWordCount, firstWordCount, neitherCount]
    confusion_matrices[bigram] = matrix

### 3) Calculate Chi-Square stat for each bigram

In [43]:
# create dictionary with "key-value" pairs of "bigram-chiSquare"
chi_square_stats = {}

for bigram,matrix in confusion_matrices.items():
    
    # save observed values
    ob11 = matrix[0]
    ob12 = matrix[1]
    ob21 = matrix[2]
    ob22 = matrix[3]
    
    # save col/row total values
    totC1 = ob11 + ob21
    totC2 = ob12 + ob22
    totR1 = ob11 + ob12
    totR2 = ob21 + ob22
    total = ob11 + ob12 + ob21 + ob22
    
    # calculate expected values
    ex11 = (totR1*totC1)/total
    ex12 = (totR1*totC2)/total
    ex21 = (totR2*totC1)/total
    ex22 = (totR2*totC2)/total
    
    # calculate chi_squared statistic
    chi11 = pow((ob11-ex11),2)/ex11
    chi12 = pow((ob12-ex12),2)/ex12
    chi21 = pow((ob21-ex21),2)/ex21
    chi22 = pow((ob22-ex22),2)/ex22
    chi_square = chi11 + chi12 + chi21 + chi22
    
    # store statistic in dictionary with bigram as key
    chi_square_stats[bigram] = chi_square

### 4) Calculate PMI stat for each bigram

In [44]:
# create dictionary with "key-value" pairs of "bigram-PMI"
PMI_stats = {}

for bigram,matrix in confusion_matrices.items():
    
    # split bigram into words
    word1 = bigram.split()[0]
    word2 = bigram.split()[1]
    
    # calculate probability values
    prob_bigram = bigram_dict[bigram]/len(bigram_dict)
    prob_word1 = first_word_dict[word1]/len(first_word_dict)
    prob_word2 = second_word_dict[word2]/len(second_word_dict)
    
    # calculate PMI statistic
    PMI = math.log(prob_bigram/(prob_word1*prob_word2))
    
    # store statistic in dictionary with bigram as key
    PMI_stats[bigram] = PMI

In [45]:
sorted_bigrams_PMI = sorted(PMI_stats.items(), key=operator.itemgetter(1), reverse = True)
sorted_bigrams_chi_square = sorted(chi_square_stats.items(), key=operator.itemgetter(1), reverse = True)

sorted_bigrams_PMI[:25]
sorted_bigrams_chi_square[:25]

[('New York', 280287895.18300825),
 ('Wall Street', 192164135.13591725),
 ('Dow Jones', 97519894.06478672),
 ('Hong Kong', 59309851.57257563),
 ('real estate', 53152163.40788018),
 ('Merrill Lynch', 31895490.84017424),
 ('Stock Exchange', 26961727.12984751),
 ('Los Angeles', 19618851.70110675),
 ('Navigation Mixte', 18599532.486211885),
 ('Big Board', 16539044.039581196),
 ('S&P 500', 14434662.375687597),
 ('Lehman Hutton', 13664952.355290411),
 ('vice president', 13220302.349074317),
 ('San Francisco', 13080434.817291934),
 ("wo n't", 9822289.484212104),
 ('White House', 9469862.965518929),
 ('Sea Containers', 8867446.228448382),
 ('West Germany', 8435920.50873515),
 ('chief executive', 7704807.067436108),
 ('Shearson Lehman', 7613414.45807035),
 ('Control Data', 7127692.182625869),
 ("did n't", 7002874.514172116),
 ('Industrial Average', 6833088.410062129),
 ("does n't", 6584675.441917927),
 ('Du Pont', 6073296.1549236495)]