In [1]:
def import_data(filename):
    import csv
    X = []
    y = []
    with open(filename) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            X.append([])
            for i in range(len(row)):
                val = row[i]
                X[line_count].append(val)
            line_count += 1

    return X

In [2]:
X = import_data("Foia.csv")
Y = []
for i in range(len(X)):
    Y.append(X[i][-1])
Y[:3]

['All records regarding the service of the 208th Engineer Combat Battalion anytime between December 7, 1941 and January 1, 1947.',
 'Information relating to an operation at the end of the Gulf War in April of 1991\ndubbed "Operation Manly Rip".',
 'A clearly releasable copy of Sections A through J of the awarded contract, including the statement of work, for the contract awarded from solicitation number HROO11O6ROO2.']

In [38]:
import gensim

def read_corpus(Y):
    out = []
    for i in range(len(Y)):
        tokens = gensim.utils.simple_preprocess(Y[i])
        # For training data, add tags
        out.append(gensim.models.doc2vec.TaggedDocument(tokens, [i]))
    return out

corpus = read_corpus(Y)
print(corpus)





In [43]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=20, min_count=2, epochs=100)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [44]:
vector = model.infer_vector(corpus[0][0])
print(vector)

[-0.5396813   0.76145005  0.08799104 -0.8552484  -0.07422578 -0.27497628
  0.14400956 -0.06256936  1.2230239   0.6693135   0.45133227  0.6130142
 -2.3242223  -1.2864139   0.01614537  0.1510122  -0.21613057  0.17834423
  0.2798907   0.94683796]


In [45]:
ranks = []
second_ranks = []
for doc_id in range(len(corpus)):
    if doc_id % 100 == 0:
        print(doc_id)
    inferred_vector = model.infer_vector(corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500


In [46]:
import collections

counter = collections.Counter(ranks)
print(counter.most_common())

[(0, 2452), (1, 54), (2, 14), (3, 7), (6, 3), (4, 3), (5, 2), (14, 1), (9, 1), (1358, 1), (21, 1), (28, 1), (2489, 1), (2406, 1), (2328, 1), (2291, 1)]


In [53]:
from __future__ import absolute_import

import random

import matplotlib.pyplot as plt
import numpy as np
import scipy.io
import scipy.misc
from numpy import dot
from numpy.linalg import norm

def cossim(a, b):
    return dot(a, b)/(norm(a)*norm(b))


In [64]:
def choose_random_centroids(K):
    """
    Randomly choose K centroids from samples.
    :param samples: samples.
    :param K: K as in K-means. Number of clusters.
    :return: an array of centroids.
    """
    samples_copy = np.array(range(len(model.docvecs)), copy=True)
    np.random.shuffle(samples_copy)
    samples = []
    for i in range(K):
        samples.append(model.infer_vector(corpus[i].words))
    return samples

cent = choose_random_centroids(5)
print(cossim(cent[0], cent[1]))

0.4620855


In [66]:
model.infer_vector(corpus[i].words).shape[0]

20

In [74]:
def find_closest_centroids(centroids):
    """
    Find the closest centroid for all samples.

    :param samples: samples.
    :param centroids: an array of centroids.
    :return: a list of cluster_id assignment.
    """

    closest = []
    for i in range(len(model.docvecs)):
        min = cossim(model.infer_vector(corpus[i].words), centroids[0])
        min_index = 0
        for j in range(1, len(centroids)):
            temp = cossim(model.infer_vector(corpus[i].words), centroids[j])
            if temp < min:
                min = temp
                min_index = j
        closest.append(min_index)
    return np.array(closest)

def get_centroids(clusters):
    """
    Find the centroid given the samples and their cluster.

    :param samples: samples.
    :param clusters: list of clusters corresponding to each sample.
    :return: an array of centroids.
    """

    num_clusters = 0
    for i in range(len(clusters)):
        if clusters[i] > num_clusters:
            num_clusters = clusters[i]
    num_clusters += 1

    count = np.zeros(int(num_clusters))
    #print(samples[0].shape[0])
    #print()
    means = np.zeros((int(num_clusters), model.infer_vector(corpus[i].words).shape[0]))
    for i in range(len(clusters)):
        count[int(clusters[i])] += 1.0
        means[int(clusters[i])] += model.infer_vector(corpus[i].words)

    for i in range(int(num_clusters)):
        means[i] = means[i] / count[i]
    #print(means)
    return means

def run_k_means(K, n_iter):
    """
    Run K-means algorithm. The number of clusters 'K' is defined by the size of initial_centroids
    :param samples: samples.
    :param K: K as in K-means. Number of clusters.
    :param n_iter: number of iterations.
    :return: a pair of cluster assignment and history of centroids.
    """

    centroid_history = []
    current_centroids = choose_random_centroids(K)
    clusters = []
    for iteration in range(n_iter):
        print(iteration)
        centroid_history.append(current_centroids)
        print("Iteration %d, Finding centroids for all samples..." % iteration)
        clusters = find_closest_centroids(current_centroids)
        print("Recompute centroids...")
        current_centroids = get_centroids(clusters)

    return clusters, centroid_history

In [75]:
clusters, centroid_history = run_k_means(10, 10)

0
Iteration 0, Finding centroids for all samples...
Recompute centroids...




1
Iteration 1, Finding centroids for all samples...
Recompute centroids...
2
Iteration 2, Finding centroids for all samples...
Recompute centroids...
3
Iteration 3, Finding centroids for all samples...
Recompute centroids...
4
Iteration 4, Finding centroids for all samples...
Recompute centroids...
5
Iteration 5, Finding centroids for all samples...
Recompute centroids...
6
Iteration 6, Finding centroids for all samples...
Recompute centroids...
7
Iteration 7, Finding centroids for all samples...
Recompute centroids...
8
Iteration 8, Finding centroids for all samples...
Recompute centroids...
9
Iteration 9, Finding centroids for all samples...
Recompute centroids...


In [86]:
index_lists = []
for i in range(10):
    index_lists.append([])

for i in range(len(clusters)):
    index_lists[clusters[i]].append(i)

for i in range(len(index_lists)):
    print(i, len(index_lists[i]))

for i in range(len(index_lists[6])):
    print(Y[index_lists[6][i]])
    print()

0 145
1 131
2 468
3 78
4 726
5 0
6 50
7 34
8 796
9 116
All documents from March 1 through December 31, 2003 concerned with
discussions with the United Kingdom regarding 1) the establishment of the Coalition Provisional Authority in Iraq; and 2) the legal status of the CPA.

A document entitled “Agreement Between the Government of the United States of America and the Government of the State of Kuwait Concerning Defense Cooperation , Use of Facilities, Logistical Support, Propositioning of Defense Materials and the Status of the Forces of the United States of America in the State of Kuwait,” signed on or around’ September 19, 1991.

The Februray 1, 2003, twenty-five page proposed draft of Secretary of State Colin Powell's presentation to the United Nations(UN) regarding Iraq.

Disclosure of any documents prepared by the Office of International Security Affairs and the Joint Staff in late January 1977 produced in response to queries from President Carter and the National Security Advisor 