# Exploratory Data A

In [1]:
import os
import numpy as np
import pandas as pd

import cPickle as pkl

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.metrics.pairwise import cosine_similarity

## Load data

In [2]:
os.chdir('data/')

data = []
for filename in os.listdir(os.getcwd()):
    if filename[-4:] == '.pkl':
        print 'Reading', filename, 
        with open(filename) as f:
            currentData = pkl.load(f)
        print "   ", len(currentData), "articles"
        data += currentData
        
os.chdir('../')

Reading Machine_learning_algorithms.pkl     94 articles
Reading Organs_(anatomy).pkl     420 articles
Reading Congenital_disorders.pkl     636 articles
Reading Medical_devices.pkl     182 articles
Reading Infectious_diseases.pkl     1067 articles
Reading Rare_diseases.pkl     907 articles
Reading Cancer.pkl     816 articles


In [3]:
article_text = []
other_numbers = []  # to store image, equation, link, and citation counts
target = []

for item in data:
    article_text.append(item[0])
    other_numbers.append([item[1], item[2], item[3], item[4], item[5]])
    target.append(item[5])

In [6]:
type(other_numbers[100:])

list

In [7]:
other_numbers[100:][-1]

[8, 0, 328, 79, 'Cancer']

In [8]:
df = pd.DataFrame(other_numbers)
df.columns = ["num_non_math_images", 'num_of_math_eqns', 'num_links', 'num_citations', 'category']

In [9]:
df.groupby(['category']).std()

Unnamed: 0_level_0,num_non_math_images,num_of_math_eqns,num_links,num_citations
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Cancer,9.249772,0.24991,328.478893,36.114544
Congenital_disorders,6.725081,0.0,342.912867,29.105413
Infectious_diseases,10.604451,0.0,292.786935,44.013714
Machine_learning_algorithms,3.017849,38.624621,88.327641,10.994539
Medical_devices,6.975442,1.132688,190.325188,25.065854
Organs_(anatomy),13.806107,0.084313,321.661958,34.892504
Rare_diseases,2.432773,0.066409,209.100502,29.104963


In [10]:
categories = set(df['category'])
categories = sorted(categories)
print categories

['Cancer', 'Congenital_disorders', 'Infectious_diseases', 'Machine_learning_algorithms', 'Medical_devices', 'Organs_(anatomy)', 'Rare_diseases']


In [31]:
means = df.groupby(['category']).mean().values
medians = df.groupby(['category']).median().values
stdevs = df.groupby(['category']).std().values

for i in xrange(len(means)):
    print categories[i],
    for j in xrange(4):
        print "| %.1f, %.1f, %.1f" % (means[i][j], medians[i][j], stdevs[i][j]), 
    print ''

Cancer | 6.5, 5.0, 9.2 | 0.0, 0.0, 0.2 | 207.9, 87.0, 328.5 | 19.6, 7.0, 36.1 
Congenital_disorders | 5.7, 4.0, 6.7 | 0.0, 0.0, 0.0 | 219.1, 139.5, 342.9 | 15.0, 6.0, 29.1 
Infectious_diseases | 6.4, 5.0, 10.6 | 0.0, 0.0, 0.0 | 266.6, 185.0, 292.8 | 23.1, 7.0, 44.0 
Machine_learning_algorithms | 5.0, 4.0, 3.0 | 18.3, 0.0, 38.6 | 86.4, 54.0, 88.3 | 7.0, 3.0, 11.0 
Medical_devices | 10.2, 6.5, 7.0 | 0.1, 0.0, 1.1 | 202.9, 177.0, 190.3 | 16.9, 7.0, 25.1 
Organs_(anatomy) | 8.6, 6.0, 13.8 | 0.0, 0.0, 0.1 | 239.4, 149.0, 321.7 | 21.2, 7.0, 34.9 
Rare_diseases | 5.0, 4.0, 2.4 | 0.0, 0.0, 0.1 | 251.6, 199.0, 209.1 | 18.4, 9.0, 29.1 


In [36]:
print df.mean().values
print df.median().values
print df.std().values

[   6.34764677    0.42600679  234.64653081   19.29767103]
[   5.     0.   151.5    7. ]
[   8.81518872    6.42100434  290.83519041   35.22531131]


In [13]:

for i, item in enumerate(means):
    print categories[i], "|", 
    for num in item:
        print "%10.2f" %num, "|", 
    print ''

Cancer |       6.50 |       0.01 |     207.94 |      19.60 | 
Congenital_disorders |       5.66 |       0.00 |     219.13 |      14.97 | 
Infectious_diseases |       6.41 |       0.00 |     266.55 |      23.13 | 
Machine_learning_algorithms |       5.01 |      18.34 |      86.41 |       7.04 | 
Medical_devices |      10.20 |       0.10 |     202.86 |      16.92 | 
Organs_(anatomy) |       8.57 |       0.01 |     239.41 |      21.16 | 
Rare_diseases |       4.95 |       0.00 |     251.55 |      18.44 | 


In [14]:

for i, item in enumerate(stdevs):
    print categories[i], "|", 
    for num in item:
        print "%10.2f" %num, "|", 
    print ''

Cancer |       9.25 |       0.25 |     328.48 |      36.11 | 
Congenital_disorders |       6.73 |       0.00 |     342.91 |      29.11 | 
Infectious_diseases |      10.60 |       0.00 |     292.79 |      44.01 | 
Machine_learning_algorithms |       3.02 |      38.62 |      88.33 |      10.99 | 
Medical_devices |       6.98 |       1.13 |     190.33 |      25.07 | 
Organs_(anatomy) |      13.81 |       0.08 |     321.66 |      34.89 | 
Rare_diseases |       2.43 |       0.07 |     209.10 |      29.10 | 


### Split data into train and test and iterate over different values of tf-idf dimensions

In [9]:
wordnet = WordNetLemmatizer()

def my_tokenize(doc):
    tok = word_tokenize(doc)
    return[wordnet.lemmatize(x) for x in tok]

In [52]:
count_vect = CountVectorizer(stop_words = 'english', max_features = 700, tokenizer = my_tokenize)
X_counts = count_vect.fit_transform(article_text)

tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [53]:
X_tfidf.shape

(4122, 700)

In [78]:
indices = []
category_name = []

count = 0
for i in xrange(len(other_numbers)-1):
    count += 1

    if other_numbers[i][-1] != other_numbers[i+1][-1]:
        indices.append(count)
        category_name.append(other_numbers[i][-1])
indices.append(count)

In [81]:
print indices
print category_name

[94, 514, 1150, 1332, 2399, 3306, 4121]
['Machine_learning_algorithms', 'Organs_(anatomy)', 'Congenital_disorders', 'Medical_devices', 'Infectious_diseases', 'Rare_diseases']


In [68]:
other_numbers[93][-1]

'Machine_learning_algorithms'

In [72]:
mean_vectors = []
start = 0
for end in indices:
    mean_vectors.append(np.mean((X_tfidf[start:end, :]).todense(), axis = 0))
    start = end

In [86]:
similar_tups = []
for i in xrange(len(mean_vectors)):
    for j in xrange(i+1, len(mean_vectors)):
        if j == len(mean_vectors) -1:
            break
        similar_tups.append((category_name[i], category_name[j], cosine_similarity(mean_vectors[i], mean_vectors[j])))

In [91]:
similar_tups = sorted(similar_tups, key = lambda x: x[2], reverse = True)
for tup in similar_tups:
    print " %0.4f | %s | %s |" %(tup[2], tup[1], tup[0] )

 0.8998 | Rare_diseases | Congenital_disorders |
 0.5518 | Rare_diseases | Infectious_diseases |
 0.5065 | Rare_diseases | Organs_(anatomy) |
 0.5064 | Congenital_disorders | Organs_(anatomy) |
 0.4829 | Infectious_diseases | Congenital_disorders |
 0.4776 | Infectious_diseases | Medical_devices |
 0.4759 | Medical_devices | Organs_(anatomy) |
 0.4697 | Infectious_diseases | Organs_(anatomy) |
 0.4002 | Medical_devices | Congenital_disorders |
 0.3953 | Rare_diseases | Medical_devices |
 0.2265 | Medical_devices | Machine_learning_algorithms |
 0.2082 | Organs_(anatomy) | Machine_learning_algorithms |
 0.1969 | Rare_diseases | Machine_learning_algorithms |
 0.1925 | Congenital_disorders | Machine_learning_algorithms |
 0.1767 | Infectious_diseases | Machine_learning_algorithms |
