In [1]:
import os, sys
import pickle
import numpy as np

import pandas as pd

from sklearn.feature_selection import chi2, SelectKBest

In [90]:
DATADIR = '../data'
INPATH = os.path.join(DATADIR, 'train_data.pkl')
VECTORIZER = os.path.join(DATADIR, 'vectorizer.pkl')

In [91]:
def load_data():
    with open(INPATH, 'rb') as f:
        X, y = pickle.load(f)
    with open(VECTORIZER, 'rb') as f:
        vectorizer = pickle.load(f)
    return X, y, vectorizer

In [92]:
X, y, vectorizer = load_data()

In [14]:
ch2 = SelectKBest(chi2, 100)
ch2.fit(X, y)

SelectKBest(k=100, score_func=<function chi2 at 0x1113fd7b8>)

In [54]:
def get_feature_scores(vectorizer, ch2):
    feature_names = vectorizer.get_feature_names()
    
    # (index, score)
    top_ranked = [(index, score) for (index, score)
                    in enumerate(ch2.scores_)]

    # Sort by score
    term_scores = [{'term': feature_names[idx], 'score': score} for (idx, score) in top_ranked]
    df = pd.DataFrame.from_dict(term_scores)
    df = df[['term', 'score']]
    return df


In [55]:
term_scores = get_feature_scores(vectorizer, ch2)
term_scores

Unnamed: 0,term,score
0,100,0.088620
1,130,0.001175
2,130 underlying,0.030983
3,130 underlying medical,0.030983
4,abd,0.067932
5,abd pelvis,0.084539
6,abd pelvis contrast,0.079773
7,abdomen,0.007606
8,abdomen contrast,0.020883
9,abdomen contrast ct,0.005735


In [61]:
df.to_csv('../data/term_scores.tsv', sep='\t')

In [5]:
# Read it back into a dataframe
df = pd.read_csv('../data/term_scores.tsv', sep='\t')
df.sort_values(by='score', ascending=False)

Unnamed: 0.1,Unnamed: 0,term,score
117,117,collection,1.573122e+01
288,288,hematoma,9.260500e+00
260,260,fluid collection,7.408395e+00
214,214,drainage,5.211354e+00
213,213,drain,4.842924e+00
259,259,fluid,4.317158e+00
115,115,cm,3.840119e+00
118,118,collections,3.587309e+00
106,106,chest,2.980272e+00
261,261,fluid collections,2.899816e+00


In [62]:
# Now let's transform X and y into a matrix and save it to load into R

In [93]:
data = pd.SparseDataFrame(X).fillna(0)

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,672,673,674,675,676,677,678,679,680,681
0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,3.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,2.0,2.0,0.0,2.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,2.0,2.0,0.0,2.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


In [94]:
data.columns = vectorizer.get_feature_names()

In [95]:
data.head()

Unnamed: 0,100,130,130 underlying,130 underlying medical,abd,abd pelvis,abd pelvis contrast,abdomen,abdomen contrast,abdomen contrast ct,...,wall thickening,wet,wet read,windows,woman,year,year old,year old male,year old man,year old woman
0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,3.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,2.0,2.0,0.0,2.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,2.0,2.0,0.0,2.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


In [96]:
data['classification'] = y

In [97]:
data.head()

Unnamed: 0,100,130,130 underlying,130 underlying medical,abd,abd pelvis,abd pelvis contrast,abdomen,abdomen contrast,abdomen contrast ct,...,wet,wet read,windows,woman,year,year old,year old male,year old man,year old woman,classification
0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,3.0,1.0,1.0,...,0.0,0.0,1.0,0.0,2.0,2.0,0.0,2.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,1.0,...,0.0,0.0,1.0,0.0,2.0,2.0,0.0,2.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0


In [98]:
data.to_dense().to_csv('../data/data_matrix.tsv', sep='\t')

In [82]:
data.fillna(0)

Unnamed: 0,100,130,130 underlying,130 underlying medical,abd,abd pelvis,abd pelvis contrast,abdomen,abdomen contrast,abdomen contrast ct,...,wet,wet read,windows,woman,year,year old,year old male,year old man,year old woman,classification
0,0.000000,0.042513,0.045337,0.045337,0.000000,0.000000,0.000000,0.061164,0.031969,0.034854,...,0.000000,0.000000,0.042042,0.000000,0.040552,0.040627,0.000000,0.060275,0.000000,0
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.039816,0.031216,0.034033,...,0.000000,0.000000,0.041052,0.000000,0.019798,0.019835,0.000000,0.029428,0.000000,0
2,0.083563,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.027678,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.027526,0.027576,0.000000,0.040914,0.000000,0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.119216,0.046734,0.050952,...,0.000000,0.000000,0.061459,0.000000,0.059281,0.059390,0.000000,0.088113,0.000000,0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.049535,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.024631,0.024677,0.000000,0.036611,0.000000,0
5,0.000000,0.000000,0.000000,0.000000,0.042306,0.000000,0.000000,0.076801,0.080285,0.043765,...,0.000000,0.000000,0.052791,0.000000,0.025460,0.025507,0.000000,0.037843,0.000000,1
6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.064251,0.033583,0.036613,...,0.000000,0.000000,0.000000,0.000000,0.042599,0.042677,0.059322,0.031659,0.000000,0
7,0.000000,0.048151,0.051349,0.051349,0.038161,0.000000,0.000000,0.092367,0.036209,0.039477,...,0.000000,0.000000,0.000000,0.000000,0.022965,0.023007,0.000000,0.034135,0.000000,0
8,0.000000,0.051015,0.000000,0.000000,0.000000,0.000000,0.000000,0.097860,0.038362,0.041824,...,0.000000,0.000000,0.050450,0.000000,0.048662,0.048751,0.000000,0.072329,0.000000,0
9,0.000000,0.000000,0.000000,0.000000,0.035912,0.000000,0.000000,0.108655,0.000000,0.000000,...,0.000000,0.000000,0.044812,0.000000,0.043224,0.043303,0.060193,0.032123,0.000000,0


In [None]:
data.replace