In [121]:
import pandas as pd
from sklearn.datasets import dump_svmlight_file, load_svmlight_file
from sklearn.preprocessing import LabelEncoder
import operator
import os
from collections import Counter
import re
import numpy as np

In [18]:
from matplotlib import pyplot as plt

In [48]:
# Input CSV paths, one per dataset name; every file follows the
# data/<name>_with_class.csv naming scheme.
chemia_files = list(map('data/{0}_with_class.csv'.format, (
    'estate',
    'extended',
    'fingerprinter',
    'maccs',
    'graphonly',
    'KRFP',
    'pubchem',
    'substructure',
)))

In [142]:
# Bind the first dataset path to a notebook-level name (presumably kept for
# ad-hoc experimentation); the conversion loop further down rebinds `chemia_file`.
chemia_file = chemia_files[0]

In [155]:
def _class_letters(label):
    """Return the one- or two-letter class code from a label that starts with '(XX)'.

    Raises:
        ValueError: if `label` does not start with a parenthesised code of one
            or two upper-case letters.
    """
    match = re.match(r'\(([A-Z]{1,2})\)', label)
    if match is None:
        raise ValueError('unrecognised class label: %r' % (label,))
    return match.group(1)


def csv_to_svmlight(chemia_file):
    """Convert one headerless CSV into a svmlight file limited to the 10 most frequent classes.

    The first CSV column is used as the index, the last column holds the class
    label (only its leading '(XX)' code is used) and every column in between is
    a feature cast to float. Rows are kept when all of their features are
    finite and their class is among the 10 most frequent ones. The result is
    written next to the input, with everything from the first underscore of
    the path up to '.csv' replaced by '_top_10.libsvm', and the total feature
    count is stored as the file comment.

    Args:
        chemia_file: path of the CSV file to convert.

    Raises:
        ValueError: if a class label does not start with a '(XX)' code.
        AssertionError: if the file read back has a different shape than the
            matrix that was written.
    """
    chemia_df = pd.read_csv(chemia_file, header=None)
    chemia_df = chemia_df.set_index(chemia_df.columns[0], drop=True)

    values = chemia_df.values
    X = values[:, :-1].astype(float)
    y = values[:, -1]

    print(chemia_file)

    # A row is usable only when *every* feature is finite; `.any` would keep
    # rows that still contain NaN/inf values.
    not_nan_index = np.isfinite(X).all(axis=1)
    print(not_nan_index.sum())

    # A list comprehension (not a bare `map`) so an array of strings is built
    # on Python 3 as well as Python 2.
    y_letters = np.array([_class_letters(label) for label in y])
    top_10_classes = sorted(letters for letters, _ in Counter(y_letters).most_common(10))
    top_10_index = pd.Series(y_letters).isin(top_10_classes).values

    index = top_10_index & not_nan_index
    print('taking %d out of %d examples' % (index.sum(), len(index)))

    X_top_10 = X[index]
    y_top_10 = LabelEncoder().fit(top_10_classes).transform(y_letters[index])

    filename = re.sub(r'_.*\.csv', '_top_10.libsvm', chemia_file)
    dump_svmlight_file(X_top_10, y_top_10, f=filename, comment='%d' % X.shape[1])

    # Round-trip check: recover the feature count from the comment line and
    # make sure the file loads back with the shape that was written.
    n_features = None
    with open(filename, 'r') as svmlight_file:
        # The first three lines are skipped (presumably the header that
        # dump_svmlight_file emits itself); the fourth carries our comment.
        for _ in range(3):
            next(svmlight_file)
        n_features_line = next(svmlight_file)

        match = re.match(r'#\s([0-9]+)', n_features_line)
        if match:
            n_features = int(match.group(1))
    X_saved, y_saved = load_svmlight_file(filename, n_features=n_features)
    assert X_saved.shape == X_top_10.shape


In [147]:
# NOTE(review): `X` is not assigned at notebook scope in any visible cell (it is
# only a local inside csv_to_svmlight), so this cell presumably relied on
# leftover kernel state and would fail after Restart & Run All.
X.astype(float)

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [156]:
# Convert every dataset in turn. csv_to_svmlight already prints the path it is
# processing, so the extra print below makes each path appear twice in the output.
for chemia_file in chemia_files:
    csv_to_svmlight(chemia_file)
    print(chemia_file)

data/estate_with_class.csv
3695
taking 2720 out of 3696 examples
data/estate_with_class.csv
data/extended_with_class.csv
3696
taking 2721 out of 3696 examples
data/extended_with_class.csv
data/fingerprinter_with_class.csv
3696
taking 2721 out of 3696 examples
data/fingerprinter_with_class.csv
data/maccs_with_class.csv
3696
taking 2721 out of 3696 examples
data/maccs_with_class.csv
data/graphonly_with_class.csv
3696
taking 2721 out of 3696 examples
data/graphonly_with_class.csv
data/KRFP_with_class.csv
3696
taking 2721 out of 3696 examples
data/KRFP_with_class.csv
data/pubchem_with_class.csv
3696
taking 2721 out of 3696 examples
data/pubchem_with_class.csv
data/substructure_with_class.csv
3696
taking 2721 out of 3696 examples
data/substructure_with_class.csv


In [158]:
# Write the generated .libsvm paths, one per line (no trailing newline). The
# substitution mirrors the output-name rule used inside csv_to_svmlight; the
# pattern is a raw string so `\.` is not treated as an invalid escape sequence.
with open('data/datasets_chemia.txt', 'w') as f:
    f.write('\n'.join([re.sub(r'_.*\.csv', '_top_10.libsvm', chemia_file) for chemia_file in chemia_files]))