In [141]:
from sklearn import *
import sklearn
from helpers import *
import pandas as pd
import numpy as np

# Load the training table through the project helper `get_full_table`.
# NOTE(review): presumably this joins the variants file with the text file;
# the cells below only rely on it exposing 'Class' and 'Text' columns — confirm
# against helpers.
train_full = get_full_table('./input/training_variants', './input/training_text')

In [142]:
# One long document per class: for each label 1..9, every 'Text' entry of that
# class is concatenated with single spaces. Index k holds the text of class k+1.
# (Despite the name, this is a list, not a dict.)
class_dict = [
    ' '.join(train_full.loc[train_full['Class'] == label, 'Text'])
    for label in range(1, 10)
]

In [143]:
# TF-IDF over word unigrams and bigrams, capped at 16000 vocabulary terms.
# max_df=0.8 drops terms that occur in more than 80% of the fitted documents;
# min_df=0.0 applies no lower document-frequency cutoff.
tfidf = feature_extraction.text.TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    strip_accents='unicode',
    min_df=0.0,
    max_df=0.8,
    max_features=16000,
)

In [None]:
# Learn the vocabulary and IDF weights from the nine per-class documents.
tfidf.fit(class_dict)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=16000, min_df=0.0,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [None]:
# TF-IDF matrix with one row per entry of class_dict (row k <-> class k+1).
# Later cells call .toarray() on its rows, i.e. it is used as a sparse matrix.
X = tfidf.transform(class_dict)

In [None]:
def top_tfidf_feats(row, features, top_n=10):
    """Return the highest-scoring features of one TF-IDF row.

    Parameters
    ----------
    row : array-like
        TF-IDF scores for a single document. Any shape that flattens to one
        score per feature is accepted, e.g. ``(n,)`` or ``(1, n)``.
    features : sequence
        Feature names, indexed in the same order as ``row``.
    top_n : int, default 10
        Maximum number of features to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``tfidf``, ordered by descending score, with
        at most ``top_n`` rows. Empty input yields an empty frame that still
        has both columns.
    """
    # Flatten once and use the same array for ranking and lookup. Ranking a
    # squeezed copy while indexing the unsqueezed row fails on (1, n) input.
    scores = np.asarray(row).ravel()
    top_ids = np.argsort(scores)[::-1][:top_n]
    top_features = [(features[i], scores[i]) for i in top_ids]
    # Passing columns= here (instead of assigning df.columns afterwards)
    # keeps the frame well-formed when there are no rows.
    return pd.DataFrame(top_features, columns=['feature', 'tfidf'])
                         
def top_feats_by_class(X, vectorizer=None):
    """Collect the top TF-IDF features for every row (class) of ``X``.

    Parameters
    ----------
    X : sparse matrix
        TF-IDF matrix supporting ``X[i, :].toarray()``; row ``k`` is treated
        as class ``k + 1``.
    vectorizer : optional
        Fitted vectorizer that supplies the feature names. Defaults to the
        notebook-level ``tfidf``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per row of ``X``, as produced by ``top_tfidf_feats``, each
        carrying its 1-based class number in a ``label`` attribute.
    """
    vec = tfidf if vectorizer is None else vectorizer
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on installations that predate it.
    if hasattr(vec, 'get_feature_names_out'):
        features = vec.get_feature_names_out()
    else:
        features = vec.get_feature_names()
    dfs = []
    # Iterate over however many rows X has instead of assuming exactly nine.
    for i in range(1, X.shape[0] + 1):
        row = X[i - 1, :].toarray().ravel()
        df = top_tfidf_feats(row, features)
        # Plain Python attribute, not a column; plot_tfidf_classfeats_h reads
        # it back to title each subplot.
        df.label = i
        dfs.append(df)
    return dfs

def plot_tfidf_classfeats_h(dfs):
    """Draw one horizontal bar chart of top TF-IDF features per class.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames with ``feature`` and ``tfidf`` columns and a ``label``
        attribute, as returned by ``top_feats_by_class``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. Nothing is drawn when
        ``dfs`` is empty.
    """
    import matplotlib.pyplot as plt

    if not dfs:
        return

    # Three charts per row. The grid is never smaller than 3x3 (the layout
    # used for nine classes) and grows by whole rows for more frames.
    n_cols = 3
    n_rows = max(3, (len(dfs) + n_cols - 1) // n_cols)

    fig = plt.figure(figsize=(12, 12), facecolor="w")
    for i, df in enumerate(dfs):
        # Bar positions are derived per frame so frames with differing row
        # counts still plot.
        y_pos = np.arange(len(df))
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        ax.set_xlabel("Tf-Idf Score", labelpad=16, fontsize=16)
        ax.set_ylabel("Feature", labelpad=16, fontsize=16)
        ax.set_title("Class = " + str(df.label), fontsize=18)
        ax.barh(y_pos, df['tfidf'], align='center')
        ax.set_yticks(y_pos)
        ax.set_ylim([-1, len(df)])
        ax.set_yticklabels(df['feature'])
    # Figure-level spacing only needs to be set once, after all axes exist.
    # NOTE(review): tight_layout() below recomputes the subplot margins, so
    # this call probably has no visible effect — confirm and drop if so.
    plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.tight_layout()
    #plt.savefig('./input/feature_by_class.pdf', dpi=300)
    plt.show()

In [None]:
# Rank the top TF-IDF features of each class, then show them as a grid of
# horizontal bar charts (one subplot per class).
dfs = top_feats_by_class(X)
plot_tfidf_classfeats_h(dfs)

In [None]:
from math import *
# Pairwise Euclidean distances between the per-class TF-IDF rows of X.
# Densify once up front instead of converting two sparse rows to dense inside
# the inner loop, which repeated the same conversions for every pair.
dense_rows = X.toarray()
n_classes = dense_rows.shape[0]
m = np.zeros((n_classes, n_classes))
for i in range(n_classes):
    # The distance is symmetric and the diagonal is zero, so only the upper
    # triangle is computed and then mirrored.
    for j in range(i + 1, n_classes):
        m[i, j] = m[j, i] = np.linalg.norm(dense_rows[i] - dense_rows[j])
# Spot checks; m is 0-indexed, so these are class pairs (1, 4), (2, 7), (3, 8).
print(m[0][3])
print(m[1][6])

print(m[2][7])

In [None]:
import matplotlib.pyplot as plt
# Heatmap of the class-to-class distance matrix m.
fig, ax = plt.subplots()
im = ax.imshow(m, cmap=plt.cm.PiYG)
# Tick k on either axis corresponds to class k+1; the count follows m's size.
classes = range(1, m.shape[0] + 1)
tick_marks = np.arange(len(classes))
ax.set_xticks(tick_marks)
ax.set_xticklabels(classes)
ax.set_yticks(tick_marks)
ax.set_yticklabels(classes)
ax.set_xlabel("Class")
ax.set_ylabel("Class")
ax.set_title("Euclidean distance between class TF-IDF vectors")
# Without a colorbar the colours cannot be mapped back to distance values.
fig.colorbar(im, ax=ax)
#plt.savefig('./input/tfidf_distance.pdf', dpi=300)
plt.show()

In [None]:
# Display the vectorizer's full parameter set (explicit and default values).
tfidf.get_params()