In [1]:
import numpy as np
import sklearn
import scipy
import matplotlib.pyplot as plt
# Keep the star import ahead of the explicit imports below so that any name it
# happens to export is overridden by them, exactly as before.
from load_data import *
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
import time

## SNIPS

### Preprocessing

In [2]:
# Relative paths to the SNIPS train and test CSV files.
TRAIN_PATH = './snips/snips_train_actual.csv'
TEST_PATH = './snips/snips_test_actual.csv'

In [3]:
# Build the loader from the train and test paths; the middle argument is passed as None.
# NOTE(review): the middle argument is presumably a validation-file path, which would be
# why split_train_valid is called next -- confirm in load_data.
# NOTE(review): valid_size=0.05 presumably holds out 5% of the training rows and
# keep_class_ratios=True presumably keeps the label distribution in both parts -- confirm.
data_loader = SnipsDataLoader(TRAIN_PATH, None, TEST_PATH)
data_loader.split_train_valid(valid_size=0.05, keep_class_ratios=True)

In [4]:
# Pull the (inputs, labels) pairs for the training and validation parts from the loader.
X_train, y_train = data_loader.get_train_data()
X_valid, y_valid = data_loader.get_valid_data()

In [5]:
# Hand the training and validation inputs to the extractor, run feature extraction,
# and read back the encodings for both parts.
# NOTE(review): keep_words_threshold=5 presumably drops words occurring fewer than
# 5 times -- confirm in load_data.
# Caution: X_train / X_valid are rebound from the loader's inputs to their encodings,
# so this cell is not idempotent. Re-run the get_train_data()/get_valid_data() cell
# before executing this one a second time.
feature_extractor = FeatureExtractor(X_train, X_valid)
feature_extractor.extract_features(keep_words_threshold=5)
X_train = feature_extractor.get_train_encodings()
X_valid = feature_extractor.get_valid_encodings()

### Training

In [14]:
# Build and fit a StandardScaler -> SVC pipeline, reporting elapsed time twice:
# the first print covers pipeline construction only, the second is cumulative
# since `start` (construction + fit).
# time.perf_counter() is used instead of time.time() because it is monotonic and
# high-resolution, so the measured interval cannot be skewed by system clock changes.
start = time.perf_counter()
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
print("time_elapsed: %f"%(time.perf_counter() - start))
clf.fit(X_train, y_train)
print("time_elapsed: %f"%(time.perf_counter() - start))

time_elapsed: 0.001826
time_elapsed: 251.068045


### Evaluating

In [16]:
# Score of the fitted pipeline on the training data (mean accuracy for a classifier pipeline).
clf.score(X_train, y_train)

0.989384450893539

In [18]:
# Score of the fitted pipeline on the held-out validation data (mean accuracy for a classifier pipeline).
clf.score(X_valid, y_valid)

0.9289855072463769

In [31]:
# Weighted-average F1 of the fitted pipeline on the training and validation data.
# f1_score is imported explicitly from sklearn.metrics: a bare `import sklearn` does
# not guarantee that the `sklearn.metrics` submodule is loaded, so attribute access
# through the package only works when some other import happens to load it first.
f1_score_train = f1_score(y_train, clf.predict(X_train), average='weighted')
f1_score_val = f1_score(y_valid, clf.predict(X_valid), average='weighted')



In [32]:
# Report the weighted F1 scores stored in f1_score_train / f1_score_val.
print("f1_score_train: ", f1_score_train)
print("f1_score_val: ", f1_score_val)

f1_score_train:  0.9893830851950337
f1_score_val:  0.9291478910449119


## ATIS (mini)

### Preprocessing

In [33]:
# Relative paths to the ATIS train and test CSV files.
# Caution: this rebinds the same TRAIN_PATH / TEST_PATH names used for SNIPS above,
# so re-running an earlier SNIPS cell after this one will read the ATIS files instead.
TRAIN_PATH = './atis/atis_train_actual.csv'
TEST_PATH = './atis/atis_test_actual.csv'

In [34]:
# Build the loader from the ATIS train and test paths; the middle argument is passed as None.
# NOTE(review): SnipsDataLoader is reused for the ATIS files -- presumably both
# datasets share one CSV layout; confirm in load_data.
# NOTE(review): valid_size=0.05 presumably holds out 5% of the training rows and
# keep_class_ratios=True presumably keeps the label distribution in both parts -- confirm.
data_loader = SnipsDataLoader(TRAIN_PATH, None, TEST_PATH)
data_loader.split_train_valid(valid_size=0.05, keep_class_ratios=True)

In [35]:
# Pull the (inputs, labels) pairs for the ATIS training and validation parts.
# These rebind the X_train / y_train / X_valid / y_valid names used for SNIPS above.
X_train, y_train = data_loader.get_train_data()
X_valid, y_valid = data_loader.get_valid_data()

In [36]:
# Hand the training and validation inputs to the extractor, run feature extraction,
# and read back the encodings for both parts.
# NOTE(review): keep_words_threshold=5 presumably drops words occurring fewer than
# 5 times -- confirm in load_data.
# Caution: X_train / X_valid are rebound from the loader's inputs to their encodings,
# so this cell is not idempotent. Re-run the get_train_data()/get_valid_data() cell
# before executing this one a second time.
feature_extractor = FeatureExtractor(X_train, X_valid)
feature_extractor.extract_features(keep_words_threshold=5)
X_train = feature_extractor.get_train_encodings()
X_valid = feature_extractor.get_valid_encodings()

### Training

In [37]:
# Build and fit a StandardScaler -> SVC pipeline, reporting elapsed time twice:
# the first print covers pipeline construction only, the second is cumulative
# since `start` (construction + fit).
# time.perf_counter() is used instead of time.time() because it is monotonic and
# high-resolution, so the measured interval cannot be skewed by system clock changes.
start = time.perf_counter()
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
print("time_elapsed: %f"%(time.perf_counter() - start))
clf.fit(X_train, y_train)
print("time_elapsed: %f"%(time.perf_counter() - start))

time_elapsed: 0.007082
time_elapsed: 10.476098


### Evaluating

In [38]:
# Score of the fitted pipeline on the training data (mean accuracy for a classifier pipeline).
clf.score(X_train, y_train)

0.9897648083623694

In [39]:
# Score of the fitted pipeline on the held-out validation data (mean accuracy for a classifier pipeline).
clf.score(X_valid, y_valid)

0.9214876033057852

In [40]:
# Weighted-average F1 of the fitted pipeline on the training and validation data.
# f1_score is imported explicitly from sklearn.metrics: a bare `import sklearn` does
# not guarantee that the `sklearn.metrics` submodule is loaded, so attribute access
# through the package only works when some other import happens to load it first.
f1_score_train = f1_score(y_train, clf.predict(X_train), average='weighted')
f1_score_val = f1_score(y_valid, clf.predict(X_valid), average='weighted')




In [41]:
# Report the weighted F1 scores stored in f1_score_train / f1_score_val.
print("f1_score_train: ", f1_score_train)
print("f1_score_val: ", f1_score_val)

f1_score_train:  0.989719490374377
f1_score_val:  0.9078530032323477
