# Code In PyCharm
fasttext.py

In [1]:
from gensim.test.utils import get_tmpfile
from gensim.models import FastText
import pandas as pd
import time
import numpy as np
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
from sklearn import preprocessing
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix, precision_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sbrn
from sklearn import tree
import string
import re
from sklearn.model_selection import KFold
#import pickle
import _pickle as cPickle #cPickle in Python3 comes as a library _pickle



### Definitions

In [2]:
%config IPCompleter.greedy=True #press TAB to auto complete
VECTOR_DIM = 100
failed_indexes = [] # failed sentences while fasttext tries to convert 

### Helper functions

In [4]:
def sentence_vectorizer(input_array, operand='Average'):
    if operand == 'Average':
        sentence_vector = np.average(input_array, axis=0)
    elif operand == 'Squares':
        sentence_vector = np.sqrt(np.sum(np.square(input_array), axis=0))
    elif operand == 'Both':
        sentence_vector = np.concatenate((np.average(input_array, axis=0), np.sqrt(np.sum(np.square(input_array), axis=0))), axis=0)
    return sentence_vector

def weight_generator(size, default_weight, *weights):
    weight_array = []
    for i in range(size - len(weights)):
        weight_array.append(default_weight)
    for weight in weights:
        weight_array.append(weight)
    return weight_array

In [32]:
df = pd.read_csv("7_months_v2.csv", sep=";", skipinitialspace=True, engine='python') #engine='python' required to run in ipython
df_processed = df.copy()
df_processed['sentence'] = df_processed['sentence'].str.replace('[{}]'.format(string.punctuation), ' ')
print("Punctuations are removed from sentences. Size: {}".format(df_processed.shape))

Punctuations are removed from sentences. Size: (59046, 11)


### FastText

In [None]:
def create_fasttext_model():  # took 34 mins to create
	dfCombined = get_text()
	sentences = dfCombined.values.tolist()
	corpus = []
	for sentence in sentences:
		corpus.append(str(sentence).split(' '))

	print(len(corpus))
	model = FastText(corpus, seed=31, min_count=1, size=VECTOR_DIM, sg=1, window=3, hs=1, workers=3)
	model.save("fasttext.model")

	print("Dictionary's Length : " + str(len(model.wv.vocab)))

	return model

- Get vectors of words (sentence) and apply operation to create one N sized vector that represents the sentence.

In [22]:
def convert_to_fasttext_numpy_vector(model, inputs, operation):
	w2v_array = []
	try:
		for line in range(len(inputs)):
			question = str(inputs[line][0]).strip() #[1]

			words = question.split()
			word_array = []
			for word in words:
				try:
					word_array.append(model[word])
				except Exception as ex:
					# print word + "  : " + str(ex)
					word_array.append(np.zeros(VECTOR_DIM))
			line_avg_vector = sentence_vectorizer(word_array, operation)
			w2v_array.append(line_avg_vector)
	except Exception as ex:
		print("Exception occured in convert_to_fasttext_numpy_vector: " + str(ex))
		pass

	numpy.save("7_months_clean" + operation + ".npy", w2v_array)
	return w2v_array

def convert_str_array_to_numpy_vector(model, inputs, source, brand, _type, operation):
	no_exceptions = 0
	i = 0
	w2v_array = []
	try:
		for line in range(len(inputs)):
			question = re.sub(' +', ' ', str(inputs[line]).strip()) #[1], question = str(inputs[line][0]).strip()

			words = question.split()
			word_array = []
			for word in words:
				try:
					fasttext_word = model[word] #Throws exception on unknown words
					word_array.append(fasttext_word)
				except Exception as ex:
					#print('>Sentence: ' + str(question))
					#print('>Words: ' + str(words))
					#print('Exception occured while creating numerical value for word ' + word + ': ' + str(ex))
					no_exceptions += 1
					word_array.append(np.zeros(VECTOR_DIM))
			line_avg_vector = sentence_vectorizer(word_array, operation)
			line_avg_vector[VECTOR_DIM - 3] = source[i]
			line_avg_vector[VECTOR_DIM - 2] = brand[i]
			line_avg_vector[VECTOR_DIM - 1] = _type[i]
			w2v_array.append(line_avg_vector)
			i += 1
	except Exception as ex:
		print("Exception occured in convert_to_fasttext_numpy_vector: " + str(ex))
		pass

	print('# of failed word calculation: ' + str(no_exceptions))
	#numpy.save("vector_" + operation + ".npy", w2v_array)
	return w2v_array

In [6]:
len(df['sentence'].values.tolist())

55817

In [7]:
def getVectorFromSentence(model, sentence, operation):
	str_arr = [[sentence]]
	return convert_to_fasttext_numpy_vector(model, str_arr, operation)

def generateNumpyVectorFromModel(model):
	convert_to_fasttext_numpy_vector(model, get_labeled_text().values.tolist(), 'Average')
	#convert_to_fasttext_numpy_vector(model, get_labeled_text().values.tolist(), 'Squares')
	#convert_to_fasttext_numpy_vector(model, get_labeled_text().values.tolist(), 'Both')

## Load Corpus

In [None]:
start_time = time.time()

# model = create_fasttext_model()
model = FastText.load(get_tmpfile("fasttext.model"))
print("data" + str(model.wv.most_similar(positive=["fatura"], topn=5)))
print("science" + str(model.wv.most_similar(positive=["ücret"], topn=5)))

print("--- Loading fast text model took %s seconds ---" % (time.time() - start_time))

- Convert labels with LabelEncoder, categorical to numerical conversion.

In [None]:
labels = df_processed['label'].unique()
labels[:10,]
#labels

In [None]:
df_processed = df.copy() #rollback to old form
df_processed = df_processed[['sentence','label','source','brand','type']]
df_processed[df_processed.columns] = df_processed.apply(lambda x: x.str.strip()) #strip any whitespace remaining at begin&end
df_processed.head(15)

In [42]:
print('unique sentence count: {}'.format(len(df_processed['sentence'].unique())))
print('unique label count: {}'.format(len(df_processed['label'].unique())))
print('unique source count: {}'.format(len(df_processed['source'].unique())))
print('unique brand count: {}'.format(len(df_processed['brand'].unique())))
print('unique type count: {}'.format(len(df_processed['type'].unique())))

unique sentence count: 50988
unique label count: 113
unique source count: 2
unique brand count: 2
unique type count: 2


In [55]:

le_label = preprocessing.LabelEncoder()
le_source = preprocessing.LabelEncoder()
le_brand = preprocessing.LabelEncoder()
le_type = preprocessing.LabelEncoder()
#le_label.classes_
df_processed['label'] = le_label.fit_transform(df_processed['label'].values)
df_processed['source'] = le_source.fit_transform(df_processed['source'].values)
df_processed['brand'] = le_brand.fit_transform(df_processed['brand'].values)
df_processed['type'] = le_type.fit_transform(df_processed['type'].values)
'''
#save label encoder to disk
df_le_label = 'generations\\label_encoder\\le_label.pkl'
df_le_source = 'generations\\label_encoder\\le_source.pkl'
df_le_brand = 'generations\\label_encoder\\le_brand.pkl'
df_le_type = 'generations\\label_encoder\\le_type.pkl'
with open(df_le_label, 'wb') as fid:
    cPickle.dump(le_label, fid)
with open(df_le_source, 'wb') as fid:
    cPickle.dump(le_source, fid)
with open(df_le_brand, 'wb') as fid:
    cPickle.dump(le_brand, fid)
with open(df_le_type, 'wb') as fid:
    cPickle.dump(le_type, fid)
'''
df_processed.head(15)

Unnamed: 0,sentence,label,source,brand,type
0,"[-0.13886453, -0.5717392, 0.6484746, -0.359751...",1,0,0,0
1,"[-0.21332426, -0.45715767, 0.5091378, -0.22698...",1,0,0,0
2,"[-0.088749014, -0.3895533, 0.5956384, -0.16928...",1,0,0,0
3,"[-0.2687642, -0.21038985, 0.7433767, -0.172685...",5,0,0,0
4,"[-0.25181404, -0.15667057, 0.71771836, -0.2785...",5,0,0,0
5,"[-0.1718809, -0.35072204, 0.47582346, -0.21936...",2,0,0,0
6,"[-0.22355895, -0.34208745, 0.55608106, -0.1980...",4,0,0,0
7,"[-0.20182048, -0.32203072, 0.88891727, -0.2880...",5,0,0,0
8,"[-0.3397589, -0.24134636, 0.82864255, -0.09685...",5,0,0,0
9,"[-0.13908678, -0.3037831, 0.6157272, -0.165231...",5,0,0,1


- Convert categorical input to numerical using fasttext.

In [44]:
start_time = time.time()
#generateNumpyVectorFromModel(model)
df_processed['sentence'] = convert_str_array_to_numpy_vector(model, df_processed['sentence'].values.tolist(), df_processed['source'].values.tolist(), df_processed['brand'].values.tolist(), df_processed['type'].values.tolist(), 'Average')
print("--- Categorical to numerical conversion took %s seconds ---" % (time.time() - start_time))
df_processed.head(15)



# of failed word calculation: 1293
--- Categorical to numerical conversion took 19.74518847465515 seconds ---


Unnamed: 0,sentence,label,source,brand,type
0,"[-0.13886453, -0.5717392, 0.6484746, -0.359751...",1,0,0,0
1,"[-0.21332426, -0.45715767, 0.5091378, -0.22698...",1,0,0,0
2,"[-0.088749014, -0.3895533, 0.5956384, -0.16928...",1,0,0,0
3,"[-0.2687642, -0.21038985, 0.7433767, -0.172685...",5,0,0,0
4,"[-0.25181404, -0.15667057, 0.71771836, -0.2785...",5,0,0,0
5,"[-0.1718809, -0.35072204, 0.47582346, -0.21936...",2,0,0,0
6,"[-0.22355895, -0.34208745, 0.55608106, -0.1980...",4,0,0,0
7,"[-0.20182048, -0.32203072, 0.88891727, -0.2880...",5,0,0,0
8,"[-0.3397589, -0.24134636, 0.82864255, -0.09685...",5,0,0,0
9,"[-0.13908678, -0.3037831, 0.6157272, -0.165231...",5,0,0,1


In [45]:
df_processed['sentence'][11]

array([-0.22160651, -0.3574118 ,  0.6209752 , -0.27269584,  0.14658885,
        0.55882514, -0.15553251, -0.34586847,  0.06274786,  0.24909681,
        0.10814667,  0.25191024,  0.15382251, -0.33343866, -0.01843015,
        0.07476072, -0.3059573 , -0.29260126,  0.25827533, -0.3268766 ,
        0.17340875, -0.11909965,  0.22600158,  0.19398236,  0.33184692,
        0.08558659,  0.1533467 , -0.06008039,  0.10583185, -0.29262555,
       -0.09823685, -0.31461856, -0.23317999, -0.18836676,  0.16585888,
       -0.14726113, -0.06529285,  0.04858396,  0.0595565 ,  0.32360122,
       -0.61748755,  0.2669434 , -0.22801529, -0.03403019, -0.08519905,
       -0.20898668, -0.1954992 , -0.2594698 ,  0.01181354, -0.00330374,
        0.13634294,  0.01627418,  0.35457942,  0.37490052, -0.28058755,
       -0.15595418,  0.2657585 , -0.18274245, -0.26160264,  0.3402638 ,
       -0.28744727, -0.01246708,  0.32949415,  0.34350964,  0.48714378,
       -0.11758769, -0.14325692, -0.03429096,  0.01670733, -0.48

In [46]:
df_processed.shape

(59046, 5)

## Main Code

- Split data.

In [47]:
# Split into train/test
#X = df_processed.drop('label', axis=1)
X = df_processed['sentence']
y = df_processed['label']
print('X.shape: ' + str(X.shape))
print('y.shape: ' + str(y.shape))
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
print('X_train.shape: ' + str(X_train.shape))
print('y_train.shape: ' + str(y_train.shape))
print('X_test.shape: ' + str(X_test.shape))
print('y_test.shape: ' + str(y_test.shape))

X.shape: (59046,)
y.shape: (59046,)
X_train.shape: (44284,)
y_train.shape: (44284,)
X_test.shape: (14762,)
y_test.shape: (14762,)


In [48]:
X_train.head(15).values

array([array([-0.2413266 , -0.48672917,  0.54246897, -0.2245108 ,  0.08491115,
        0.4751419 , -0.3023135 , -0.4417715 , -0.00881081,  0.51322514,
       -0.09165042,  0.22655228,  0.08891266, -0.25855854, -0.1172027 ,
        0.26169094, -0.34509277, -0.04966142,  0.38289878, -0.19733302,
        0.24352117, -0.16880903,  0.5173165 ,  0.24109706,  0.37382695,
       -0.02255666,  0.04275691,  0.0267151 ,  0.20895971, -0.06791739,
       -0.0119238 , -0.04360935, -0.24439348, -0.2178795 ,  0.12160984,
       -0.13407055, -0.09032638, -0.12705067, -0.17173134,  0.26527086,
       -0.5475096 ,  0.1995205 , -0.27610692, -0.08498312, -0.10368767,
       -0.13682099, -0.3504965 , -0.1567296 ,  0.14466612,  0.04077724,
        0.5023666 , -0.07922898,  0.4941962 ,  0.3925435 , -0.35731745,
       -0.32061434, -0.04557199, -0.47219852, -0.5597373 ,  0.10401913,
       -0.49012163,  0.0702223 ,  0.3147318 ,  0.3631564 ,  0.71275455,
       -0.2981039 , -0.2337191 , -0.06235712, -0.0960146 

In [49]:
y_train.head(15).values

array([63, 40, 24, 60, 24, 24, 56, 42,  4, 24, 24, 24, 24, 24, 69])

- Select classifier.

In [50]:
classifier = OneVsOneClassifier(LinearSVC(random_state=42))
#classifier = OneVsRestClassifier(SVC(kernel='rbf', random_state=numpy.random.RandomState(12345678), cache_size=1024))
#classifier = KNeighborsClassifier(n_neighbors=7, weights='uniform', algorithm='ball_tree', leaf_size=60, n_jobs=3)
#classifier = OneVsOneClassifier(LinearSVC(random_state=numpy.random.RandomState(12345678)))

### Run - One Time

In [52]:
start_time = time.time()
classifier = classifier.fit(X_train.values.tolist(), y_train.values.tolist())
print("--- Train took %s seconds ---" % (time.time() - start_time))
start_time = time.time()
#print('Accuracy: ' + str(classifier.score(X_test.values.tolist(), y_test.values.tolist(), sample_weight=weight_generator(100, 0.8, 1))))
y_pred = classifier.predict(X_test.values.tolist())
print("--- Prediction took %s seconds ---" % (time.time() - start_time))
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))
#print("--- Score took %s seconds ---" % (time.time() - start_time))



--- Train took 151.2292833328247 seconds ---
--- Prediction took 435.1210095882416 seconds ---
Accuracy: 0.6504478681476173


1. Save confusion matrix as text (for debug purposes)

In [22]:
tmp = confusion_matrix(le.inverse_transform(y_test), le.inverse_transform(y_pred)).astype(int)
np.savetxt("generations\\cm_1.txt", tmp, fmt='%d')
tmp

array([[ 4,  0,  0, ...,  0,  1,  0],
       [ 2,  5,  0, ...,  0,  0,  0],
       [ 0,  0, 15, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  8,  0,  0],
       [ 0,  0,  0, ...,  0, 59,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])

2. Get confusion matrix contents as content_name/frequency and save it to excel manually.

In [None]:
#le.inverse_transform(y_test)
unique_elements, counts_elements = np.unique(le.inverse_transform(y_test), return_counts=True)
print(np.asarray((unique_elements, counts_elements)))
print(len(unique_elements))
print(len(counts_elements))

3. Save confusion matrix as heatmap.

In [None]:
confusion_mat = confusion_matrix(le.inverse_transform(y_test), le.inverse_transform(y_pred))
plt.figure(figsize=(42, 36))
plt.title("Confusion Matrix", y=1.05, size=15)
sbrn.heatmap(confusion_mat, annot=True, fmt="d", yticklabels=np.unique(le.inverse_transform(y_test)), xticklabels=np.unique(le.inverse_transform(y_test)))
plt.savefig("generations\\model_gen_1.png")
plt.show()

### Run - K-Fold

In [51]:
start_time = time.time()
kf = KFold(n_splits=10)

classifiers = []
y_preds = []
scores = []
train_indexes = []
test_indexes = []
X = df_processed['sentence']
y = df_processed['label']
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    train_indexes.append(train_index)
    test_indexes.append(test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    c = OneVsOneClassifier(LinearSVC(random_state=None)).fit(X_train.values.tolist(), y_train.values.tolist())
    y_pred = c.predict(X_test.values.tolist())
    score = accuracy_score(y_test, y_pred)
    print('Training took ' + str(time.time() - start_time) + ' seconds. Accuracy: ' + str(score))
    classifiers.append(c)
    y_preds.append(y_pred)
    scores.append(score)
print("--- K-Fold took %s seconds ---" % (time.time() - start_time))

TRAIN: [ 5905  5906  5907 ... 59043 59044 59045] TEST: [   0    1    2 ... 5902 5903 5904]
Training took 194.84069085121155 seconds. Accuracy: 0.6646909398814564
TRAIN: [    0     1     2 ... 59043 59044 59045] TEST: [ 5905  5906  5907 ... 11807 11808 11809]




Training took 383.7857949733734 seconds. Accuracy: 0.6496189669771381
TRAIN: [    0     1     2 ... 59043 59044 59045] TEST: [11810 11811 11812 ... 17712 17713 17714]
Training took 576.7742025852203 seconds. Accuracy: 0.636917866215072
TRAIN: [    0     1     2 ... 59043 59044 59045] TEST: [17715 17716 17717 ... 23617 23618 23619]
Training took 763.6903302669525 seconds. Accuracy: 0.655376799322608
TRAIN: [    0     1     2 ... 59043 59044 59045] TEST: [23620 23621 23622 ... 29522 29523 29524]
Training took 957.5782661437988 seconds. Accuracy: 0.8106689246401355
TRAIN: [    0     1     2 ... 59043 59044 59045] TEST: [29525 29526 29527 ... 35427 35428 35429]




Training took 1154.5746421813965 seconds. Accuracy: 0.6552074513124471
TRAIN: [    0     1     2 ... 59043 59044 59045] TEST: [35430 35431 35432 ... 41331 41332 41333]




Training took 1344.7623281478882 seconds. Accuracy: 0.38109756097560976
TRAIN: [    0     1     2 ... 59043 59044 59045] TEST: [41334 41335 41336 ... 47235 47236 47237]
Training took 1573.4004561901093 seconds. Accuracy: 0.717140921409214
TRAIN: [    0     1     2 ... 59043 59044 59045] TEST: [47238 47239 47240 ... 53139 53140 53141]




Training took 1786.1700179576874 seconds. Accuracy: 0.6551490514905149
TRAIN: [    0     1     2 ... 53139 53140 53141] TEST: [53142 53143 53144 ... 59043 59044 59045]
Training took 1978.824057817459 seconds. Accuracy: 0.4657859078590786
--- K-Fold took 1978.824057817459 seconds ---


In [52]:
classifiers[4]

OneVsOneClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None)

In [None]:
#df = pd.read_csv("pd.csv", sep=";", skipinitialspace=True, engine='python') #engine='python' required to run in ipython
df.loc[X_test.head(10).index.values.tolist()]

In [None]:
y_pred_str = le_label.inverse_transform(y_pred)
#y_pred_str.astype('U')
y_pred_str

In [64]:
len(set(le_label.inverse_transform(y_pred)))

69

## Save Results
Save findings in disk which corresponds to:
- prediction_file_generationId.csv
- model_file_generationId.csv

In [56]:
generation_id = 1 # Change this line for versioning

#pred_file = 'generations\\prediction_' + str(generation_id) +'.csv'
#model_file = 'generations\\gen_2\\model_gen_' + str(generation_id) +'_0_75803995341754.pkl'
for i in range(10): # <<<- K Fold
    model_file = 'generations\\gen_2\\model_gen_' + str(i) +'_' + str(scores[i]) + '.pkl'
    with open(model_file, 'wb') as fid:
        cPickle.dump(classifiers[i], fid)

#tmp = df.loc[X_test.index.values.tolist()].copy()
#tmp['label-pred'] = y_pred_str
#tmp.to_csv(pred_file, encoding='ISO-8859-9', index=False, sep=';')

# save the classifier
#with open(model_file, 'wb') as fid:
#    cPickle.dump(classifiers[2], fid)
'''
# load it again
with open(model_file, 'rb') as fid:
    gnb_loaded = cPickle.load(fid)
'''

"\n# load it again\nwith open(model_file, 'rb') as fid:\n    gnb_loaded = cPickle.load(fid)\n"

## Removing Noise
Remove validated incorrectly labeled data from our input. 
1. Test the accuracy of the current model and compare

In [119]:
noise = pd.read_csv("differences\\noise_2_sentences.csv", sep=";", skipinitialspace=True, engine='python') #engine='python' required to run in ipython
tmp = pd.read_csv("differences\\noise_1_sentences.csv", sep=";", skipinitialspace=True, engine='python')
noise = pd.concat([noise, tmp])
noise['sentence'] = noise['sentence'].str.replace('[{}]'.format(string.punctuation), ' ')
print('Noise data size: {}'.format(noise['sentence'].size))
df_processed = df.loc[~df['sentence'].isin(noise['sentence'].values)].copy(deep=True)
print('After noise removal data size: {}'.format(df_processed.size/3))

pred_no_noise_file = 'generations\\7_months_no_noise.csv'
df_processed.to_csv(pred_no_noise_file, encoding='ISO-8859-9', index=False, sep=';')

Noise data size: 3501
After noise removal data size: 55817.0


In [125]:
noise = pd.read_csv("differences\\noise_2_sentences.csv", sep=";", skipinitialspace=True, engine='python') #engine='python' required to run in ipython
noise['sentence'] = noise['sentence'].str.replace('[{}]'.format(string.punctuation), ' ')

X_test_no_noise = df.loc[X_test.index.values.tolist()].copy()
X_test_no_noise = X_test_no_noise[~X_test_no_noise['sentence'].isin(noise['sentence'].values)]
print('X_test_no_noise size: ' + str(X_test_no_noise.size/3))

y_test_no_noise = df.loc[y_test.index.values.tolist()].copy()
y_test_no_noise = y_test_no_noise[~y_test_no_noise['sentence'].isin(noise['sentence'].values)]
print('y_test_no_noise size: ' + str(y_test_no_noise.size/3))

#Label encoding
X_test_no_noise['label'] = le.transform(X_test_no_noise['label'].values)
X_test_no_noise['source'] = preprocessing.LabelEncoder().fit_transform(X_test_no_noise['source'].values)
y_test_no_noise['label'] = le.transform(y_test_no_noise['label'].values)
y_test_no_noise['source'] = preprocessing.LabelEncoder().fit_transform(y_test_no_noise['source'].values)

X_test_no_noise['sentence'] = convert_str_array_to_numpy_vector(model, X_test_no_noise['sentence'].values.tolist(), X_test_no_noise['source'].values.tolist(), 'Average')
y_test_no_noise['sentence'] = convert_str_array_to_numpy_vector(model, y_test_no_noise['sentence'].values.tolist(), y_test_no_noise['source'].values.tolist(), 'Average')

start_time = time.time()
y_pred_no_noise = classifier.predict(X_test_no_noise['sentence'].values.tolist())
print("--- Prediction took %s seconds ---" % (time.time() - start_time))
print('Accuracy: ' + str(accuracy_score(y_test_no_noise['label'], y_pred_no_noise)))
y_pred_no_noise_str = le.inverse_transform(y_pred_no_noise)

pred_no_noise_file = 'generations\\prediction_' + str(generation_id) +'_no_noise.csv'
tmp = X_test_no_noise
tmp['label-pred'] = y_pred_no_noise_str
tmp.to_csv(pred_file, encoding='ISO-8859-9', index=False, sep=';')

X_test_no_noise size: 5594.0
y_test_no_noise size: 5594.0




# of failed word calculation: 134
# of failed word calculation: 134
--- Prediction took 115.08740496635437 seconds ---
Accuracy: 0.6841258491240615


In [126]:
y_pred_no_noise_str = le.inverse_transform(y_pred_no_noise)

pred_no_noise_file = 'generations\\prediction_' + str(generation_id) +'_no_noise.csv'
tmp = df.loc[X_test_no_noise.index.values.tolist()].copy()

tmp['label-pred'] = y_pred_no_noise_str
tmp.to_csv(pred_no_noise_file, encoding='ISO-8859-9', index=False, sep=';')

2. Re-train a model and measure accuracy of that model

##### Prepare the data.

In [155]:
df_no_noise = df[~df['sentence'].isin(noise['sentence'].values)]
print('df_no_noise size: ' + str(df_no_noise.size/3))

df_no_noise size: 57239.0


In [156]:
start_time = time.time()
df_no_noise['label'] = le.transform(df_no_noise['label'].values)
df_no_noise['source'] = preprocessing.LabelEncoder().fit_transform(df_no_noise['source'].values)
df_no_noise['sentence'] = convert_str_array_to_numpy_vector(model, df_no_noise['sentence'].values.tolist(), df_no_noise['source'].values.tolist(), 'Average')
print("--- Data preparation took %s seconds ---" % (time.time() - start_time))
df_no_noise.head(15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


# of failed word calculation: 1296
--- Data preparation took 24.467040538787842 seconds ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,sentence,label,source
0,"[-0.13886453, -0.5717392, 0.6484746, -0.359751...",1,0
1,"[-0.21332426, -0.45715767, 0.5091378, -0.22698...",1,0
2,"[-0.088749014, -0.3895533, 0.5956384, -0.16928...",1,0
3,"[-0.2687642, -0.21038985, 0.7433767, -0.172685...",5,0
4,"[-0.25181404, -0.15667057, 0.71771836, -0.2785...",5,0
5,"[-0.1718809, -0.35072204, 0.47582346, -0.21936...",2,0
6,"[-0.22355895, -0.34208745, 0.55608106, -0.1980...",4,0
7,"[-0.20182048, -0.32203072, 0.88891727, -0.2880...",5,0
8,"[-0.3397589, -0.24134636, 0.82864255, -0.09685...",5,0
9,"[-0.13908678, -0.3037831, 0.6157272, -0.165231...",5,0


##### Split to train and test data

In [157]:
X = df_no_noise['sentence'].copy(deep=True)
y = df_no_noise['label'].copy(deep=True)
print('X.shape: ' + str(X.shape))
print('y.shape: ' + str(y.shape))
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=None)
print('X_train.shape: ' + str(X_train.shape))
print('y_train.shape: ' + str(y_train.shape))
print('X_test.shape: ' + str(X_test.shape))
print('y_test.shape: ' + str(y_test.shape))

X.shape: (57239,)
y.shape: (57239,)
X_train.shape: (42929,)
y_train.shape: (42929,)
X_test.shape: (14310,)
y_test.shape: (14310,)


##### Train and test: One Time Run

In [158]:
start_time = time.time()
classifier_no_noise = OneVsOneClassifier(LinearSVC(random_state=None))
classifier_no_noise = classifier_no_noise.fit(X_train.values.tolist(), y_train.values.tolist())
print("--- Train took %s seconds ---" % (time.time() - start_time))
start_time = time.time()
#print('Accuracy: ' + str(classifier.score(X_test.values.tolist(), y_test.values.tolist(), sample_weight=weight_generator(100, 0.8, 1))))
y_pred_no_noise = classifier_no_noise.predict(X_test.values.tolist())
print("--- Prediction took %s seconds ---" % (time.time() - start_time))
print('Accuracy: ' + str(accuracy_score(y_test, y_pred_no_noise)))
y_pred_no_noise_str = le.inverse_transform(y_pred_no_noise)

--- Train took 88.18103456497192 seconds ---
--- Prediction took 310.9050850868225 seconds ---
Accuracy: 0.6545073375262055


##### Train and test: K-Fold

## Predict External Data With Pre-Existing Classifier

In [None]:
start_time = time.time()
fp_classifier = "generations\\model_gen_1_0_75803995341754.pkl"
fp_data = "fp_data.csv"
fp_pred = "fp_pred.txt"

#
#DO NOT ALTER BELOW
#
#load pre-existing classifier from disk
with open(fp_classifier, 'rb') as fid:
    _classifier = cPickle.load(fid)
print('Classifier: ' + str(_classifier))

#load data
_df = pd.read_csv(fp_data, sep=";", skipinitialspace=True, engine='python') #engine='python' required to run in ipython
print(_df.head(10))

#prepare
_df['source'] = preprocessing.LabelEncoder().fit_transform(_df['source'].values)
_df['sentence'] = convert_str_array_to_numpy_vector(model, _df['sentence'].values.tolist(), _df['source'].values.tolist(), 'Average')
print('Data size: ' + str(_df.size / _df.columns.size))

#predict
_y_pred = _classifier.predict(_df['sentence'].values.tolist())
_y_pred = le.inverse_transform(_y_pred)
np.savetxt(fp_pred, _y_pred, delimiter=',', fmt='%s')

print("--- Prediction took %s seconds ---" % (time.time() - start_time))