In [1]:
import pandas as pd
import numpy as np
import re


In [2]:
train_dataset = pd.read_csv('dataset/train.csv')
test_dataset = pd.read_csv('dataset/test.csv')

print('train dataset shape: ', train_dataset.shape)
print('train dataset shape: ', test_dataset.shape)
print('train columns: ', train_dataset.columns)
print('test columns: ', test_dataset.columns)

train dataset shape:  (9349, 3)
train dataset shape:  (493, 2)
train columns:  Index(['gold_label', 'sentence1', 'sentence2'], dtype='object')
test columns:  Index(['sentence1', 'sentence2'], dtype='object')


In [3]:
train_dataset.head(3)

Unnamed: 0,gold_label,sentence1,sentence2
0,contradiction,A group of men with cowboy hats are watching a...,A group of women are watching a horse race
1,entailment,A kid with his head shaved sitting and holding...,Child sits with a cigarette.
2,neutral,A woman riding a bicycle past a car and a grou...,A woman riding a bicycle near a group of her f...


In [4]:
train_dataset.iloc[8]

gold_label                                        contradiction
sentence1     Young children and parents splashing around a ...
sentence2                       There are families eating lunch
Name: 8, dtype: object

In [5]:
# any cleaning needed?
train_dataset.sentence1.str.contains(r'[^A-Za-z\-]').any()

True

In [6]:
# rem all non alphabetic characters, and lower case them
train_dataset.sentence1 = train_dataset.sentence1.apply(lambda sent: 
                                                        re.sub("[^A-Za-z\-']+", ' ', str(sent)).lower().replace("'", ''))
train_dataset.sentence2 = train_dataset.sentence2.apply(lambda sent: 
                                                        re.sub("[^A-Za-z\-']+", ' ', str(sent)).lower().replace("'", ''))

In [7]:
train_dataset.gold_label.unique()

array(['contradiction', 'entailment', 'neutral'], dtype=object)

In [8]:
# few labels, so label encoding directly
# print("before encoding:")
# print(train_dataset.gold_label[:5])
train_dataset.gold_label = train_dataset.gold_label.map({'contradiction':0, 'entailment':1, 'neutral':2})

In [9]:
# print("after encoding:")
# train_dataset.gold_label[:5]

In [10]:
train_dataset.sample(5)

Unnamed: 0,gold_label,sentence1,sentence2
1481,0,a man drags a young boy in swim trunks into th...,a man is baking a loaf of bread
1196,0,a man and a woman are talking in a park,a man and woman are talking in the aquarium
6306,0,four guys with guns and army gear,the four guys are playing hopscotch
2596,0,a man wearing a white and blue shirt cooking c...,a group of people are watching some ducks at t...
8639,1,rival female basketball teams are playing a ga...,two female basketball teams are playing a game...


loading the pre baked w2v model with weights<br>
[Download link](https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download)

In [11]:
from gensim.models import KeyedVectors
from smart_open import open

max_vocab_size = 100000

w2v_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors/GoogleNews-vectors-negative300.bin', 
                                              binary=True, limit=max_vocab_size)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [12]:
w2v_model['man']

array([ 0.32617188,  0.13085938,  0.03466797, -0.08300781,  0.08984375,
       -0.04125977, -0.19824219,  0.00689697,  0.14355469,  0.0019455 ,
        0.02880859, -0.25      , -0.08398438, -0.15136719, -0.10205078,
        0.04077148, -0.09765625,  0.05932617,  0.02978516, -0.10058594,
       -0.13085938,  0.001297  ,  0.02612305, -0.27148438,  0.06396484,
       -0.19140625, -0.078125  ,  0.25976562,  0.375     , -0.04541016,
        0.16210938,  0.13671875, -0.06396484, -0.02062988, -0.09667969,
        0.25390625,  0.24804688, -0.12695312,  0.07177734,  0.3203125 ,
        0.03149414, -0.03857422,  0.21191406, -0.00811768,  0.22265625,
       -0.13476562, -0.07617188,  0.01049805, -0.05175781,  0.03808594,
       -0.13378906,  0.125     ,  0.0559082 , -0.18261719,  0.08154297,
       -0.08447266, -0.07763672, -0.04345703,  0.08105469, -0.01092529,
        0.17480469,  0.30664062, -0.04321289, -0.01416016,  0.09082031,
       -0.00927734, -0.03442383, -0.11523438,  0.12451172, -0.02

In [13]:
embeddings = w2v_model.vectors[:max_vocab_size,:]
embeddings = np.concatenate((np.zeros((1,300)), embeddings))
embeddings.shape

(100001, 300)

In [14]:
word2id = {word:ix + 1 for ix, word in enumerate(w2v_model.index2word) if ix < max_vocab_size}

In [15]:
print('word id: {}'.format(word2id['man']))
print('word vector:', embeddings[word2id['man']])

word id: 252
word vector: [ 0.32617188  0.13085938  0.03466797 -0.08300781  0.08984375 -0.04125977
 -0.19824219  0.00689697  0.14355469  0.0019455   0.02880859 -0.25
 -0.08398438 -0.15136719 -0.10205078  0.04077148 -0.09765625  0.05932617
  0.02978516 -0.10058594 -0.13085938  0.001297    0.02612305 -0.27148438
  0.06396484 -0.19140625 -0.078125    0.25976562  0.375      -0.04541016
  0.16210938  0.13671875 -0.06396484 -0.02062988 -0.09667969  0.25390625
  0.24804688 -0.12695312  0.07177734  0.3203125   0.03149414 -0.03857422
  0.21191406 -0.00811768  0.22265625 -0.13476562 -0.07617188  0.01049805
 -0.05175781  0.03808594 -0.13378906  0.125       0.0559082  -0.18261719
  0.08154297 -0.08447266 -0.07763672 -0.04345703  0.08105469 -0.01092529
  0.17480469  0.30664062 -0.04321289 -0.01416016  0.09082031 -0.00927734
 -0.03442383 -0.11523438  0.12451172 -0.0246582   0.08544922  0.14355469
 -0.27734375  0.03662109 -0.11035156  0.13085938 -0.01721191 -0.08056641
 -0.00708008 -0.02954102  0.300

In [16]:
#Indexing the sentences words by the w2v dictionary
def preprocess_sentence(sentence, word2id, other_id=0):
    sentence = sentence.split(' ')
    sentence = np.array([word2id[c] if c in word2id else other_id for c in sentence])
    return sentence

In [17]:
train_dataset.sentence1 = train_dataset.sentence1.apply(lambda x: preprocess_sentence(x, word2id))
train_dataset.sentence2 = train_dataset.sentence2.apply(lambda x: preprocess_sentence(x, word2id))

In [18]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=29)

for train_index, test_index in split.split(train_dataset, train_dataset['gold_label']):
    train_set = train_dataset.loc[train_index]
    test_set = train_dataset.loc[test_index]

In [20]:
train_dataset['gold_label'].value_counts() / len(train_dataset)

1    0.338646
0    0.333084
2    0.328270
Name: gold_label, dtype: float64

In [21]:
train_set['gold_label'].value_counts() / len(train_set)

1    0.338682
0    0.333066
2    0.328252
Name: gold_label, dtype: float64

In [23]:
train_set.sample(2)

Unnamed: 0,gold_label,sentence1,sentence2
6737,2,"[0, 252, 2, 0, 1619, 5872, 2976, 28, 4866, 0]","[47067, 11, 2550, 0]"
9165,0,"[534, 1557, 2, 32825, 1201, 2, 32232, 587]","[0, 1557, 17088, 807, 19, 0, 2833, 0]"


In [26]:
max(train_set.sentence1.map(len)), max(train_set.sentence2.map(len))

(51, 36)

In [39]:
from keras.preprocessing.sequence import pad_sequences

# max_seq_len = 51

train_sent1 = pad_sequences(train_set.sentence1)#, maxlen=max_seq_len)
train_sent2 = pad_sequences(train_set.sentence2)#, maxlen=max_seq_len)

In [40]:
train_dataset.sample(1)

Unnamed: 0,gold_label,sentence1,sentence2
2039,2,"[135, 438, 2, 1619, 24411, 9, 12363, 6, 83, 77...","[135, 438, 20, 2560, 120, 0, 4891, 6, 25940, 1..."


In [41]:
train_set.iloc[3839]

gold_label                                                    1
sentence1     [55, 534, 271, 2, 2837, 13864, 46, 9, 12, 247,...
sentence2         [55, 810, 2, 16995, 13864, 8515, 31, 1158, 0]
Name: 5666, dtype: object

In [42]:
train_sent1.shape

(7479, 51)

In [43]:
len(train_set)

7479

In [48]:
X = np.c_[train_sent1, train_sent2]
y = train_set.gold_label.values

In [52]:
X[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,  4891, 23661,     2,     0,
        5202,   136,    12,   283,   708,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0, 40204,   109,
          12,  8285,     2,    12,  9068,     0])

In [53]:
# scalling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)



In [54]:
X_scaled[0]

array([ 0.        , -0.02047725, -0.02187414, -0.02617566, -0.03228823,
       -0.02144495, -0.03854952, -0.03129755, -0.03653224, -0.0338778 ,
       -0.0417775 , -0.02691258, -0.03622052, -0.02914081, -0.04478978,
       -0.02727775, -0.03617314, -0.04218171, -0.0417029 , -0.04470584,
       -0.04793425, -0.04454711, -0.04878459, -0.07841084, -0.06318903,
       -0.07095275, -0.08122354, -0.08529464, -0.09860963, -0.11835802,
       -0.11816616, -0.14997524, -0.1590034 , -0.17615721, -0.20210973,
       -0.20527577, -0.22176935, -0.23554356, -0.2603899 , -0.27524081,
       -0.29745527,  0.30935091,  2.16958851, -0.36794097, -0.36222745,
        0.15975106, -0.38003913, -0.317211  , -0.31239207, -0.51059836,
       -0.12865131, -0.01156398, -0.01156398, -0.01642415, -0.01489845,
       -0.01173215, -0.01598143, -0.01354744, -0.01197678, -0.01164699,
       -0.01215165, -0.01672854, -0.01166793, -0.02195524, -0.01512528,
       -0.01745712, -0.01527309, -0.02538173, -0.02496408, -0.03

In [55]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=29)
sgd_clf.fit(X_scaled, y)



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=29, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [56]:
test_sent1 = pad_sequences(test_set.sentence1)
test_sent2 = pad_sequences(test_set.sentence2)

X_test = np.c_[test_sent1, test_sent2]

X_test_scaled = scaler.fit_transform(X_test)



In [60]:
sgd_clf.predict([X_test_scaled[29]])

ValueError: X has 98 features per sample; expecting 87