In [1]:
import pandas as pd
import csv
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from gensim.models import word2vec

In [2]:
test = pd.read_csv("./data/final.tsv", sep = "\t", header = None, quoting=csv.QUOTE_NONE)

In [3]:
test.columns = "context_id,context_2,context_1,context_0,reply_id,reply".split(",")

In [4]:
test = test.replace(np.nan, '', regex=True)

In [5]:
test['text'] = test['context_2'] + " " + test['context_1'] + ' ' + test['context_0'] + ' ' + test['reply']

In [6]:
test['text_w2v'] = test['text'].apply(lambda x: x.strip().split())

In [7]:
test['reply_w2v'] = test['reply'].apply(lambda x: x.strip().split())

In [8]:
test['question'] = test['context_2'] + " " + test['context_1'] + ' ' + test['context_0']

In [9]:
test['question_w2v'] = test['question'].apply(lambda x: x.strip().split())

In [10]:
df = pd.read_csv("./data/train.tsv", sep = "\t", header = None, quoting=csv.QUOTE_NONE)

In [11]:
df.columns = "context_id,context_2,context_1,context_0,reply_id,reply,label,confidence".split(",")

In [12]:
df = df.replace(np.nan, '', regex=True)

In [13]:
df['text'] = df['context_2'] + " " + df['context_1'] + ' ' + df['context_0'] + ' ' + df['reply']

In [14]:
df['text_w2v'] = df['text'].apply(lambda x: x.strip().split())

In [15]:
df['reply_w2v'] = df['reply'].apply(lambda x: x.strip().split())

In [16]:
df['question'] = df['context_2'] + " " + df['context_1'] + ' ' + df['context_0']

In [17]:
df['question_w2v'] = df['question'].apply(lambda x: x.strip().split())

In [18]:
def get_target(x):
    """Translate a textual label into its ordinal class id.

    'bad' maps to 0, 'neutral' to 1 and 'good' to 2. Any other value
    yields None.
    """
    for label, code in (('good', 2), ('bad', 0), ('neutral', 1)):
        if x == label:
            return code
    return None

In [19]:
df['target'] = df['label'].apply(lambda x: get_target(x))

In [20]:
y = df['target']

In [21]:
# Combine the training and test sets and train the embedding model on all of the data,
# with a window of 3 words (6 = 3*2 in total; sentences are about 10 words long) and
# 300-dimensional output vectors; the `workers` parameter sets the number of cores used.
# `target` and `confidence` are filled with -1 on the test frame, which has no such columns.
test['target'] = -1
test['confidence'] = -1
data = pd.concat([df,test],axis=0)

model = word2vec.Word2Vec(data['text_w2v'], size=300, window=3, workers=4)
# Build a dictionary mapping each word to its vector.
# `wv.vectors` is read instead of the deprecated `wv.syn0` alias.
w2v = dict(zip(model.wv.index2word, model.wv.vectors))

  if __name__ == '__main__':


In [22]:
class mean_vectorizer(object):
    """Embed a tokenised document as the plain average of its word vectors.

    Parameters
    ----------
    word2vec : mapping
        Maps a word to its embedding vector. All vectors are expected to
        share one length, which is taken from the first entry.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Read the dimensionality from the mapping that was passed in rather
        # than from a module-level variable, so the class works with any
        # embedding table.
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X):
        """No-op; returns self so that ``fit(...).transform(...)`` chains."""
        return self

    def transform(self, X):
        """Return one averaged vector per document in ``X``.

        ``X`` is an iterable of token sequences. Tokens missing from the
        embedding table are skipped; a document with no known token maps to
        a zero vector of length ``self.dim``.
        """
        rows = []
        for words in X:
            known = [self.word2vec[w] for w in words if w in self.word2vec]
            rows.append(np.mean(known, axis=0) if known else np.zeros(self.dim))
        return np.array(rows)

In [23]:
data_mean=mean_vectorizer(w2v).transform(df['question_w2v'])
data_mean.shape

(97533, 300)

In [24]:
data_mean2 = mean_vectorizer(w2v).transform(df['reply_w2v'])

In [25]:
data_mean = np.hstack([data_mean, data_mean2])

In [26]:
data_mean.shape

(97533, 600)

In [27]:
test_mean = mean_vectorizer(w2v).transform(test['question_w2v'])

In [28]:
test_mean2 = mean_vectorizer(w2v).transform(test['reply_w2v'])

In [29]:
test_mean = np.hstack([test_mean, test_mean2])

In [30]:
test_mean.shape

(104834, 600)

In [31]:
# Define a class that performs the TF-IDF-weighted embedding transform.
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

class tfidf_vectorizer(object):
    """Embed a tokenised document as the mean of its IDF-weighted word vectors.

    Parameters
    ----------
    word2vec : mapping
        Maps a word to its embedding vector. All vectors are expected to
        share one length, which is taken from the first entry.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Filled in by fit(): word -> IDF weight.
        self.word2weight = None
        # Read the dimensionality from the mapping that was passed in rather
        # than from a module-level variable.
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X):
        """Learn IDF weights from ``X``, an iterable of token sequences.

        Returns self.
        """
        # The documents are already tokenised, so the analyzer passes them
        # through unchanged.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return one weighted-average vector per document in ``X``.

        Tokens missing from the embedding table are skipped; a document with
        no known token maps to a zero vector of length ``self.dim``.
        """
        rows = []
        for words in X:
            weighted = [self.word2vec[w] * self.word2weight[w]
                        for w in words if w in self.word2vec]
            rows.append(np.mean(weighted, axis=0) if weighted
                        else np.zeros(self.dim))
        return np.array(rows)

In [39]:
tfidf = tfidf_vectorizer(w2v).fit(df['text_w2v'])

In [40]:
data_mean = tfidf.transform(df['question_w2v'])

In [41]:
data_mean2 = tfidf.transform(df['reply_w2v'])

In [42]:
test_mean = tfidf.transform(test['question_w2v'])

In [43]:
test_mean2 = tfidf.transform(test['reply_w2v'])

In [32]:
# We use the same validation scheme as in homework 4 of the ODS course.
def split(train,y,ratio):
    """Cut ``train`` and ``y`` at the same row into a leading and a trailing part.

    The cut position is ``round(train.shape[0] * ratio)``. Rows keep their
    original order; nothing is shuffled.

    Returns ``(train_head, train_tail, y_head, y_tail)``.
    """
    n_head = round(train.shape[0] * ratio)
    head, tail = slice(None, n_head), slice(n_head, None)
    return train[head, :], train[tail, :], y[head], y[tail]
# Hold out the trailing 20% of the rows for validation, then show both shapes
# and the mean target of each part.
Xtr, Xval, ytr, yval = split(data_mean, y, ratio=0.8)
(Xtr.shape, Xval.shape, ytr.mean(), yval.mean())

((78026, 600), (19507, 600), 1.1710199164381103, 1.1740400881734763)

In [34]:
from catboost import CatBoostClassifier

In [44]:
# CatBoost classifier with the 'MultiClass' loss, 250 iterations and a
# learning rate of 0.1.
ctb = CatBoostClassifier(iterations=250, learning_rate = 0.1, loss_function='MultiClass')

In [45]:
%%time
# Fit on `data_mean` and `y` (not on the Xtr/Xval split), then predict on the
# test feature matrix. `data_mean` and `test_mean` are whichever featurisation
# was computed last in the kernel.
ctb.fit(data_mean, y);
predicted = ctb.predict(test_mean)

0:	learn: -1.0665706	total: 587ms	remaining: 2m 26s
1:	learn: -1.0397323	total: 1.19s	remaining: 2m 27s
2:	learn: -1.0175476	total: 1.82s	remaining: 2m 29s
3:	learn: -0.9986223	total: 2.44s	remaining: 2m 29s
4:	learn: -0.9826308	total: 3.05s	remaining: 2m 29s
5:	learn: -0.9688914	total: 3.67s	remaining: 2m 29s
6:	learn: -0.9568110	total: 4.28s	remaining: 2m 28s
7:	learn: -0.9465151	total: 4.9s	remaining: 2m 28s
8:	learn: -0.9370151	total: 5.53s	remaining: 2m 28s
9:	learn: -0.9291999	total: 6.13s	remaining: 2m 27s
10:	learn: -0.9223522	total: 6.73s	remaining: 2m 26s
11:	learn: -0.9160967	total: 7.33s	remaining: 2m 25s
12:	learn: -0.9107832	total: 7.92s	remaining: 2m 24s
13:	learn: -0.9057472	total: 8.52s	remaining: 2m 23s
14:	learn: -0.9015536	total: 9.12s	remaining: 2m 22s
15:	learn: -0.8977391	total: 9.78s	remaining: 2m 23s
16:	learn: -0.8938831	total: 10.4s	remaining: 2m 22s
17:	learn: -0.8907033	total: 11s	remaining: 2m 21s
18:	learn: -0.8880538	total: 11.6s	remaining: 2m 20s
19:	le

154:	learn: -0.8166180	total: 1m 39s	remaining: 1m 1s
155:	learn: -0.8163682	total: 1m 40s	remaining: 1m
156:	learn: -0.8161168	total: 1m 40s	remaining: 59.8s
157:	learn: -0.8159201	total: 1m 41s	remaining: 59.1s
158:	learn: -0.8157062	total: 1m 42s	remaining: 58.4s
159:	learn: -0.8154804	total: 1m 42s	remaining: 57.7s
160:	learn: -0.8152409	total: 1m 43s	remaining: 57.1s
161:	learn: -0.8150033	total: 1m 43s	remaining: 56.4s
162:	learn: -0.8147017	total: 1m 44s	remaining: 55.8s
163:	learn: -0.8144441	total: 1m 45s	remaining: 55.3s
164:	learn: -0.8142046	total: 1m 45s	remaining: 54.6s
165:	learn: -0.8140046	total: 1m 46s	remaining: 54s
166:	learn: -0.8137374	total: 1m 47s	remaining: 53.3s
167:	learn: -0.8134532	total: 1m 48s	remaining: 52.8s
168:	learn: -0.8131617	total: 1m 48s	remaining: 52.1s
169:	learn: -0.8128750	total: 1m 49s	remaining: 51.5s
170:	learn: -0.8126494	total: 1m 49s	remaining: 50.8s
171:	learn: -0.8124162	total: 1m 50s	remaining: 50.1s
172:	learn: -0.8121970	total: 1m 

In [2]:
# Shell escape that installs setuptools with pip. The output recorded for this
# cell shows tensorboard dependency-conflict warnings.
!pip install setuptools

[31mtensorboard 1.6.0 has requirement bleach==1.5.0, but you'll have bleach 2.1.2 which is incompatible.[0m
[31mtensorboard 1.6.0 has requirement html5lib==0.9999999, but you'll have html5lib 1.0.1 which is incompatible.[0m


In [1]:
import xgboost as xgb

ImportError: No module named 'xgboost'

In [39]:
dtr = xgb.DMatrix(Xtr, label= ytr,missing = np.nan)
dval = xgb.DMatrix(Xval, label= yval,missing = np.nan)
watchlist = [(dtr, 'train'), (dval, 'eval')]
history = dict()

In [40]:
params = {
    'max_depth': 26,
    'eta': 0.025,
    'nthread': 4,
    'gamma' : 1,
    'alpha' : 1,
    'subsample': 0.85,
    'objective': 'multi:softmax',
    'seed':7,
    'num_class':3
}

In [41]:
model_new = xgb.train(params, dtr, num_boost_round=40, evals=watchlist,evals_result=history, verbose_eval=20)

[0]	train-merror:0.145298	eval-merror:0.477931
[20]	train-merror:0.031836	eval-merror:0.40811
[39]	train-merror:0.023531	eval-merror:0.404163


In [42]:
# Predict on the test features with the booster trained above. This replaces
# any earlier value of `predicted`; the submission cell reads whichever
# assignment ran last.
predicted = model_new.predict(xgb.DMatrix(test_mean))

In [57]:
from sklearn.linear_model import LogisticRegression

In [58]:
# Logistic regression fitted on `data_mean` and `y` (not on the Xtr/Xval split).
# NOTE(review): the truncated warning recorded under this cell formats
# `self.n_jobs` - presumably scikit-learn reporting that n_jobs has no effect
# with the solver in use; confirm, and drop n_jobs=-1 if so.
lr = LogisticRegression(random_state=17, n_jobs=-1).fit(data_mean, y)

  " = {}.".format(self.n_jobs))


In [59]:
# Predict test labels with the logistic regression. This replaces any earlier
# value of `predicted`; the submission cell reads whichever assignment ran last.
predicted = lr.predict(test_mean)

In [53]:
# Linear SVM with hinge loss and a fixed random_state.
svc = LinearSVC(loss='hinge', random_state=17,)

In [54]:
# Fit on `data_mean` and `y` (not on the Xtr/Xval split). As the last
# expression in the cell, the fitted estimator's repr is displayed.
svc.fit(data_mean, y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=17, tol=0.0001, verbose=0)

In [55]:
# Predict test labels with the linear SVM. This replaces any earlier value of
# `predicted`; the submission cell reads whichever assignment ran last.
predicted = svc.predict(test_mean)

In [43]:
# Attach the predicted class to each test row. `predicted` is positional, so
# `test` must still be in its original row order here. It is flattened first
# in case the model returned a column vector of shape (n, 1).
test['target'] = np.asarray(predicted).ravel().astype('int')
# Rank the replies within each context from the highest predicted class down.
# The sorted copy gets its own name: overwriting `test` would reorder its rows,
# and re-running this cell with another model's `predicted` would then assign
# predictions to the wrong rows.
submission = test.sort_values(['context_id', 'target'], ascending=[True, False])
submission[['context_id','reply_id']].to_csv('submission_3.tsv', index=False, header=False, sep='\t')