# Ранжирование

[Ссылка на соревнование](https://www.kaggle.com/c/changellenge-cupit-2019-posneg)

In [0]:
import pandas as pd  
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier

import warnings
warnings.simplefilter('ignore')

## Загрузка данных

In [0]:
!mkdir ~/.kaggle
!mv kaggle.json /root/.kaggle
!kaggle competitions download -c changellenge-cupit-2019-posneg

## Подготовка данных

In [0]:
train_df = pd.read_csv('train_data.csv.zip', index_col=0)
train_df.head()

Unnamed: 0,title,text,score
0,Просьба об отмене штрафа КАСКО,Здравствуйте! 10.06.2016г. мною был заключен д...,Позитивный
1,"В целом хорошо, работали по ипотеке.","Брала ипотеку в юникоре, порадовали квалифика...",Негативный
2,Нас всегда выручает банк Уралсиб!,Знакомство с банком началось с ипотеки в 2009 ...,Позитивный
3,Абсолютно неприемлемая ситуация и ее решение,Клиент Совкомбанка с 2017 года. В феврале 2018...,Позитивный
4,Внёс не туда,Пришёл оплачивать кредит и ошибочно внёс деньг...,Позитивный


In [0]:
train_df['score'] = train_df['score'].map({'Позитивный': 1, 'Негативный': 0})
train_df['data'] = train_df['title'] + ' ' + train_df['text']
train_df.drop(['title', 'text'], axis=1, inplace=True)

train_df.rename(index=str, columns={"data": "text"}, inplace=True)
train_df.rename(index=str, columns={"score": "target"}, inplace=True)
train_df.head()

Unnamed: 0,target,text
0,1,Просьба об отмене штрафа КАСКО Здравствуйте! 1...
1,0,"В целом хорошо, работали по ипотеке. Брала ип..."
2,1,Нас всегда выручает банк Уралсиб! Знакомство с...
3,1,Абсолютно неприемлемая ситуация и ее решение К...
4,1,Внёс не туда Пришёл оплачивать кредит и ошибоч...


In [0]:
train_df.dropna(inplace=True)
train_df.reset_index(drop=True,inplace=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
target    10000 non-null int64
text      10000 non-null object
dtypes: int64(1), object(1)
memory usage: 156.3+ KB


In [0]:
x = train_df.text
y = train_df.target

In [0]:
x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=.2, random_state=7)

In [0]:
print ("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_train),
                                                                             (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,
                                                                            (len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
print ("Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_validation),
                                                                             (len(x_validation[y_validation == 0]) / (len(x_validation)*1.))*100,
                                                                            (len(x_validation[y_validation == 1]) / (len(x_validation)*1.))*100))

Train set has total 8000 entries with 25.40% negative, 74.60% positive
Validation set has total 2000 entries with 27.15% negative, 72.85% positive


## TFIDF Bag of Words

In [0]:
n_features = 150000

In [0]:
tf_idf = TfidfVectorizer(max_features=n_features,ngram_range=(1, 2))
tf_idf.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=150000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [0]:
x_train_tfidf = tf_idf.transform(x_train)
x_validation_tfidf = tf_idf.transform(x_validation)

## Создание нейронной сети

In [0]:
seed = 7
np.random.seed(seed)

In [0]:
%%time
model = Sequential()
model.add(Dense(16, activation='relu', input_dim=n_features))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(x_train_tfidf, y_train, 
          validation_data=(x_validation_tfidf, y_validation),
          batch_size=32,
          epochs=5)

Train on 8000 samples, validate on 2000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 55 s, sys: 9.73 s, total: 1min 4s
Wall time: 1min 10s


In [0]:
def create_model():
  model = Sequential()
  model.add(Dense(16, activation='relu', input_dim=n_features))
  model.add(Dropout(0.2))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
  return model

model = KerasClassifier(build_fn=create_model, epochs=5, batch_size=32, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
results = cross_val_score(model, x_train_tfidf, y_train, cv=kfold, scoring='roc_auc')
print(results.mean())

0.93269479901958


## Тренировка на всех данных

In [0]:
test_df = pd.read_csv('test_data.csv.zip', index_col=0)
test_df['data'] = test_df['title'] + ' ' + test_df['text']

test_df.drop(['title', 'text'], axis=1, inplace=True)

test_df.rename(index=str, columns={"data": "text"}, inplace=True)
test_df.rename(index=str, columns={"score": "target"}, inplace=True)
test_df.dropna(inplace=True)
test_df.reset_index(drop=True,inplace=True)

test_df.head()

Unnamed: 0,text
0,Благодарность менеджеру по продажам Хотелось б...
1,Брал рассрочку в магазине Самсунг Здравствуйте...
2,Тинькофф оказался лучшим Работаем с ними уже б...
3,Недавно открыла расчетный счет в Тинькофф Банк...
4,Волею судеб у меня появилась кредитная карта С...


In [0]:
tf_idf_full = TfidfVectorizer(max_features=n_features,ngram_range=(1, 2))
tf_idf_full.fit(train_df.text)

x_full_train_tfidf = tf_idf_full.transform(train_df.text)
x_full_test_tfidf = tf_idf_full.transform(test_df.text)

In [0]:
x_full_train_tfidf, x_full_test_tfidf

(<10000x150000 sparse matrix of type '<class 'numpy.float64'>'
 	with 1541734 stored elements in Compressed Sparse Row format>,
 <3000x150000 sparse matrix of type '<class 'numpy.float64'>'
 	with 412216 stored elements in Compressed Sparse Row format>)

In [0]:
model_full = Sequential()
model_full.add(Dense(16, activation='relu', input_dim=n_features))
model_full.add(Dropout(0.2))
model_full.add(Dense(1, activation='sigmoid'))
model_full.compile(optimizer='adam',
                   loss='binary_crossentropy',
                   metrics=['accuracy'])

model_full.fit(x_full_train_tfidf, y, 
               batch_size=32,
               epochs=5,
              )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fc90f31cfd0>

## Сохранение сети

In [0]:
model_json = model_full.to_json()
json_file = open("rank_model.json", "w")
json_file.write(model_json)
json_file.close()

In [0]:
model_full.save_weights("rank_model.h5")
print("Сохранение сети завершено")

Сохранение сети завершено


## Создание посылки

In [0]:
p = model_full.predict_proba(x_full_test_tfidf)

In [0]:
prediction = pd.DataFrame()
prediction['index'] =  [i for i in list(test_df.index)]
# append a column with sentiment
prediction['score'] = 1-p
prediction.head(10)

Unnamed: 0,index,score
0,0,0.000196
1,1,0.15971
2,2,0.01349
3,3,0.002349
4,4,0.097279
5,5,0.000635
6,6,0.244834
7,7,0.945659
8,8,0.000521
9,9,0.016909


In [0]:
# write finilased results into csv
prediction.to_csv('prediction_rank.csv', index = False, sep = ',')