In [2]:
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV 
from sklearn.grid_search import RandomizedSearchCV 

from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import StratifiedKFold

from sklearn.linear_model import LogisticRegression
import pymorphy2
import re

In [3]:
# Load the training split and replace missing free-text fields with empty
# strings so they can be concatenated and vectorised further down.
train_all = pd.read_csv("./data/train.csv")
for text_column in ('description', 'title'):
    train_all[text_column] = train_all[text_column].fillna("")

In [4]:
# Peek at the first five rows to check that the columns loaded as expected.
train_all.head(n=5)

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797


In [5]:
# Binarise the target: a deal probability of at least 0.5 becomes class "1",
# anything else (including missing values) becomes class "0". The labels are
# kept as strings.
is_likely_deal = train_all['deal_probability'] >= 0.5
train_all['deal_class'] = is_likely_deal.map({True: "1", False: "0"})

# The model input is the title followed by the description, separated by a
# single space.
train_all['text'] = train_all['title'] + " " + train_all['description']

# Keep only the two columns the text model needs.
data = train_all[['text', 'deal_class']]

In [6]:
# Preview the (text, deal_class) pairs that feed the model.
data.head(n=5)

Unnamed: 0,text,deal_class
0,"Кокоби(кокон для сна) Кокон для сна малыша,пол...",0
1,"Стойка для Одежды Стойка для одежды, под вешал...",0
2,"Philips bluray В хорошем состоянии, домашний к...",0
3,Автокресло Продам кресло от0-25кг,1
4,"ВАЗ 2110, 2003 Все вопросы по телефону.",0


In [7]:
# Class balance of the binarised target.
data['deal_class'].value_counts()

0    1321411
1     182013
Name: deal_class, dtype: int64

In [8]:
# def clean_str(string):
#     morph = pymorphy2.MorphAnalyzer()
    
#     symbols = [
#         ',', '.', '-', '*', '#', ')', '(', '/', '<', '>', ':', '+', '?', '!', '"', '"', '%', '=', '\\', '}'
#     ]
    
#     for symbol in symbols:
#         string = str(string).replace(symbol, ' ')
        
#     words = string.split()
#     normalized_words = []
    
#     for word in words:
#         normalized_words.append(morph.parse(word.strip())[0].normal_form)
        
#     string = ' '.join(normalized_words)
    
#     return string

# data['text'] = data.text.apply(clean_str)

In [9]:
# Hyper-parameter search for the text model: word-level TF-IDF features
# (uni- to tri-grams) fed into an L2-regularised logistic regression, scored
# with 3-fold stratified cross-validation.
#
# `sklearn.cross_validation` and `sklearn.grid_search` were deprecated in
# scikit-learn 0.18 and removed in 0.20, so the `sklearn.model_selection`
# equivalents are used instead. They are imported in this cell so that it
# does not depend on which variants the import cell provides.
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# The model_selection splitter is configured without the labels; GridSearchCV
# hands `y` to it during fit(). The fixed seed keeps the folds reproducible.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer='word')),
    ('classifier', LogisticRegression())
])

# Parameters are addressed as <step name>__<parameter name>.
pipeline_params = {
    'classifier__penalty': ['l2'],
    'vectorizer__ngram_range': [(1, 3)]
}

# No `scoring` is given, so the classifier's default score (accuracy) is used.
# refit=True retrains the best configuration on all of the data afterwards.
grid = GridSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4)

grid.fit(data.text.values, data.deal_class)
best = grid.best_estimator_

# NOTE(review): the value_counts() output for deal_class shows roughly 88% of
# the rows in class "0", so an accuracy around 0.88 is close to what always
# predicting "0" would give - consider a metric that accounts for imbalance.
print(
    "Accuracy (TfidfVectorizer + LogisticRegression): {}, params {}".format(grid.best_score_, grid.best_params_)
)
grid.best_score_

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed: 30.7min finished


Accuracy (TfidfVectorizer + LogisticRegression): 0.8802739613043293, params {'classifier__penalty': 'l2', 'vectorizer__ngram_range': (1, 3)}


0.8802739613043293

In [10]:
# Fit the final text model on the complete training set: first learn the
# TF-IDF vocabulary (word uni- to tri-grams) and vectorise the texts, then
# train an L2-regularised logistic regression on the resulting matrix.
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
X_vect = vectorizer.fit_transform(data.text.values)

classifier = LogisticRegression(penalty='l2')
classifier.fit(X_vect, data.deal_class)

# fit() returns the estimator itself, so `fitted` is the trained classifier.
fitted = classifier

In [11]:
import pickle

# Persist the trained classifier together with its vectoriser; both are
# required to score new text later on.
for artifact, target_path in (
    (fitted, 'text_fitted.pkl'),
    (vectorizer, 'text_vectorizer.pkl'),
):
    with open(target_path, 'wb') as f:
        pickle.dump(artifact, f)

In [12]:
import pickle


def _unpickle(path):
    """Return the object stored in the pickle file at `path`.

    Unpickling can execute arbitrary code, so only use this on files that
    were produced by this notebook.
    """
    with open(path, 'rb') as input_file:
        return pickle.load(input_file)


# Restore the classifier and vectoriser from disk.
fitted = _unpickle('text_fitted.pkl')
vectorizer = _unpickle('text_vectorizer.pkl')

In [15]:
# Predict classes for the training texts. Despite the names, `X_test` holds
# the vectorised rows of `data` (built from train.csv) and `probs` holds the
# hard class labels returned by predict(), not probabilities.
X_test = vectorizer.transform(data['text'].values)
probs = fitted.predict(X_test)

In [79]:
# Inspect the predictions: predict() yields class labels ("0"/"1" strings),
# not probabilities, despite the variable name.
probs

array(['0', '0', '0', ..., '0', '0', '0'], dtype=object)

In [17]:
from sklearn.metrics import accuracy_score

# Compare the predicted labels with the true classes of `data`.
# NOTE(review): assuming `probs` still holds the predictions made for `data`,
# this is in-sample accuracy (the model was fitted on these same rows), so it
# is an optimistic estimate compared with the cross-validated score.
prod_real = data['deal_class'].values

accuracy_score(prod_real, probs)

0.89536817291728743

In [21]:
# Wrap the predicted labels in a single-column frame named 'text_class'.
text_classes = pd.DataFrame(probs, columns=['text_class'])

In [23]:
# Quick look at the first five predicted labels.
text_classes.head(n=5)

Unnamed: 0,text_class
0,0
1,0
2,0
3,0
4,0


In [24]:
# Store the predicted classes for the training rows; the row index is written
# as the first, unnamed CSV column.
text_classes.to_csv('./data/text_classes_train.csv', index=True)

In [13]:
# Build the same "title + description" text for the test split and predict
# its classes with the fitted vectoriser and classifier.
test_all = pd.read_csv("./data/test.csv")

# Fill missing values using the test frame's own columns. Assigning
# train_all's columns here would align them by row index and make the model
# score training text instead of the test ads.
test_all['description'] = test_all['description'].fillna("")
test_all['title'] = test_all['title'].fillna("")
test_all['text'] = test_all['title'] + " " + test_all['description']

# Reuse the vocabulary learned on the training data (transform, not
# fit_transform); predict() returns hard class labels, not probabilities.
X_test = vectorizer.transform(test_all.text.values)
probs = fitted.predict(X_test)

In [14]:
# Number of predictions produced (one per scored row).
probs.shape[0]

508438

In [15]:
# Wrap the predicted labels in a single-column frame named 'text_class' and
# write it to disk; the row index is written as the first, unnamed CSV column.
text_classes = pd.DataFrame(probs, columns=['text_class'])
text_classes.to_csv('./data/text_classes_test.csv', index=True)