In [1]:
import pandas as pd
import numpy as np 

import re
import gc 
import pickle
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from avito_functions import preprocessing

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix
from nltk.corpus import stopwords 

import time 
import pymorphy2

In [23]:
# Abbreviation -> replacement table consumed by clean().
# Kept as ordered pairs because clean() applies the entries in insertion order.
abbr_repl = dict([
    ("эт.", "этаж"),
    ("сот.", "сотка"),
    ("кг.", "килограмм"),
    ("р.", "размер"),
    ("арт.", "артикул"),
    ("art.", "артикул"),
    ("нат.", "натуральная"),
    ("натур.", "натуральная"),
    ("г.", "город"),
    ("лада", "lada"),
])

# Morphological analyzer used by clean() for lemmatization. It is created once
# at module level so that clean() does not rebuild it on every call.
ma = pymorphy2.MorphAnalyzer()


def clean(d):
    """Normalize and lemmatize one text value.

    Steps, in order: lower-case, drop the 'nan' placeholder, expand the
    abbreviations listed in ``abbr_repl``, rewrite digit separators and inch
    marks, strip punctuation (keeping in-word hyphens), collapse spaces and
    replace every token with the normal form of its first pymorphy2 parse.

    Parameters
    ----------
    d : object
        Value to clean; it is converted with ``str()`` first, so missing
        values (NaN) are accepted and come out as an empty string.

    Returns
    -------
    str
        Space-separated lemmas.
    """
    d = str(d).lower()
    # Remove the textual form of a missing value, but only when it is a
    # standalone token: a plain substring replace would also eat the "nan"
    # inside real words such as "banana" or "nano".
    d = re.sub(r"(?<!\w)nan(?!\w)", "", d)
    # Abbreviation expansion. An abbreviation must not start in the middle of
    # a word (otherwise "размер." would be hit by the "р." entry and "натур."
    # could never match), but it may directly follow a digit ("5эт.").
    # [^\W\d_] is "any letter".
    for a, r in abbr_repl.items():
        pattern = r"(?<![^\W\d_])" + re.escape(a)
        if re.search(r"\w$", a):
            # Entries that end with a word character (e.g. "лада") must not
            # continue into a longer word either.
            pattern += r"(?![^\W\d_])"
        # A callable replacement keeps the expansion text literal.
        d = re.sub(pattern, lambda _match, expansion=r: ' ' + expansion + ' ', d)
    # Fractions / dimensions: a single non-digit character between two digits
    # becomes "x".
    d = re.sub(r"([0-9])[^0-9]([0-9])", r"\1x\2", d)
    # Inches: a digit followed by ', " or * gets an "in" suffix.
    d = re.sub(r"([0-9])[\'\"\*]", r"\1in", d)
    # Replace all punctuation except "-" with a space.
    d = re.sub(r"[^\w\s\-]", r" ", d)
    # Remove "-" at the start and at the end of words.
    d = re.sub(r"(\s)[\.-](\w)", r"\1\2", d)
    d = re.sub(r"(\w)[\.-](\s)", r"\1\2", d)
    # Collapse runs of spaces.
    d = re.sub(r"( )+", r" ", d).strip()

    # Lemmatization: take the normal form of the first parse of every token.
    d = ' '.join([ma.parse(w)[0].normal_form for w in d.split()])
    return d

In [6]:
# Read the train and test tables from the input directory.

df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

In [7]:
# with open('../desc.txt', 'w', encoding='utf-8') as f:
#     for t in df_train.description.head(50000):
#         f.write(str(t).replace('\n', ' ')+'\n')

# with open('../params.txt', 'w', encoding='utf-8') as f:
#     for _, p1, p2, p3 in df_train[['param_1', 'param_2', 'param_3']].head(50000).itertuples():
#         f.write(' -- '.join(map(str, [p1,p2,p3])) + '\n' )
        
# with open('../title.txt', 'w', encoding='utf-8') as f:
#     for t in df_train.title.head(50000).apply(clean):
#         f.write(str(t)+'\n')

In [8]:
# Remember how many rows belong to train so the stacked frame can be split again.
n_train = len(df_train)

# Stack train on top of test and relabel the rows 0..len(df)-1.
df = pd.concat((df_train, df_test), axis=0)
df.index = np.arange(len(df))

# The separate frames are no longer needed; drop the references and collect.
del df_train, df_test
gc.collect()

41

In [18]:
# Build the combined param text column, clean every text column with clean()
# and derive simple length statistics from the cleaned text.

print('preproc param..')
param_cols = ["param_1", "param_2", "param_3"]
# Join the three param columns into one space-separated string per row.
# Zipping the columns gives the same str() rendering as a row-wise
# DataFrame.apply (missing values become 'nan') without building a Series
# object for every row.
df['text_feat'] = [
    ' '.join(map(str, params))
    for params in zip(*(df[c] for c in param_cols))
]

textfeats = ["description", "text_feat", "title"]
# Explicit copy: the columns added below must land in an independent frame,
# not in a slice of the original one (chained assignment).
df = df[textfeats].copy()

for cols in textfeats:
    print(cols)
    df[cols] = df[cols].astype(str).apply(clean)
    df[cols + '_num_chars'] = df[cols].apply(len)
    # Split once and reuse the token lists for both word counts.
    tokens = df[cols].apply(str.split)
    df[cols + '_num_words'] = tokens.apply(len)
    df[cols + '_num_unique_words'] = tokens.apply(lambda words: len(set(words)))
    # Share of unique words in percent; texts with no words give 0/0 -> NaN.
    df[cols + '_words_vs_unique'] = df[cols+'_num_unique_words'] / df[cols+'_num_words'] * 100

# Everything that is not one of the three text columns is a numeric feature.
# Selecting by name instead of by position keeps the two dumps correct even
# if the column order changes.
num_feats = [c for c in df.columns if c not in textfeats]

# with open('../input/text_num_features_clean.pkl', 'wb') as f: pickle.dump(obj=df.iloc[:, 3:], file=f)
# with open('../input/text_features_clean.pkl', 'wb') as f: pickle.dump(obj=df.iloc[:, :3], file=f)

with open('../input/text_num_features_lemm.pkl', 'wb') as f:
    pickle.dump(obj=df[num_feats], file=f)
with open('../input/text_features_lemm.pkl', 'wb') as f:
    pickle.dump(obj=df[textfeats], file=f)

preproc param..
description
text_feat
title


In [None]:
### TSVD tfidf 1

from sklearn.decomposition import TruncatedSVD

def train_tsvd(n, tfidf_dict):
    """Project the tf-idf matrices onto ``n`` truncated-SVD components.

    The decomposition is fitted on ``tfidf_dict['train']`` and applied to
    'valid' and 'holdout'; it is then refitted on ``tfidf_dict['fulltrain']``
    and applied to 'test'. The resulting dict is pickled to
    ``../fe/tfidf_svd<n>.pkl`` and returned.

    Parameters
    ----------
    n : int
        Number of SVD components.
    tfidf_dict : dict
        Must provide the keys 'train', 'valid', 'holdout', 'fulltrain' and
        'test'; the values are passed straight to TruncatedSVD.

    Returns
    -------
    dict
        Reduced matrices under the same five keys.
    """
    print('-- tSVD:', n)
    reducer = TruncatedSVD(n_components=n, random_state=2018)

    # Fit on train only, then reuse that fit for the validation-type splits.
    ret = {'train': reducer.fit_transform(tfidf_dict['train'])}
    for split in ('valid', 'holdout'):
        ret[split] = reducer.transform(tfidf_dict[split])

    # Refit on the full training matrix before projecting test.
    ret['fulltrain'] = reducer.fit_transform(tfidf_dict['fulltrain'])
    ret['test'] = reducer.transform(tfidf_dict['test'])

    out_path = '../fe/tfidf_svd' + str(n) + '.pkl'
    with open(out_path, 'wb') as file:
        pickle.dump(file=file, obj=ret)
    return ret

# Number of SVD components to keep.
n = 20

# Load the tf-idf matrices that train_tsvd() reads by split name.
with open('../input/tfidf_1.pkl', 'rb') as f:
    tfidf_dict = pickle.load(f)

fe_tfidf_svd = train_tsvd(n, tfidf_dict)