In [18]:
import gc
import pickle
import time
import warnings

# Installed before the third-party imports so their import-time warnings stay hidden.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion

from avito_functions import preprocessing

%matplotlib inline

In [13]:
# input
# Read the train and test tables (train first) from ../input.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

In [14]:
# Stack train on top of test so both are processed by the same feature code.
n_train = df_train.shape[0]
df = pd.concat([df_train, df_test])
# Relabel rows 0..n-1 so every row has a unique label after concatenation.
# This used to be sized from `X`, which is not created until the modeling
# cell, so the line failed on a fresh kernel.
df.index = np.arange(df.shape[0])

# Row labels of the train / test partitions (train rows come first).
# Later cells index `df` with `traindex` / `testdex`, but nothing in the
# notebook defined them; they are recorded here while the split is known.
traindex = np.arange(n_train)
testdex = np.arange(n_train, df.shape[0])

del df_train, df_test
gc.collect()

df.shape

(2011862, 18)

In [15]:
# param
# Combine the three param_* columns into one space-separated text feature.
# Each column is stringified element-wise (a missing value becomes the literal
# 'nan', exactly as str() did in the former row-wise apply) and the columns are
# concatenated as whole Series, which avoids building a Series object per row.
param_cols = ["param_1", "param_2", "param_3"]
df['text_feat'] = (
    df[param_cols[0]].map(str)
    + ' ' + df[param_cols[1]].map(str)
    + ' ' + df[param_cols[2]].map(str)
)
# The source columns are no longer needed once they are merged.
df = df.drop(param_cols, axis=1)

In [64]:
# Text columns used for the per-column statistics and the vectorizers.
textfeats = ["description","text_feat", "title"]
# Russian stop words from the NLTK corpus, used by the TF-IDF settings.
russian_stop = set(stopwords.words('russian'))

# Keep only the text columns. The explicit copy makes `df` an independent
# frame, so the column assignments in the next cell are not writes into a
# selection derived from the full frame (pandas flags those with
# SettingWithCopyWarning, which this notebook silences).
df = df[textfeats].copy()

In [65]:
# feature engineering 
def number_of_caps(s):
    """Return how many characters of ``s`` are uppercase per ``str.isupper``."""
    upper_total = 0
    for ch in s:
        if ch.isupper():
            upper_total += 1
    return upper_total

def number_of_excl(s):
    """Return the number of exclamation marks ('!') contained in ``s``."""
    return sum(1 for ch in s if ch == '!')

for cols in textfeats:
    print(cols)

    # Normalise missing values to the empty string. Filling before the string
    # cast keeps the result independent of how astype(str) renders NaN; the
    # former fillna('nicapotato') ran after the cast and had nothing to fill.
    df[cols] = df[cols].fillna('').astype(str)
    # Remove 'nan' only where it stands as a whole token (for example the
    # 'nan' placeholders joined into text_feat). The previous plain substring
    # replace also cut 'nan' out of the middle of ordinary words.
    df[cols] = df[cols].str.replace(r'\bnan\b', '', regex=True)
    #df[cols + '_num_caps'] = df[cols].apply(number_of_caps)
    df[cols] = df[cols].str.lower()

    # Character-level statistics.
    df[cols + '_num_excl'] = df[cols].apply(number_of_excl)
    df[cols + '_num_chars'] = df[cols].str.len()
    #df[cols + '_num_caps_frac'] = df[cols + '_num_caps'] / df[cols + '_num_chars'] * 100
    # Percentages; an empty text yields 0/0 and therefore NaN.
    df[cols + '_num_excl_frac'] = df[cols + '_num_excl'] / df[cols + '_num_chars'] * 100

    # Word-level statistics on whitespace-separated tokens.
    df[cols + '_num_words'] = df[cols].apply(lambda comment: len(comment.split()))
    df[cols + '_num_unique_words'] = df[cols].apply(lambda comment: len(set(comment.split())))
    df[cols + '_words_vs_unique'] = df[cols+'_num_unique_words'] / df[cols+'_num_words'] * 100


description
text_feat
title


In [74]:
# Save every column from position 3 onward (the columns added after the
# three selected text columns).
with open('../input/text_num_features.pkl', 'wb') as f:
    pickle.dump(df.iloc[:, 3:], f)

In [75]:
# Save the first three columns (the cleaned text columns).
with open('../input/text_features.pkl', 'wb') as f:
    pickle.dump(df.iloc[:, :3], f)

In [None]:

# Keyword arguments shared by the TfidfVectorizer instances in this cell.
tfidf_para = {
    # Passed as a list: recent scikit-learn releases validate `stop_words`
    # and reject a set, while older releases accept a list just as well.
    "stop_words": sorted(russian_stop),
    "analyzer": 'word',
    # Any run of one or more word characters counts as a token.
    "token_pattern": r'\w{1,}',
    "sublinear_tf": True,
    "dtype": np.float32,
    "norm": 'l2',
    #"min_df":5,
    #"max_df":.9,
    "smooth_idf":False
}

def get_col(col_name):
    """Build a one-argument callable that returns ``x[col_name]`` for its input ``x``."""
    def pick(x):
        return x[col_name]
    return pick

# One vectorizer per text column. Each transformer receives whole records and
# uses its `preprocessor` to pull out the column it is responsible for.
vectorizer = FeatureUnion(transformer_list=[
    (
        'description',
        TfidfVectorizer(
            preprocessor=get_col('description'),
            ngram_range=(1, 2),
            max_features=16000,
            **tfidf_para
        ),
    ),
    (
        'text_feat',
        CountVectorizer(
            preprocessor=get_col('text_feat'),
            #max_features=7000,
            ngram_range=(1, 2)
        ),
    ),
    (
        'title',
        TfidfVectorizer(
            preprocessor=get_col('title'),
            #max_features=7000,
            ngram_range=(1, 2),
            **tfidf_para
        ),
    ),
])
    
start_vect = time.time()
# Fit on the training rows only, then encode every row (train and test).
vectorizer.fit(df.loc[traindex,:].to_dict('records'))
ready_df = vectorizer.transform(df.to_dict('records'))
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); use whichever this installation provides. The
# result is forced to a plain list because the modeling cell prepends the
# dense column names to it with `+`.
if hasattr(vectorizer, "get_feature_names_out"):
    tfvocab = list(vectorizer.get_feature_names_out())
else:
    tfvocab = list(vectorizer.get_feature_names())
print("Vectorization Runtime: %0.2f Minutes"%((time.time() - start_vect)/60))


In [None]:
print("Modeling Stage")
# Combine Dense Features with Sparse Text Bag of Words Features
# `df` still holds the raw text columns listed in `textfeats`. They are already
# encoded in `ready_df`, and as strings they cannot be placed in a numeric
# sparse matrix, so only the remaining columns form the dense part.
dense_feats = df.drop(textfeats, axis=1)
train_rows = traindex.shape[0]
X = hstack([
    csr_matrix(dense_feats.loc[traindex,:].values),
    ready_df[0:train_rows],
]) # Sparse Matrix
testing = hstack([
    csr_matrix(dense_feats.loc[testdex,:].values),
    ready_df[train_rows:],
])
# Feature names in the same order as the stacked columns: dense first, then text.
tfvocab = dense_feats.columns.tolist() + tfvocab
for shape in [X,testing]:
    print("{} Rows and {} Cols".format(*shape.shape))
print("Feature Names Length: ",len(tfvocab))
del df, dense_feats
gc.collect();
