In [None]:
import pandas as pd
import numpy as np
import gc
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns                                                                                                                                                                                                                                                                                                                                                                                                        
#from matplotlib_venn import venn2, venn2_circles
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import scipy
import lightgbm as lgb
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import KFold


# Apply seaborn's default styling to all subsequent matplotlib figures.
sns.set()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Aggregated feature table; the next cell joins it onto train/test by 'user_id'.
gp = pd.read_csv('aggregated_features.csv')

In [None]:
# Read both splits and left-join the aggregated features on 'user_id'.
train, test = (
    pd.read_csv(path).merge(gp, on='user_id', how='left')
    for path in ('train.csv', 'test.csv')
)

# Every column of the aggregate table except its first one.
agg_cols = gp.columns.tolist()[1:]

del gp
gc.collect()

train.head()

In [None]:
# Region/city level features; the 'city_region' column is dropped right after loading.
city_region_unique = pd.read_csv("avito_region_city_features.csv").drop('city_region', axis=1)

In [None]:
# Left-join the region/city features onto both splits using the (region, city) pair.
geo_merge = dict(how="left", on=["region", "city"])
train = train.merge(city_region_unique, **geo_merge)
test = test.merge(city_region_unique, **geo_merge)


In [None]:
def count(l1, l2):
    """Return how many items of ``l1`` are members of ``l2``."""
    return sum(1 for x in l1 if x in l2)


# Built once here instead of once per row inside the apply below.
punctuation_chars = set(string.punctuation)

for df in [train, test]:
    # Assign the filled column back: a chained `df[col].fillna(..., inplace=True)`
    # does not update the frame under pandas copy-on-write.
    df['description'] = df['description'].fillna('unknowndescription')
    df['title'] = df['title'].fillna('unknowntitle')

    # Day of week (Monday=0 ... Sunday=6). This column used to be filled with
    # `.dt.day`, i.e. the day of the month, which contradicts its name.
    df['weekday'] = pd.to_datetime(df['activation_date']).dt.weekday

    for col in ['description', 'title']:
        df['num_words_' + col] = df[col].apply(lambda comment: len(comment.split()))
        df['num_unique_words_' + col] = df[col].apply(lambda comment: len(set(comment.split())))

    # Percentage of distinct words among all words (NaN when a text has no words).
    df['words_vs_unique_title'] = df['num_unique_words_title'] / df['num_words_title'] * 100
    df['words_vs_unique_description'] = df['num_unique_words_description'] / df['num_words_description'] * 100

    # From here on 'city' holds "<region>_<city>".
    df['city'] = df['region'] + '_' + df['city']
    df['num_desc_punct'] = df['description'].apply(lambda x: count(x, punctuation_chars))

    # Log-price; rows that end up NaN are flagged with -999.
    df["price"] = np.log(df["price"] + 0.001).fillna(-999)
    df["image_top_1"] = df["image_top_1"].fillna(-999)
    df["category_parent_name"] = df["category_name"] + ' ' + df['parent_category_name']

    # Rows with no match in the aggregated-features join get -1.
    for col in agg_cols:
        df[col] = df[col].fillna(-1)

In [None]:
russian_stopwords = stopwords.words('russian')

# Title: raw term counts, keeping terms that occur in at least 25 documents.
count_vectorizer_title = CountVectorizer(stop_words=russian_stopwords, lowercase=True, min_df=25)

# Fit on train and test stacked together. pd.concat is used because
# Series.append was removed in pandas 2.0.
title_counts = count_vectorizer_title.fit_transform(pd.concat([train['title'], test['title']]))

# The first len(train) rows of the stacked matrix belong to train, the rest to test.
train_title_counts = title_counts[:len(train)]
test_title_counts = title_counts[len(train):]


# Description: TF-IDF over unigrams and bigrams, capped at 15000 features.
count_vectorizer_desc = TfidfVectorizer(stop_words=russian_stopwords,
                                        lowercase=True, ngram_range=(1, 2),
                                        max_features=15000)

desc_counts = count_vectorizer_desc.fit_transform(pd.concat([train['description'], test['description']]))

train_desc_counts = desc_counts[:len(train)]
test_desc_counts = desc_counts[len(train):]

train_title_counts.shape, train_desc_counts.shape

In [None]:
# NOTE(review): `train_eng` / `test_eng` are not created by any visible cell;
# they must already exist (with 'en_title' and 'en_desc' columns) when this runs.
english_stopwords = stopwords.words('english')

# Translated title: raw term counts, keeping terms found in at least 25 documents.
count_vectorizer_eng_title = CountVectorizer(stop_words=english_stopwords, lowercase=True, min_df=25)

# pd.concat is used because Series.append was removed in pandas 2.0.
title_counts = count_vectorizer_eng_title.fit_transform(pd.concat([train_eng['en_title'], test_eng['en_title']]))

# The first len(train_eng) rows of the stacked matrix belong to train, the rest to test.
train_eng_title_counts = title_counts[:len(train_eng)]
test_eng_title_counts = title_counts[len(train_eng):]


# Translated description: TF-IDF over unigrams and bigrams, capped at 15000 features.
count_vectorizer_eng_desc = TfidfVectorizer(stop_words=english_stopwords,
                                            lowercase=True, ngram_range=(1, 2),
                                            max_features=15000)

desc_counts = count_vectorizer_eng_desc.fit_transform(pd.concat([train_eng['en_desc'], test_eng['en_desc']]))

train_eng_desc_counts = desc_counts[:len(train_eng)]
test_eng_desc_counts = desc_counts[len(train_eng):]

train_eng_title_counts.shape, train_eng_desc_counts.shape

In [None]:
# Name of the label column.
target = 'deal_probability'

# Numeric feature columns.
predictors = [
    'num_desc_punct',
    'words_vs_unique_description',
    'num_unique_words_description',
    'num_unique_words_title',
    'num_words_description',
    'num_words_title',
    'avg_times_up_user',
    'avg_days_up_user',
    'n_user_items',
    'price',
    'item_seq_number',
    'num',
]

# Categorical feature columns.
categorical = [
    'image_top_1',
    'param_1',
    'param_2',
    'param_3',
    'city',
    'region',
    'category_name',
    'parent_category_name',
    'user_type',
]

# The model consumes both groups; a new list is built so `categorical` stays independent.
predictors = [*predictors, *categorical]

In [None]:
# Drop the translated-text frames and ask the garbage collector to reclaim memory.
del train_eng, test_eng
gc.collect()

In [None]:
def to_categorical_idx(col, df_trn, df_test, drop_uniques=0):
    """Integer-encode column ``col`` consistently across two frames.

    The column values of ``df_trn`` and ``df_test`` are stacked and factorized
    together. When ``drop_uniques`` is non-zero, every value that occurs fewer
    than ``drop_uniques`` times in the stacked data is first replaced by
    ``value * 0``, so all such values share a single code.

    Returns ``(train_codes, test_codes, uniques)``, where ``uniques`` is the
    second output of ``pd.factorize``.
    """
    n_train = df_trn[col].shape[0]
    merged = pd.concat([df_trn[col], df_test[col]])

    if drop_uniques != 0:
        levels, positions, freqs = np.unique(merged, return_inverse=True, return_counts=True)
        is_rare = freqs < drop_uniques
        # Rare levels become `level * 0`; frequent levels are kept unchanged.
        collapsed = np.select([is_rare, ~is_rare], [levels * 0, levels])
        merged = collapsed[positions]

    codes, uniques = pd.factorize(merged)
    return codes[:n_train], codes[n_train:], uniques
# Encode user_id jointly over train and test; ids seen fewer than 16 times share one code.
tr_userid, te_userid, tknzr_userid = to_categorical_idx('user_id', train, test, drop_uniques=16)

In [None]:
# Store the integer user codes as a new column and register it as a model feature.
train['trans_user_id'] = tr_userid
test['trans_user_id'] =te_userid
# NOTE(review): re-running this cell appends 'trans_user_id' to predictors again.
predictors.append('trans_user_id')
del tr_userid, te_userid, tknzr_userid
gc.collect()

In [None]:
# Geographic columns to register as model features.
c_feats = [
    'latitude',
    'longitude',
    'lat_lon_hdbscan_cluster_05_03',
    'lat_lon_hdbscan_cluster_10_03',
    'lat_lon_hdbscan_cluster_20_03',
    'region_id',
    'city_region_id',
]
predictors.extend(c_feats)

In [None]:
# The three cluster-id columns are additionally listed as categorical.
cc_feats = [
    'lat_lon_hdbscan_cluster_05_03',
    'lat_lon_hdbscan_cluster_10_03',
    'lat_lon_hdbscan_cluster_20_03',
]
categorical.extend(cc_feats)

In [None]:
# Regional income table, joined on 'region'.
income = pd.read_csv('region_income.csv')
train = train.merge(income, on='region', how='left')
test = test.merge(income, on='region', how='left')
predictors.append('income')

# Total population per city: rows sharing a city name are summed.
# (The rename that used to follow had its result discarded and did nothing;
# after reset_index the key column is already called 'city'.)
city_pop = pd.read_csv('city_population.csv')
gp_df = city_pop.groupby('city')['population'].sum().reset_index()

# NOTE(review): the feature-engineering cell rewrites train/test 'city' as
# region + '_' + city; confirm city_population.csv uses the same key format,
# otherwise this join leaves 'population' as NaN.
train = train.merge(gp_df, on='city', how='left')
test = test.merge(gp_df, on='city', how='left')
predictors.append('population')

In [None]:
# Russian region name -> English region name.
region_map = {
    "Свердловская область": "Sverdlovsk oblast",
    "Самарская область": "Samara oblast",
    "Ростовская область": "Rostov oblast",
    "Татарстан": "Tatarstan",
    "Волгоградская область": "Volgograd oblast",
    "Нижегородская область": "Nizhny Novgorod oblast",
    "Пермский край": "Perm Krai",
    "Оренбургская область": "Orenburg oblast",
    "Ханты-Мансийский АО": "Khanty-Mansi Autonomous Okrug",
    "Тюменская область": "Tyumen oblast",
    "Башкортостан": "Bashkortostan",
    "Краснодарский край": "Krasnodar Krai",
    "Новосибирская область": "Novosibirsk oblast",
    "Омская область": "Omsk oblast",
    "Белгородская область": "Belgorod oblast",
    "Челябинская область": "Chelyabinsk oblast",
    "Воронежская область": "Voronezh oblast",
    "Кемеровская область": "Kemerovo oblast",
    "Саратовская область": "Saratov oblast",
    "Владимирская область": "Vladimir oblast",
    "Калининградская область": "Kaliningrad oblast",
    "Красноярский край": "Krasnoyarsk Krai",
    "Ярославская область": "Yaroslavl oblast",
    "Удмуртия": "Udmurtia",
    "Алтайский край": "Altai Krai",
    "Иркутская область": "Irkutsk oblast",
    "Ставропольский край": "Stavropol Krai",
    "Тульская область": "Tula oblast",
}

# Regional statistics table, indexed by its first column.
regional = pd.read_csv("regional.csv", index_col=[0])

# Direct dictionary lookup: a region missing from region_map raises KeyError.
for frame in (train, test):
    frame['region_en'] = frame['region'].apply(region_map.__getitem__)

In [None]:
# One Series per regional statistic.
rDense = regional["Density_of_region(km2)"]
rRural = regional["Rural_%"]
rTime_zone = regional["Time_zone"]
rPopulation = regional["Total_population"]
rUrban = regional["Urban%"]

# Re-key every statistic by the lower-cased region name so lookups ignore case.
reg_index = np.array([name.lower() for name in regional.index])
for stat in (rDense, rRural, rTime_zone, rPopulation, rUrban):
    stat.index = reg_index

# Look each frame's statistics up from that frame's OWN regions. Previously the
# test columns were built from train's 'region_en' series, so test rows received
# the values of whichever train rows shared their position.
for frame in (train, test):
    df_region = frame["region_en"].str.lower().values

    # Label-based lookup for all rows at once (replaces a Python loop per row).
    reg_dense = rDense.loc[df_region].values
    reg_rural = rRural.loc[df_region].values
    reg_Time_zone = rTime_zone.loc[df_region].values
    reg_Population = rPopulation.loc[df_region].values
    reg_Urban = rUrban.loc[df_region].values

    frame["reg_dense"] = reg_dense
    frame["rural"] = reg_rural
    frame["reg_Time_zone"] = reg_Time_zone
    frame["reg_Population"] = reg_Population
    frame["reg_Urban"] = reg_Urban

In [None]:
# Release the regional lookup Series and temporary arrays now that the columns are stored.
del reg_dense,reg_rural,reg_Time_zone,reg_Population,reg_Urban,rDense,rRural,rTime_zone
del rPopulation,rUrban,reg_index,regional
gc.collect()

In [None]:
# Register the regional demographic columns as model features.
demo_feats = ['reg_dense', 'rural', 'reg_Time_zone', 'reg_Population', 'reg_Urban']
predictors.extend(demo_feats)

In [None]:
# The time zone column is also listed among the categorical features.
categorical.append('reg_Time_zone')

In [None]:
# Divide the urban and rural columns by 100 in both splits.
for frame in (train, test):
    for pct_col in ('reg_Urban', 'rural'):
        frame[pct_col] = frame[pct_col] / 100

In [None]:
# For each key column, load "<key>_per_day_stats.csv" and join its
# mean/max/std/min columns (renamed to "<key>_<stat>") onto both splits.
get_feat_col = ['category_name', 'parent_category_name', 'city', 'region', 'user_type', 'param_1', 'param_2']
stat_names = ['mean', 'max', 'std', 'min']

for gfc in get_feat_col:
    gp = pd.read_csv(gfc + "_per_day_stats.csv", usecols=stat_names + [gfc])
    gp = gp.rename(columns={stat: gfc + "_" + stat for stat in stat_names})

    train = train.merge(gp, on=gfc, how='left')
    test = test.merge(gp, on=gfc, how='left')

    predictors.extend(gfc + '_' + stat for stat in stat_names)

In [None]:
# Drop the last per-day statistics table and reclaim memory.
del gp
gc.collect()

In [None]:
# Stack the labelled and "active" listings of both splits, then keep the first
# occurrence of every item_id.
sample_files = ['train.csv', 'train_active.csv', 'test.csv', 'test_active.csv']
all_samples = (
    pd.concat([pd.read_csv(path) for path in sample_files])
    .reset_index(drop=True)
    .drop_duplicates(['item_id'])
)
gc.collect()

In [None]:
# Attach the region/city features to the combined listing table, then release the lookup table.
all_samples = all_samples.merge(city_region_unique, how="left", on=["region", "city"])
del city_region_unique
gc.collect()

In [None]:
# Columns whose missing values are replaced with the literal 'unknown'.
categoricals = [
    'image_top_1', 'param_1', 'param_2', 'param_3',
    'city', 'region', 'category_name', 'parent_category_name', 'user_type',
]
# NOTE(review): an earlier cell already filled train/test 'image_top_1' with
# -999, so for that column only all_samples can still receive 'unknown'.
for frame in (all_samples, train, test):
    for cate in categoricals:
        frame[cate] = frame[cate].fillna('unknown')

In [None]:
# Clamp prices: negatives become 0 and anything >= 1000000 becomes 999999.
# NaN fails both comparisons and is left untouched. (The clamp-at-zero step
# used to be applied twice; the second pass could never change anything.)
clamped = all_samples['price']
clamped = clamped.mask(clamped < 0, 0)
clamped = clamped.mask(clamped >= 1000000, 999999)
all_samples['filtered_price'] = np.log(clamped + 0.001)

# Share of each item's log-price in the summed log-price of its param_2 group.
all_samples['norm_price'] = all_samples['filtered_price'] / all_samples.groupby('param_2')['filtered_price'].transform('sum')

# Cut norm_price into equal-width intervals on [0, 1] at three resolutions.
for bin_col, n_edges in (('bin', 50), ('bin_2', 1000), ('bin_3', 10000)):
    all_samples[bin_col] = pd.cut(all_samples['norm_price'], np.linspace(0.0, 1.0, num=n_edges))

# Number of items per (bin, param_2) pair, finest resolution first.
for bin_col, count_col in (('bin_3', 'price_bin3_count'),
                           ('bin_2', 'price_bin2_count'),
                           ('bin', 'price_bin_count')):
    gps = all_samples.groupby([bin_col, 'param_2'])['item_id'].count().reset_index().rename(columns={'item_id': count_col})
    all_samples = all_samples.merge(gps, on=[bin_col, 'param_2'], how='left')


In [None]:
# Join the precomputed table from price_bin_count.csv onto both splits by item_id.
final = pd.read_csv('price_bin_count.csv')
train, test = (frame.merge(final, on='item_id', how='left') for frame in (train, test))
del final
gc.collect()

In [None]:
# Keep a copy of the original 'price' and 'item_seq_number' columns under
# "<name>_temp", then turn the live columns into strings.
backup_cols = ('price', 'item_seq_number')
for frame in (train, test, all_samples):
    for col in backup_cols:
        frame[col + '_temp'] = frame[col]
    for col in backup_cols:
        frame[col] = frame[col].astype('str')

In [None]:
# Cast the three param columns to strings in every frame.
feats = ['param_1', 'param_2', 'param_3']
for frame in (train, test, all_samples):
    for ft in feats:
        frame[ft] = frame[ft].astype('str')

In [None]:
# Label-encode image_top_1 with classes learned from all_samples, then store
# the column as strings in every frame.
# NOTE(review): LabelEncoder.transform raises on labels it did not see during
# fit; confirm every train/test value also occurs in all_samples.
categoricals = ['image_top_1']
for feature in categoricals:
    print(f'Transforming {feature}...')
    encoder = LabelEncoder().fit(all_samples[feature].astype(str))
    for frame in (train, test):
        frame[feature] = encoder.transform(frame[feature].astype(str))

for frame in (train, test, all_samples):
    frame['image_top_1'] = frame['image_top_1'].astype('str')

In [None]:
import os

# Feature i is described by (selcols, QQ): all_samples is grouped by
# selcols[:-1], selcols[-1] is aggregated, and the result is stored as 'X<i>'.
# QQ picks the aggregate: 0=count, 1=mean, 2=var, 3=skew, 4=nunique.
feature_specs = [
    (['item_seq_number', 'param_1', 'param_2', 'param_3', 'item_id'], 0),                         # X0
    (['item_seq_number', 'param_3', 'category_name', 'parent_category_name', 'item_id'], 0),      # X1
    (['region', 'city', 'category_name', 'bin_3', 'item_id'], 0),                                 # X2
    (['price', 'param_3', 'item_id'], 0),                                                         # X3
    (['price', 'param_1', 'param_2', 'param_3', 'item_id'], 0),                                   # X4
    (['price', 'category_name', 'parent_category_name', 'item_id'], 0),                           # X5
    (['price', 'city', 'region', 'item_id'], 0),                                                  # X6
    (['price', 'image_top_1', 'item_id'], 0),                                                     # X7
    (['price', 'bin_3', 'item_id'], 0),                                                           # X8
    (['price', 'bin_2', 'bin_3', 'category_name', 'item_id'], 0),                                 # X9
    (['image_top_1', 'bin_2', 'bin_3', 'item_id'], 0),                                            # X10
    (['image_top_1', 'city', 'region', 'item_id'], 0),                                            # X11
    (['image_top_1', 'category_name', 'parent_category_name', 'item_id'], 0),                     # X12
    (['image_top_1', 'category_name', 'parent_category_name', 'user_type', 'item_id'], 0),        # X13
    (['image_top_1', 'price', 'user_type', 'item_id'], 0),                                        # X14
    (['item_seq_number', 'param_1', 'param_2', 'param_3', 'item_id'], 0),                         # X15
    (['item_seq_number', 'param_3', 'category_name', 'parent_category_name', 'item_id'], 0),      # X16
    (['region', 'city', 'category_name', 'bin_3'], 4),                                            # X17
    (['price', 'param_3'], 4),                                                                    # X18
    (['price', 'param_1', 'param_2', 'param_3'], 4),                                              # X19
    (['price', 'category_name', 'parent_category_name'], 4),                                      # X20
    (['price', 'city', 'region'], 4),                                                             # X21
    (['price', 'image_top_1'], 4),                                                                # X22
    (['price', 'bin_3'], 4),                                                                      # X23
    (['price', 'bin_2', 'bin_3', 'category_name'], 4),                                            # X24
    (['image_top_1', 'bin_2', 'bin_3'], 4),                                                       # X25
    (['image_top_1', 'city', 'region'], 4),                                                       # X26
    (['image_top_1', 'category_name', 'parent_category_name'], 4),                                # X27
    (['image_top_1', 'category_name', 'parent_category_name', 'user_type'], 4),                   # X28
    (['lat_lon_hdbscan_cluster_05_03', 'price', 'category_name'], 4),                             # X29
    (['lat_lon_hdbscan_cluster_10_03', 'price', 'category_name'], 4),                             # X30
    (['lat_lon_hdbscan_cluster_20_03', 'price', 'category_name'], 4),                             # X31
    (['lat_lon_hdbscan_cluster_05_03', 'bin_2', 'category_name', 'item_id'], 0),                  # X32
    (['lat_lon_hdbscan_cluster_10_03', 'bin_2', 'category_name', 'item_id'], 0),                  # X33
    (['lat_lon_hdbscan_cluster_20_03', 'bin_2', 'category_name', 'item_id'], 0),                  # X34
    (['lat_lon_hdbscan_cluster_05_03', 'bin', 'parent_category_name', 'item_id'], 0),             # X35
    (['lat_lon_hdbscan_cluster_10_03', 'bin', 'parent_category_name', 'item_id'], 0),             # X36
    (['lat_lon_hdbscan_cluster_20_03', 'bin', 'parent_category_name', 'item_id'], 0),             # X37
    (['lat_lon_hdbscan_cluster_05_03', 'bin_3', 'image_top_1'], 4),                               # X38
    (['lat_lon_hdbscan_cluster_10_03', 'bin_3', 'image_top_1'], 4),                               # X39
    (['lat_lon_hdbscan_cluster_20_03', 'bin_3', 'image_top_1'], 4),                               # X40
    (['lat_lon_hdbscan_cluster_05_03', 'bin_3', 'image_top_1', 'item_id'], 0),                    # X41
    (['lat_lon_hdbscan_cluster_10_03', 'bin_3', 'image_top_1', 'item_id'], 0),                    # X42
    (['lat_lon_hdbscan_cluster_20_03', 'bin_3', 'image_top_1', 'item_id'], 0),                    # X43
    (['region', 'city', 'category_name', 'user_id'], 4),                                          # X44
    (['price', 'param_3', 'user_id'], 4),                                                         # X45
    (['price', 'param_1', 'param_2', 'param_3', 'user_id'], 4),                                   # X46
    (['price', 'category_name', 'parent_category_name', 'user_id'], 4),                           # X47
    (['price', 'lat_lon_hdbscan_cluster_05_03', 'user_id'], 4),                                   # X48
    (['price', 'image_top_1', 'user_id'], 4),                                                     # X49
    (['price', 'bin_3', 'user_id'], 4),                                                           # X50
    (['price', 'bin_2', 'bin_3', 'category_name', 'user_id'], 4),                                 # X51
    (['image_top_1', 'bin_2', 'bin_3', 'user_id'], 4),                                            # X52
    (['image_top_1', 'city', 'region', 'user_id'], 4),                                            # X53
    (['image_top_1', 'category_name', 'parent_category_name', 'user_id'], 4),                     # X54
    (['image_top_1', 'category_name', 'user_id'], 4),                                             # X55
    (['lat_lon_hdbscan_cluster_05_03', 'bin_2', 'item_id'], 0),                                   # X56
    (['lat_lon_hdbscan_cluster_10_03', 'bin_2', 'item_id'], 0),                                   # X57
    (['lat_lon_hdbscan_cluster_20_03', 'bin_2', 'item_id'], 0),                                   # X58
]

# QQ code -> name of the groupby aggregation method.
aggregators = {0: 'count', 1: 'mean', 2: 'var', 3: 'skew', 4: 'nunique'}

# Names of features whose aggregate came back empty and was therefore not merged.
empty = []
naddfeat = 59

# NOTE(review): only indices 50..58 are generated here, while a later cell
# registers X0..X58 as predictors; the lower indices must come from elsewhere.
for i in range(50, naddfeat):
    selcols, QQ = feature_specs[i]
    print('selcols',selcols,'QQ',QQ)

    keys, value_col = selcols[:-1], selcols[-1]
    feat_name = 'X' + str(i)
    filename = '_'.join(selcols) + '.csv'

    if os.path.exists(filename):
        # Cached aggregate from an earlier run.
        print('here')
        gp = pd.read_csv(filename)
        train = train.merge(gp, on=keys, how='left')
        # Previously only train was merged on this path, so test never received
        # the feature column whenever the cache file existed.
        test = test.merge(gp, on=keys, how='left')
    else:
        grouped = all_samples[selcols].groupby(by=keys)[value_col]
        gp = getattr(grouped, aggregators[QQ])().reset_index().rename(columns={value_col: feat_name})

        if gp.empty and QQ in (0, 4):
            # count / nunique produced nothing: record the name, skip the merge.
            empty.append(feat_name)
        else:
            train = train.merge(gp, on=keys, how='left')
            test = test.merge(gp, on=keys, how='left')

        # Cache non-empty aggregates for the next run.
        if not gp.empty:
            gp.to_csv(filename, index=False)

    del gp
    gc.collect()

In [None]:
# Register X0 .. X<naddfeat-1> as model features.
predictors.extend('X' + str(idx) for idx in range(naddfeat))


In [None]:
# Restore the pre-string versions of 'item_seq_number' and 'price' in train
# and test (all_samples is left with its string columns).
for frame in (train, test):
    frame['item_seq_number'] = frame['item_seq_number_temp']
    frame['price'] = frame['price_temp']

In [None]:
# Hard-coded feature names to exclude; this rebinds `empty`, replacing whatever an earlier cell stored in it.
empty = [ 'X7', 'X10', 'X11', 'X12','X2','X17','X44']
# list.remove raises ValueError when a name is absent, so each of these must be in predictors.
for i in empty:
    predictors.remove(i)
# The combined listing table is no longer needed.
del all_samples
gc.collect()

In [None]:
# Columns to label-encode; the encoder for each is fitted on train and test together.
categorical = [
    'image_top_1', 'param_1', 'param_2', 'param_3',
    'city', 'region', 'category_name', 'parent_category_name', 'user_type', 'reg_Time_zone','bin','bin_2','bin_3'
]
for feature in categorical:
    print(f'Transforming {feature}...')
    encoder = LabelEncoder()
    # pd.concat is used because Series.append was removed in pandas 2.0.
    encoder.fit(pd.concat([train[feature], test[feature]]).astype(str))

    train[feature] = encoder.transform(train[feature].astype(str))
    test[feature] = encoder.transform(test[feature].astype(str))

In [None]:
# Precomputed per-row feature tables for train and test; later cells copy columns out of them.
features = pd.read_csv('intermediate.csv')
test_features = pd.read_csv('test_intermediate.csv')

In [None]:
# Copy the NIMA / MobileNet columns into the splits. The assignment aligns on
# the row index, so the feature tables must share the splits' index.
image_feat = ['nima_mean', 'nima_std', 'mobilenet_mean', 'mobilenet_std']
for name in image_feat:
    train[name], test[name] = features[name], test_features[name]
predictors.extend(image_feat)

In [None]:
# Copy the ImageNet confidence and class columns (index-aligned) and register
# them as predictors; the class column is also listed as categorical.
image_feat = ['imagenet_conf', 'imagenet_class']
for name in image_feat:
    train[name], test[name] = features[name], test_features[name]
predictors.extend(image_feat)
categorical.append('imagenet_class')

In [None]:
# Copy the low-level image statistic columns (index-aligned) and register them as predictors.
image_feat = [
    'dullness', 'whiteness', 'average_pixel_width',
    'average_red', 'average_green', 'average_blue',
    'image_size', 'blurrness', 'width', 'height',
]
for name in image_feat:
    train[name], test[name] = features[name], test_features[name]
predictors.extend(image_feat)

In [None]:
# Replacement 'image_top_1' values for each split, read from precomputed files.
image_top_train = pd.read_csv("train_image_top_1_features.csv") 
image_top_test = pd.read_csv("test_image_top_1_features.csv")

In [None]:
# Overwrite image_top_1 with the values from the loaded files (index-aligned),
# then label-encode the column over train and test together.
train['image_top_1'] = image_top_train['image_top_1']
test['image_top_1'] = image_top_test['image_top_1']
categoricals = [
    'image_top_1'
]
for feature in categoricals:
    print(f'Transforming {feature}...')
    encoder = LabelEncoder()
    # pd.concat is used because Series.append was removed in pandas 2.0.
    encoder.fit(pd.concat([train[feature], test[feature]]).astype(str))

    train[feature] = encoder.transform(train[feature].astype(str))
    test[feature] = encoder.transform(test[feature].astype(str))

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
### TFIDF Vectorizer ###
# ngram_range=(1, 1) keeps single words only. With (1, 2) the vocabulary for
# "The old fox" would be: "The", "old", "fox", "The old", "old fox".
tfidf_vec = TfidfVectorizer(ngram_range=(1, 1))

train_titles = train['title'].values.tolist()
test_titles = test['title'].values.tolist()

# Learn the vocabulary and IDF weights on train and test titles together,
# then project each split with the fitted vectorizer.
full_tfidf = tfidf_vec.fit_transform(train_titles + test_titles)
train_tfidf = tfidf_vec.transform(train_titles)
test_tfidf = tfidf_vec.transform(test_titles)

In [None]:
### SVD Components ###
# Reduce the title TF-IDF matrix to n_comp dense columns named svd_title_1..n.
n_comp = 5
svd_title_cols = ['svd_title_' + str(k) for k in range(1, n_comp + 1)]

svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(full_tfidf)

train_svd = pd.DataFrame(svd_obj.transform(train_tfidf), columns=svd_title_cols)
test_svd = pd.DataFrame(svd_obj.transform(test_tfidf), columns=svd_title_cols)

# Column-wise concat aligns on the row index of each split.
train = pd.concat([train, train_svd], axis=1)
test = pd.concat([test, test_svd], axis=1)

del full_tfidf, train_tfidf, test_tfidf, train_svd, test_svd
predictors.extend(svd_title_cols)

In [None]:
# Rebuild the title TF-IDF matrices with the existing vectorizer: fit on train
# and test titles together, then transform each split.
train_titles = train['title'].values.tolist()
test_titles = test['title'].values.tolist()

full_tfidf = tfidf_vec.fit_transform(train_titles + test_titles)
train_tfidf = tfidf_vec.transform(train_titles)
test_tfidf = tfidf_vec.transform(test_titles)



In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Topic features: n_comp LDA components over the title TF-IDF matrix,
# stored as lda_title_1..n.
n_comp = 5
# A fixed random_state makes the topic features reproducible between runs;
# without it LDA starts from a different random initialisation every time.
lda_obj = LatentDirichletAllocation(n_components=n_comp, random_state=0)
lda_obj.fit(full_tfidf)

lda_title_cols = ['lda_title_' + str(i + 1) for i in range(n_comp)]
train_lda = pd.DataFrame(lda_obj.transform(train_tfidf), columns=lda_title_cols)
test_lda = pd.DataFrame(lda_obj.transform(test_tfidf), columns=lda_title_cols)

# Column-wise concat aligns on the row index of each split.
train = pd.concat([train, train_lda], axis=1)
test = pd.concat([test, test_lda], axis=1)

del full_tfidf, train_tfidf, test_tfidf, train_lda, test_lda
predictors.extend(lda_title_cols)

In [None]:
    
# Unigram TF-IDF over descriptions (at most 100000 terms), fitted on train and
# test together and then applied to each split.
tfidf_vec = TfidfVectorizer(ngram_range=(1, 1), max_features=100000)

train_descriptions = train['description'].values.tolist()
test_descriptions = test['description'].values.tolist()

full_tfidf = tfidf_vec.fit_transform(train_descriptions + test_descriptions)
train_tfidf = tfidf_vec.transform(train_descriptions)
test_tfidf = tfidf_vec.transform(test_descriptions)

### SVD Components ###
# Reduce the description TF-IDF matrix to n_comp dense columns svd_desc_1..n.
n_comp = 5
svd_desc_cols = ['svd_desc_' + str(k) for k in range(1, n_comp + 1)]

svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(full_tfidf)

train_svd = pd.DataFrame(svd_obj.transform(train_tfidf), columns=svd_desc_cols)
test_svd = pd.DataFrame(svd_obj.transform(test_tfidf), columns=svd_desc_cols)

train = pd.concat([train, train_svd], axis=1)
test = pd.concat([test, test_svd], axis=1)

del full_tfidf, train_tfidf, test_tfidf, train_svd, test_svd
gc.collect()
predictors.extend(svd_desc_cols)

In [None]:
class TargetEncoder:
    """Smoothed mean-target encoder for categorical columns.

    Each category is mapped to a blend of its own target mean and the global
    target mean; a sigmoid of the category count (controlled by
    ``min_samples_leaf`` and ``smoothing``) sets the blend weight. The encoded
    values are finally multiplied by ``1 + noise_level * N(0, 1)``.
    """
    # Adapted from https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features

    def __init__(self, cols, smoothing=1, min_samples_leaf=1, noise_level=0, keep_original=False):
        self.cols = cols
        self.smoothing = smoothing
        self.min_samples_leaf = min_samples_leaf
        self.noise_level = noise_level
        self.keep_original = keep_original

    def __repr__(self):
        return 'TargetEncoder'

    @staticmethod
    def add_noise(series, noise_level):
        """Scale every value by ``1 + noise_level * standard_normal``."""
        jitter = 1 + noise_level * np.random.randn(len(series))
        return series * jitter

    def encode(self, train, test, target):
        """Encode every column in ``self.cols`` on both frames, in place.

        With ``keep_original`` the result goes to ``<col>_te``; otherwise it
        overwrites ``<col>``. Returns the two (mutated) frames.
        """
        suffix = '_te' if self.keep_original else ''
        for col in self.cols:
            train[col + suffix], test[col + suffix] = self.encode_column(train[col], test[col], target)
        return train, test

    def encode_column(self, trn_series, tst_series, target):
        """Return the encoded (train, test) series for one categorical column."""
        key, label = trn_series.name, target.name
        # Global target mean: used in the blend and for categories never seen in train.
        prior = target.mean()

        # Per-category target mean and count.
        stats = pd.concat([trn_series, target], axis=1).groupby(by=key)[label].agg(["mean", "count"])
        # Sigmoid weight: the larger the count, the less the prior matters.
        weight = 1 / (1 + np.exp(-(stats["count"] - self.min_samples_leaf) / self.smoothing))
        stats[label] = prior * (1 - weight) + stats["mean"] * weight
        stats.drop(['mean', 'count'], axis=1, inplace=True)
        lookup = stats.reset_index().rename(columns={'index': label, label: 'average'})

        def project(series):
            # Left-join the per-category averages onto the series' values.
            encoded = pd.merge(
                series.to_frame(series.name),
                lookup,
                on=series.name,
                how='left')['average'].rename(key + '_mean').fillna(prior)
            # pd.merge does not keep the index, so restore it.
            encoded.index = series.index
            return encoded

        ft_trn_series = project(trn_series)
        ft_tst_series = project(tst_series)
        return self.add_noise(ft_trn_series, self.noise_level), self.add_noise(ft_tst_series, self.noise_level)

In [None]:
# Target-encode the main categorical columns into "<col>_te" and register the new columns.
f_cats = ['region', 'city', 'parent_category_name', 'category_name', 'user_type', 'image_top_1']
target_encode = TargetEncoder(
    cols=f_cats,
    smoothing=7,
    min_samples_leaf=100,
    noise_level=0.01,
    keep_original=True,
)
train, test = target_encode.encode(train, test, train['deal_probability'])
predictors.extend(name + '_te' for name in f_cats)

In [None]:
# Register the price-bin columns as model features.
price_feat = [
    'norm_price',
    'bin', 'bin_2', 'price_bin_count', 'price_bin2_count',
    'bin_3', 'price_bin3_count',
]
predictors.extend(price_feat)

In [None]:
# Mark the price-bin columns as categorical. The label-encoding cell already
# lists them in `categorical`, so only add the names that are still missing;
# appending unconditionally produced duplicate entries.
f_cats = ['bin','bin_2','bin_3']
for fc in f_cats:
    if fc not in categorical:
        categorical.append(fc)

In [None]:
# Target-encode the price bins and two param columns into "<col>_te" and register the new columns.
f_cats = ['bin', 'bin_2', 'bin_3', 'param_2', 'param_1']
target_encode = TargetEncoder(
    cols=f_cats,
    smoothing=10,
    min_samples_leaf=100,
    noise_level=0.01,
    keep_original=True,
)
train, test = target_encode.encode(train, test, train['deal_probability'])
predictors.extend(name + '_te' for name in f_cats)

In [None]:
# Flag remaining missing prices / image classes with -999 in both splits.
# The filled column is assigned back: a chained `frame[col].fillna(..., inplace=True)`
# does not update the frame under pandas copy-on-write.
for frame in (train, test):
    frame["price"] = frame["price"].fillna(-999)
    frame["image_top_1"] = frame["image_top_1"].fillna(-999)

In [None]:
rounds = 30000
early_stop_rounds = 200


def _vocabulary_names(vectorizer):
    """Return the vectorizer's feature names on old and new scikit-learn.

    ``get_feature_names`` was removed in scikit-learn 1.2 in favour of
    ``get_feature_names_out`` (available since 1.0), so prefer the new method
    and fall back to the old one.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


# Column names in the same order as the blocks stacked into x_train / x_test.
feature_names = np.hstack([
    _vocabulary_names(count_vectorizer_desc),
    _vocabulary_names(count_vectorizer_title),
    _vocabulary_names(count_vectorizer_eng_desc),
    _vocabulary_names(count_vectorizer_eng_title),
    predictors,
])
print('Number of features:', len(feature_names))

In [None]:
# Test design matrix: the four text blocks followed by the dense predictor
# columns, stacked side by side as CSR.
x_test = hstack(
    (
        test_desc_counts,
        test_title_counts,
        test_eng_desc_counts,
        test_eng_title_counts,
        test.loc[:, predictors],
    ),
    format='csr',
)

In [None]:
# Train design matrix: same block order as the test matrix, stacked as CSR.
x_train = hstack(
    (
        train_desc_counts,
        train_title_counts,
        train_eng_desc_counts,
        train_eng_title_counts,
        train[predictors],
    ),
    format='csr',
)
# Label column.
y_train = train[target]


In [None]:
# Parameter dictionary handed to xgb.train.
params = dict(
    eta=0.3,
    tree_method="hist",
    grow_policy="lossguide",
    max_leaves=1400,
    max_depth=0,
    subsample=0.9,
    colsample_bytree=0.7,
    colsample_bylevel=0.7,
    min_child_weight=0,
    alpha=4,
    objective='reg:logistic',
    eval_metric='rmse',
    random_state=99,
    silent=True,
)

In [None]:
import xgboost as xgb
from xgboost import plot_importance

# 10-fold cross-validated XGBoost: collects out-of-fold predictions for train
# and one test prediction vector per fold.
RS = 1234921940
folds = KFold(n_splits=10, shuffle=True, random_state=1020210)
oof_preds = np.zeros(x_train.shape[0])

test_predicts_list = []
np.random.seed(RS)
te_data = xgb.DMatrix(x_test)
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(x_train)):
    # KFold yields positional indices, so select the labels by position.
    trn_x, trn_y = x_train[trn_idx], y_train.iloc[trn_idx].values
    val_x, val_y = x_train[val_idx], y_train.iloc[val_idx].values

    tr_data = xgb.DMatrix(trn_x, label=trn_y)
    va_data = xgb.DMatrix(val_x, label=val_y)
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    # Up to 2000 rounds; stop once the validation metric has not improved for 50 rounds.
    model = xgb.train(params, tr_data, 2000, watchlist, maximize=False, early_stopping_rounds=50, verbose_eval=5)

    preds = model.predict(te_data)
    # Booster.predict only accepts a DMatrix; passing the raw sparse slice
    # (val_x) raises, so predict on the validation DMatrix instead.
    oof_preds[val_idx] = model.predict(va_data)
    test_predicts_list.append(preds)

    # Draw the importances on the axes created here (they were previously left
    # empty while plot_importance opened its own figure).
    fig, ax = plt.subplots(figsize=(10, 14))
    plot_importance(model, ax=ax)
    ax.set_title("XGBoost Feature Importance")
    plt.show()
    # Free the figure so ten folds do not accumulate open figures.
    plt.close(fig)

In [None]:
# Geometric mean of the per-fold test predictions: multiply them together,
# then take the n-th root.
test_predicts = np.ones(test_predicts_list[0].shape)
for fold_predict in test_predicts_list:
    test_predicts *= fold_predict

test_predicts **= (1. / len(test_predicts_list))

# The submission and save cells read `final_preds`, which no cell assigned;
# bind it to the blended predictions so those cells run on a fresh kernel.
final_preds = test_predicts

In [None]:
# Build the submission: item ids from the sample file plus predictions clipped to [0, 1].
# `final_preds` must already hold the blended test predictions when this cell runs.
subm = pd.read_csv('sample_submission.csv', usecols=['item_id'])
subm['deal_probability'] = np.clip(final_preds, 0, 1)
subm.to_csv('xgb_submission.csv', index=False)

In [None]:
# Persist the out-of-fold train predictions and the (unclipped) test predictions as .npy files.
np.save('xgb_cv_oof.npy',oof_preds)
np.save('xgb_cv_preds.npy',final_preds)