In [None]:
import re
import warnings
from math import sqrt

warnings.filterwarnings('ignore')

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from matplotlib.ticker import StrMethodFormatter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy import sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

stemmer = WordNetLemmatizer()

In [None]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop-word set, loading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE['english']


def cleaning_text(review, remove_stopwords=False, Lem=False):
    """Normalise a raw (possibly HTML) text field into lower-case words.

    Steps: strip markup with BeautifulSoup, replace every character that is
    not an ASCII letter with a space, lower-case, split on whitespace, then
    optionally drop English stop words and lemmatise each word.

    Parameters
    ----------
    review : str or None
        Raw text. ``None`` and float NaN (pandas' missing-value marker) are
        treated as empty text.
    remove_stopwords : bool, default False
        Drop words found in NLTK's English stop-word list.
    Lem : bool, default False
        Lemmatise every remaining word with the module-level ``stemmer``.

    Returns
    -------
    str
        The cleaned words joined by single spaces ('' for missing input).
    """
    # `review != review` is True only for NaN.
    if review is None or (isinstance(review, float) and review != review):
        return ''

    review_text = BeautifulSoup(review, "html.parser").get_text()
    # Raw-string pattern. No separate whitespace-collapsing pass is needed:
    # str.split() below already treats any run of whitespace as one separator.
    review_text = re.sub(r'[^a-zA-Z]', ' ', review_text)
    words = review_text.lower().split()

    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]
    if Lem:
        words = [stemmer.lemmatize(w) for w in words]  # Lemmatization

    return ' '.join(words)

In [None]:
# Competition data plus the list of extra model-feature columns.
train_orig = pd.read_csv('C:\\Kaggle\\BooksPrice\\Participants_Data\\Data_Train02.csv', encoding='ISO-8859-1')
test_orig = pd.read_csv('C:\\Kaggle\\BooksPrice\\Participants_Data\\Data_Test02.csv', encoding='ISO-8859-1')
FeatureNames = pd.read_csv('C:\\Kaggle\\BooksPrice\\Participants_Data\\FeatureNames02.csv', encoding='ISO-8859-1')

# Scored datasets written by an earlier XGB run; they supply the CV fold
# assignment (FOLD_NUM) and that model's log-price prediction.
train_other_models = pd.read_csv('C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20190930_XGB01_DS.csv', encoding='ISO-8859-1')
test_other_models = pd.read_csv('C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20190930_XGB01_DS.csv', encoding='ISO-8859-1')

train_other_models = train_other_models[['id','Price_Log_Pred','FOLD_NUM']]
test_other_models = test_other_models[['id','Price_Log_Pred']]

train = pd.merge(train_orig, train_other_models, on='id')
test = pd.merge(test_orig, test_other_models, on='id')

# pd.merge defaults to an inner join, which silently drops (or duplicates) rows
# when the ids of the two files do not match one-to-one.  Test predictions are
# later written positionally into the submission sheet, so fail here instead.
# NOTE(review): assumes the sample submission has one row per Data_Test02 row.
assert len(test) == len(test_orig), (
    "merge on 'id' changed the number of test rows: %d -> %d" % (len(test_orig), len(test))
)

# Model target: log10(Price + 1).
train['Price_Log'] = np.log10(train['Price']+1)
train.hist(column='Price_Log');

In [None]:
# Keep only the feature names (column 'x') as a plain Python list.
# The isinstance guard makes the cell safe to re-run: after the first run
# FeatureNames is already a list and indexing it with 'x' would raise.
if isinstance(FeatureNames, pd.DataFrame):
    FeatureNames = FeatureNames['x'].values.tolist()

In [None]:
# Add a cleaned copy ('<column>2') of each free-text column to both frames,
# with stop-word removal and lemmatisation switched on.
for text_col in ('Synopsis', 'Title'):
    for frame in (train, test):
        frame[text_col + '2'] = frame[text_col].apply(
            lambda raw_text: cleaning_text(raw_text, remove_stopwords=True, Lem=True)
        )

In [None]:
%%time
vectorizer1 = HashingVectorizer(n_features = 2**14, ngram_range=(1,3))
vectorizer1.fit(train['Synopsis2'])

In [None]:
%%time
vectorizer2 = HashingVectorizer(n_features = 2**14, ngram_range=(1,3))
vectorizer2.fit(train['Title2'])

In [None]:
# Side-by-side look at raw vs. cleaned titles for the first rows.
title_preview_columns = ['Title', 'Title2']
train[title_preview_columns].head()

In [None]:
%%time
test_bow_tf = vectorizer1.transform(test['Synopsis2'])
test_tfidf_tf = vectorizer2.transform(test['Title2'])
test_tf = sparse.hstack((test_bow_tf,test_tfidf_tf,sparse.csr_matrix(np.asmatrix(test[FeatureNames].values))))
print(test_tf.shape)

In [None]:
# Distinct CV fold identifiers, in ascending order.
fold_list = sorted(train['FOLD_NUM'].unique())
fold_list

In [None]:
# Base table that collects every model's out-of-fold predictions:
# row id, fold number, raw target and log-scale target.
cv_base_columns = ['id', 'FOLD_NUM', 'Price', 'Price_Log']
training_cv_predictions = train[cv_base_columns]

In [None]:
# LightGBM training parameters passed to lgb.train.
param = dict(
    learning_rate=0.01,
    max_depth=20,
    min_data_in_leaf=3,
    bagging_freq=5,
    bagging_fraction=0.95,
    feature_fraction=0.1,
    boost='gbdt',
    objective='regression',
    metric='rmse',
    seed=392,
)

In [None]:
# Out-of-fold LightGBM run: for every fold, fit on the remaining folds, score
# the held-out fold, and add that fold model's test predictions to a running sum.
fold_frames = []        # scored validation frames, one per fold
sub_data_preds = None   # running SUM of test predictions (divided by the fold count in a later cell)

for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below and must land on an
    # independent frame rather than on a slice of `train`.
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_bow_tf = vectorizer1.transform(temp_train['Synopsis2'])
    temp_val_bow_tf = vectorizer1.transform(temp_val['Synopsis2'])

    temp_train_tfidf_tf = vectorizer2.transform(temp_train['Title2'])
    temp_val_tfidf_tf = vectorizer2.transform(temp_val['Title2'])

    # Same column layout as test_tf: synopsis hashes | title hashes | FeatureNames columns.
    temp_train_tf = sparse.hstack((temp_train_bow_tf,temp_train_tfidf_tf,sparse.csr_matrix(np.asmatrix(temp_train[FeatureNames].values))))
    temp_val_tf = sparse.hstack((temp_val_bow_tf,temp_val_tfidf_tf,sparse.csr_matrix(np.asmatrix(temp_val[FeatureNames].values))))

    trn_data = lgb.Dataset(temp_train_tf, label=temp_train['Price_Log'])
    val_data = lgb.Dataset(temp_val_tf, label=temp_val['Price_Log'])

    # The round cap is effectively unlimited; early stopping on the held-out
    # fold decides the number of trees.
    # NOTE(review): the verbose_eval / early_stopping_rounds keywords were
    # replaced by callbacks in newer LightGBM releases -- confirm against the
    # installed version before upgrading.
    model = lgb.train(param, trn_data, 2000000,valid_sets = val_data, verbose_eval=50, early_stopping_rounds = 300)

    temp_val['Price_Log_Pred_LGB'] = model.predict(temp_val_tf, num_iteration=model.best_iteration)

    print("Fold RMSLE = ",sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LGB'])))

    fold_frames.append(temp_val)
    fold_test_preds = model.predict(test_tf, num_iteration=model.best_iteration)
    sub_data_preds = fold_test_preds if sub_data_preds is None else sub_data_preds + fold_test_preds

# One concatenation after the loop instead of growing the frame fold by fold.
CV_SCORED_DATA = pd.concat(fold_frames).reset_index(drop=True)

In [None]:
# Pooled out-of-fold error of the LightGBM model (RMSE between Price_Log and
# its prediction over all scored folds), and the corresponding 1 - error score.
lgb_cv_rmsle = sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB']))
print("CV RMSLE = ", lgb_cv_rmsle)
print("CV 1-RMSLE = ", 1 - lgb_cv_rmsle)

In [None]:
# Attach the out-of-fold LightGBM predictions to the stacking table, keyed on 'id'.
lgb_oof_columns = CV_SCORED_DATA[['id', 'Price_Log_Pred_LGB']]
training_cv_predictions = training_cv_predictions.merge(lgb_oof_columns, on='id')
training_cv_predictions.head()

In [None]:
# Turn the summed per-fold test predictions into their mean and store it on `test`.
n_folds = len(fold_list)
sub_data_preds = sub_data_preds / n_folds
test['Price_Log_Pred_LGB'] = sub_data_preds
test['Price_Log_Pred_LGB'].describe()

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Out-of-fold Lasso run on the same sparse design matrix as the LightGBM model:
# fit on the remaining folds, score the held-out fold, and add the fold model's
# test predictions to a running sum.
fold_frames = []        # scored validation frames, one per fold
sub_data_preds = None   # running SUM of test predictions (divided by the fold count in a later cell)

for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below and must land on an
    # independent frame rather than on a slice of `train`.
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_bow_tf = vectorizer1.transform(temp_train['Synopsis2'])
    temp_val_bow_tf = vectorizer1.transform(temp_val['Synopsis2'])

    temp_train_tfidf_tf = vectorizer2.transform(temp_train['Title2'])
    temp_val_tfidf_tf = vectorizer2.transform(temp_val['Title2'])

    # Same column layout as test_tf: synopsis hashes | title hashes | FeatureNames columns.
    temp_train_tf = sparse.hstack((temp_train_bow_tf,temp_train_tfidf_tf,sparse.csr_matrix(np.asmatrix(temp_train[FeatureNames].values))))
    temp_val_tf = sparse.hstack((temp_val_bow_tf,temp_val_tfidf_tf,sparse.csr_matrix(np.asmatrix(temp_val[FeatureNames].values))))

    # max_iter is an integer count; the float literal 1e6 is rejected by
    # scikit-learn's parameter validation in recent releases.
    # NOTE(review): `normalize` was removed from the linear models in newer
    # scikit-learn releases -- confirm against the installed version.
    lassoreg = Lasso(alpha=0.0001,normalize=True, max_iter=1000000)
    lassoreg.fit(temp_train_tf,temp_train['Price_Log'])

    temp_val['Price_Log_Pred_LASSO'] = lassoreg.predict(temp_val_tf)

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LASSO'])))

    fold_frames.append(temp_val)
    fold_test_preds = lassoreg.predict(test_tf)
    sub_data_preds = fold_test_preds if sub_data_preds is None else sub_data_preds + fold_test_preds

# One concatenation after the loop instead of growing the frame fold by fold.
CV_SCORED_DATA = pd.concat(fold_frames).reset_index(drop=True)

In [None]:
# Pooled out-of-fold error of the Lasso model (RMSE between Price_Log and its
# prediction over all scored folds), and the corresponding 1 - error score.
lasso_cv_rmsle = sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LASSO']))
print("LASSO 01 CV RMSLE = ", lasso_cv_rmsle)
print("CV 1-RMSLE = ", 1 - lasso_cv_rmsle)

In [None]:
# Attach the out-of-fold Lasso predictions to the stacking table, keyed on 'id'.
lasso_oof_columns = CV_SCORED_DATA[['id', 'Price_Log_Pred_LASSO']]
training_cv_predictions = training_cv_predictions.merge(lasso_oof_columns, on='id')
training_cv_predictions.head()

In [None]:
# Turn the summed per-fold test predictions into their mean and store it on `test`.
n_folds = len(fold_list)
sub_data_preds = sub_data_preds / n_folds
test['Price_Log_Pred_LASSO'] = sub_data_preds
test['Price_Log_Pred_LASSO'].describe()

In [None]:
# Checkpoint: write the stacking table and the scored test frame to disk.
stack_train_csv = "C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20191025_Stack24_DS.csv"
stack_test_csv = "C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20191025_Stack24_DS.csv"
training_cv_predictions.to_csv(stack_train_csv, index=False)
test.to_csv(stack_test_csv, index=False)

In [None]:
import xgboost as xgb

In [None]:
# Out-of-fold XGBoost run on the same sparse design matrix as the other models:
# fit on the remaining folds, score the held-out fold, and add the fold model's
# test predictions to a running sum.

# Hyper-parameters and the test DMatrix do not depend on the fold, so they are
# built once instead of on every iteration.
params1 = { 'seed': 501,
            'colsample_bytree': 0.1,
            'verbosity': 1,
            'subsample': 0.95,
            'learning_rate': 0.1,
            'objective': 'reg:squarederror',
            'max_depth': 10,
            'min_child_weight': 3,
            'booster': 'gbtree',
            'eval_metric': 'rmse' }
dsubmit = xgb.DMatrix(data = test_tf)

fold_frames = []        # scored validation frames, one per fold
sub_data_preds = None   # running SUM of test predictions (divided by the fold count in a later cell)

for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below and must land on an
    # independent frame rather than on a slice of `train`.
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_bow_tf = vectorizer1.transform(temp_train['Synopsis2'])
    temp_val_bow_tf = vectorizer1.transform(temp_val['Synopsis2'])

    temp_train_tfidf_tf = vectorizer2.transform(temp_train['Title2'])
    temp_val_tfidf_tf = vectorizer2.transform(temp_val['Title2'])

    # Same column layout as test_tf: synopsis hashes | title hashes | FeatureNames columns.
    temp_train_tf = sparse.hstack((temp_train_bow_tf,temp_train_tfidf_tf,sparse.csr_matrix(np.asmatrix(temp_train[FeatureNames].values))))
    temp_val_tf = sparse.hstack((temp_val_bow_tf,temp_val_tfidf_tf,sparse.csr_matrix(np.asmatrix(temp_val[FeatureNames].values))))

    dtrain = xgb.DMatrix(data = temp_train_tf, label = temp_train['Price_Log'])
    # Despite its name, `dtest` holds the held-out validation fold.
    dtest = xgb.DMatrix(data = temp_val_tf, label = temp_val['Price_Log'])

    # 'eval' is listed last on purpose: it is the held-out fold.
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]

    XGB = xgb.train(params = params1,
                    dtrain = dtrain,
                    num_boost_round = 10000,
                    evals = watchlist,
                    verbose_eval = 20,
                    early_stopping_rounds = 200)

    # The validation DMatrix is reused for scoring instead of building a second one.
    # NOTE(review): ntree_limit / best_ntree_limit were superseded by
    # iteration_range / best_iteration in newer XGBoost releases -- confirm
    # against the installed version before upgrading.
    temp_val['Price_Log_Pred_XGB'] = XGB.predict(dtest, ntree_limit = XGB.best_ntree_limit)

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_XGB'])))

    fold_frames.append(temp_val)
    fold_test_preds = XGB.predict(dsubmit, ntree_limit = XGB.best_ntree_limit)
    sub_data_preds = fold_test_preds if sub_data_preds is None else sub_data_preds + fold_test_preds

# One concatenation after the loop instead of growing the frame fold by fold.
CV_SCORED_DATA = pd.concat(fold_frames).reset_index(drop=True)

In [None]:
# Pooled out-of-fold error of the XGBoost model (RMSE between Price_Log and its
# prediction over all scored folds), and the corresponding 1 - error score.
xgb_cv_rmsle = sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_XGB']))
print("XGB 01 CV RMSLE = ", xgb_cv_rmsle)
print("CV 1-RMSLE = ", 1 - xgb_cv_rmsle)

In [None]:
# Attach the out-of-fold XGBoost predictions to the stacking table, keyed on 'id'.
xgb_oof_columns = CV_SCORED_DATA[['id', 'Price_Log_Pred_XGB']]
training_cv_predictions = training_cv_predictions.merge(xgb_oof_columns, on='id')
training_cv_predictions.head()

In [None]:
# Turn the summed per-fold test predictions into their mean and store it on `test`.
n_folds = len(fold_list)
sub_data_preds = sub_data_preds / n_folds
test['Price_Log_Pred_XGB'] = sub_data_preds
test['Price_Log_Pred_XGB'].describe()

In [None]:
# Checkpoint: write the stacking table and the scored test frame to disk.
stack_train_csv = "C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20191025_Stack24_DS.csv"
stack_test_csv = "C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20191025_Stack24_DS.csv"
training_cv_predictions.to_csv(stack_train_csv, index=False)
test.to_csv(stack_test_csv, index=False)

In [None]:
# Inputs of the stacking model: every base-model prediction column.
# Columns are selected by name rather than by position (columns[4:]) so that
#   * a change in column order cannot pull id / fold / target into the features
#   * re-running this cell after the ensemble column ('..._ENS') has been merged
#     in does not feed the ensemble its own previous output.
non_feature_columns = ('id', 'FOLD_NUM', 'Price', 'Price_Log')
feature_names_ensemble = [
    col for col in training_cv_predictions.columns
    if col not in non_feature_columns and '_ENS' not in col
]

In [None]:
import seaborn as sns

In [None]:
# Correlation between the base-model predictions and the log-price target,
# drawn as a square heatmap on a 0..1 colour scale centred at 0.5.
heatmap_columns = feature_names_ensemble + ['Price_Log']
corr = training_cv_predictions[heatmap_columns].corr()
heatmap_palette = sns.diverging_palette(20, 220, n=200)
ax = sns.heatmap(corr, vmin=0, vmax=1, center=0.5, cmap=heatmap_palette, square=True)
# Slant the x tick labels so the long column names stay readable.
rotated_labels = ax.get_xticklabels()
ax.set_xticklabels(rotated_labels, rotation=45, horizontalalignment='right');

In [None]:
# Stacking: a Ridge regression over the base models' out-of-fold predictions,
# evaluated with the same folds.  For every fold, fit on the remaining folds,
# score the held-out fold, and add the fold model's test predictions to a
# running sum.
# `Ridge` comes from sklearn.linear_model (imported in the notebook's import cell).
fold_frames = []        # scored validation frames, one per fold
sub_data_preds = None   # running SUM of test predictions (divided by the fold count in a later cell)

for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = training_cv_predictions[training_cv_predictions['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below and must land on an
    # independent frame rather than on a slice of `training_cv_predictions`.
    temp_val = training_cv_predictions[training_cv_predictions['FOLD_NUM'] == fold_num].copy()

    # NOTE(review): `normalize` was removed from the linear models in newer
    # scikit-learn releases -- confirm against the installed version.
    ridgereg_e = Ridge(alpha=0.0001,normalize=True)
    ridgereg_e.fit(temp_train[feature_names_ensemble],temp_train['Price_Log'])

    temp_val['Price_Log_Pred_LGB_ENS'] = ridgereg_e.predict(temp_val[feature_names_ensemble])

    print("Fold RMSLE = ",sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LGB_ENS'])))

    fold_frames.append(temp_val)
    # `test` carries the fold-averaged base-model predictions under the same column names.
    fold_test_preds = ridgereg_e.predict(test[feature_names_ensemble])
    sub_data_preds = fold_test_preds if sub_data_preds is None else sub_data_preds + fold_test_preds

# One concatenation after the loop instead of growing the frame fold by fold.
CV_SCORED_DATA = pd.concat(fold_frames).reset_index(drop=True)

In [None]:
# Pooled out-of-fold error of the stacked model (RMSE between Price_Log and its
# prediction over all scored folds), and the corresponding 1 - error score.
ens_cv_rmsle = sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB_ENS']))
print("LGB ENS CV RMSLE = ", ens_cv_rmsle)
print("CV 1-RMSLE = ", 1 - ens_cv_rmsle)

In [None]:
# Attach the out-of-fold ensemble predictions to the stacking table, keyed on 'id'.
ens_oof_columns = CV_SCORED_DATA[['id', 'Price_Log_Pred_LGB_ENS']]
training_cv_predictions = training_cv_predictions.merge(ens_oof_columns, on='id')
training_cv_predictions.head()

In [None]:
# Turn the summed per-fold test predictions into their mean and store it on `test`.
n_folds = len(fold_list)
sub_data_preds = sub_data_preds / n_folds
test['Price_Log_Pred_LGB_ENS'] = sub_data_preds
test['Price_Log_Pred_LGB_ENS'].describe()

In [None]:
# Final checkpoint: write the stacking table and the scored test frame to disk.
stack_train_csv = "C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20191025_Stack24_DS.csv"
stack_test_csv = "C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20191025_Stack24_DS.csv"
training_cv_predictions.to_csv(stack_train_csv, index=False)
test.to_csv(stack_test_csv, index=False)

In [None]:
# Template workbook for the submission.  `encoding` is not a read_excel
# parameter in current pandas (passing it raises TypeError), and an .xlsx
# workbook does not need a text encoding, so the argument is dropped.
submission = pd.read_excel('C:/Kaggle/BooksPrice/Participants_Data/Sample_Submission.xlsx')

In [None]:
# Undo the log10(Price + 1) target transform on the ensemble predictions.
ensemble_log_preds = test['Price_Log_Pred_LGB_ENS'].values
sub_data_preds2 = (10**ensemble_log_preds) - 1
pd.DataFrame(sub_data_preds2).describe()

In [None]:
# Fill the template's Price column with the back-transformed predictions
# (assigned positionally) and write the submission workbook.
submission_xlsx = 'C:\\Kaggle\\BooksPrice\\Submissions\\20191025_Stack24_DS.xlsx'
submission['Price'] = sub_data_preds2
submission.to_excel(submission_xlsx, index=False)