### Load data and build PCA, SVM and XGBoost models

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the full combined feature table (every row and column) into a data frame.
# NOTE(review): the path is absolute and machine-specific.
all_features = pd.read_csv("/Users/jw/Downloads/final_combined.csv")

# Helper functions for selecting different subsets of features.  
### Three groups in total: all_similarity_features, all_count_features, len_entropy_features

In [3]:
def getAllNumericalCols(all_features):
    """
    Pick the names of the numerical feature columns by position.

    The selection is positional: the column at index 15 followed by every
    column from index 25 to the end of the frame.

    param: all_features is a data frame containing all features.
    output: list of column names of all numerical features.
    """
    names = all_features.columns.tolist()
    # Column 15 first, then the tail of the frame starting at position 25.
    selected = [names[15]]
    selected.extend(names[25:])
    return selected

In [4]:
# Keep only the columns selected by getAllNumericalCols and preview the first three rows.
all_num_features = all_features[getAllNumericalCols(all_features)]
all_num_features.head(3)

Unnamed: 0,min_levenstein_dist_brand,clean_length,title_length,desc_length,clean_terms_in_title,clean_terms_in_desc,stemmed_terms_in_title,stemmed_terms_in_desc,lemmatized_terms_in_title,lemmatized_terms_in_desc,...,jscore_query_desc,jscore_query_title,search_title_SW,search_desc_SW,NCD_query_title,num_words_in_description,num_stop_words,num_search_words,tfidf_search_common,num_attrib
0,1000,2.0,4.0,129.0,1.0,0.0,1.0,1.0,1.0,1.0,...,2.833333,0.866667,1.0,4.0,0.107077,79,0,2,1,15.0
1,1000,1.0,4.0,129.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.107077,79,0,2,0,15.0
2,0,1.0,11.0,168.0,0.0,0.0,0.0,1.0,0.0,1.0,...,2.711111,0.0,0.0,3.0,0.109091,109,1,2,1,35.0


In [5]:
def getSimilarityCols(all_num_features):
    """
    Pick the names of the similarity feature columns by position.

    The selection is positional within the numerical frame: column 0,
    columns 14 through 22, and column 26.

    param: all_num_features is a data frame containing all numerical features.
    output: list of column names of all similarity features.
    """
    names = all_num_features.columns.tolist()
    positions = [0, *range(14, 23), 26]
    return [names[pos] for pos in positions]

In [6]:
# Select the similarity columns (named by getSimilarityCols) and preview the first three rows.
all_similarity_features = all_features[getSimilarityCols(all_num_features)]
all_similarity_features.head(3)

Unnamed: 0,min_levenstein_dist_brand,jaccard_index_title,jaccard_index_desc,lcs_title,lcs_desc,jscore_query_desc,jscore_query_title,search_title_SW,search_desc_SW,NCD_query_title,tfidf_search_common
0,1000,0.166667,0.0,6,13,2.833333,0.866667,1.0,4.0,0.107077,1
1,1000,0.0,0.0,3,7,0.0,0.0,0.0,0.0,0.107077,0
2,0,0.0,0.012048,4,4,2.711111,0.0,0.0,3.0,0.109091,1


In [7]:
def getCountAndOtherCols(all_similarity_features,all_num_features):
    """
    Split the non-similarity numerical columns into count columns and the rest.

    A column is treated as a count feature when its name contains the
    substring "in" (e.g. "clean_terms_in_title"); every other remaining
    column is returned in the second list (length / entropy style features).

    Both lists keep the column order of `all_num_features`, so the resulting
    feature matrices have the same layout on every run. (Building them from
    set iteration would make the order depend on string hashing, which
    changes between interpreter sessions.)

    param all_similarity_features: data frame holding the similarity columns to exclude.
    param all_num_features: data frame holding all numerical columns.
    return: (col_has_in, len_H_features), two lists of column names.
    """
    similarity_cols = set(all_similarity_features.columns.tolist())
    # dict.fromkeys drops repeated names while preserving first-seen order.
    all_other_num_cols = [
        col for col in dict.fromkeys(all_num_features.columns.tolist())
        if col not in similarity_cols
    ]
    col_has_in = [col for col in all_other_num_cols if "in" in col]
    len_H_features = [col for col in all_other_num_cols if "in" not in col]

    return col_has_in, len_H_features
  

In [8]:
# Split the remaining numerical columns into the count group and the length/entropy group,
# then slice the matching columns out of the full feature table.
count_cols, len_h_cols = getCountAndOtherCols(all_similarity_features,all_num_features)
all_count_features = all_features[count_cols]
len_entropy_features = all_features[len_h_cols]

## Get AWS working on the NCD feature for query and description.

In [17]:
# NOTE(review): the two membership checks below are bare expressions; their results are
# neither stored nor asserted, so they do not guard the column selection that follows.
"search_term" in all_features.columns.tolist()
"product_description" in all_features.columns.tolist() 
# Copy the raw query and description text columns into their own frame.
raw_df = pd.DataFrame(all_features[["search_term","product_description"]])

### Modelling plan

Start from simple feature sets and evaluate each of them with every model, e.g. `svm.LinearSVR` or `LinearRegression`.  
Include only numerical attributes (exclude 0-4).  

In [84]:
from sklearn.svm import LinearSVR
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler

In [31]:
# The "relevance" column is the target passed as `label` to the model-running functions.
label = all_features["relevance"]

In [40]:
# Exploratory split with default settings: no random_state is given, so it differs per run.
# NOTE(review): runLinearModels and runSVMregession make their own split and do not use these.
X_train, X_test, y_train, y_test = train_test_split(all_num_features, label)

In [42]:
# Replace NaN with 0 and +/-inf with large finite values; the result is a NumPy array.
# NOTE(review): only X_train is cleaned in this cell, X_test is left untouched.
X_train = np.nan_to_num(X_train)

array([[1000.,    3.,   14., ...,    8.,    2.,   30.],
       [1000.,    3.,    9., ...,    3.,    1.,   22.],
       [1000.,    2.,   11., ...,    4.,    2.,    0.],
       ...,
       [1000.,    2.,   16., ...,    2.,    1.,    0.],
       [1000.,    4.,   10., ...,    5.,    2.,    0.],
       [   3.,    3.,   15., ...,    3.,    1.,   12.]])

In [163]:
def _fitAndScore(estimator, param_grid, X_train, y_train, X_test, y_test):
    """Grid-search `estimator` with 5-fold CV on the training split; return (fitted search, hold-out RMSE)."""
    search = GridSearchCV(estimator, n_jobs=-1, cv=5, param_grid=param_grid)
    search.fit(X_train, y_train)
    rmse = math.sqrt(mean_squared_error(y_test, search.predict(X_test)))
    return search, rmse


def runLinearModels(df,label):
    """
    Fit and score four linear regressors on one feature set.

    The data is split into train and test parts, standardized, and then
    LinearRegression, Lasso, Ridge and ElasticNet are each tuned with a
    5-fold grid search on the training part and scored by RMSE on the test part.

    param df: pandas data frame (or array) with numerical columns.
    param label: target values aligned row by row with df.
    return: dict mapping each fitted GridSearchCV object to its test RMSE, in
            insertion order LinearRegression, Lasso, Ridge, ElasticNet.
    """
    # A 1-D input is treated as a single feature column.
    features = np.asarray(df).reshape(-1, 1) if np.asarray(df).ndim == 1 else df

    X_train, X_test, y_train, y_test = train_test_split(features, label)

    # Fit the scaler on the training rows only and reuse it for the test rows,
    # so no statistics of the held-out data leak into training.
    scaler = StandardScaler()
    # StandardScaler keeps NaNs in place; nan_to_num then maps them to 0,
    # which is the column mean after standardization.
    X_train = np.nan_to_num(scaler.fit_transform(X_train))
    X_test = np.nan_to_num(scaler.transform(X_test))

    # The deprecated `normalize` option is not tuned: the inputs are already
    # standardized and current scikit-learn releases no longer accept it.
    candidates = [
        # Plain least squares: only the intercept choice is searched.
        (linear_model.LinearRegression(n_jobs=-1),
         {"fit_intercept": [True, False]}),
        # Lasso: the key parameter is alpha, searched over [0.1, 1.5].
        (linear_model.Lasso(random_state=42, tol=0.001),
         {"fit_intercept": [True, False],
          "alpha": list(np.linspace(0.1, 1.5, num=20))}),
        # Ridge: the key parameter is alpha, searched over [0.1, 4].
        (linear_model.Ridge(random_state=42, tol=0.001),
         {"fit_intercept": [True, False],
          "alpha": list(np.linspace(0.1, 4, num=40))}),
        # Elastic net: both alpha and l1_ratio are searched.
        (linear_model.ElasticNet(random_state=42, tol=0.001),
         {"fit_intercept": [True, False],
          "alpha": list(np.linspace(0.1, 1, num=20)),
          "l1_ratio": list(np.linspace(0, 1, num=10))}),
    ]

    rmse_by_model = {}
    for estimator, param_grid in candidates:
        search, rmse = _fitAndScore(estimator, param_grid, X_train, y_train, X_test, y_test)
        rmse_by_model[search] = rmse

    return rmse_by_model

In [164]:
# Fit the linear models on the length/entropy features (the smallest feature set).
# Best performer on this set was LinearRegression; the run took about 2m 38s.
linear_family = runLinearModels(len_entropy_features,label)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [165]:
# runLinearModels returns a dict of {fitted grid search: RMSE}; the RMSEs are its values,
# in the order LinearRegression, Lasso, Ridge, ElasticNet.
rmse_linear_fam1 = list(linear_family.values())

In [166]:
# Run the same linear-model comparison on the count features.

linear_family2 = runLinearModels(all_count_features,label)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [167]:
# Run the same linear-model comparison on the similarity features.
linear_family3 = runLinearModels(all_similarity_features,label)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [169]:
# Run the same linear-model comparison on all numerical features together.
linear_family4 = runLinearModels(all_num_features,label)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [170]:
def standaridize(df):
    """
    Standardize a vector or matrix with sklearn's StandardScaler.

    A 1-D input is reshaped into a single-column matrix first; any other
    input is handed to the scaler as given. Useful before SVM and L1/L2
    regularized linear regression.

    return: the standardized data as produced by StandardScaler.fit_transform.
    """
    as_array = np.array(df)
    # The scaler needs 2-D input, so a plain vector becomes one column.
    data = as_array.reshape(-1, 1) if as_array.ndim == 1 else df
    return StandardScaler().fit_transform(data)

In [171]:
def runSVMregession(df,label):
    """
    Tune and score a linear support vector regressor on one feature set.

    The data is split into train and test parts and standardized (linear SVMs
    are sensitive to feature scale), then LinearSVR is tuned with a 3-fold
    grid search over tol, C, epsilon and max_iter on the training part and
    scored by RMSE on the test part.

    param df: pandas data frame (or array) with numerical columns.
    param label: target values aligned row by row with df.
    return: [fitted GridSearchCV object, test RMSE].
    """
    # A 1-D input is treated as a single feature column.
    features = np.asarray(df).reshape(-1, 1) if np.asarray(df).ndim == 1 else df

    X_train, X_test, y_train, y_test = train_test_split(features, label)

    # Fit the scaler on the training rows only and reuse it for the test rows,
    # so no statistics of the held-out data leak into training.
    scaler = StandardScaler()
    # StandardScaler keeps NaNs in place; nan_to_num then maps them to 0,
    # which is the column mean after standardization.
    X_train = np.nan_to_num(scaler.fit_transform(X_train))
    X_test = np.nan_to_num(scaler.transform(X_test))

    svm = LinearSVR(random_state=42,max_iter=5000,C=100,epsilon=0.075)
    svm_regression_params = {"tol":[10**(-4),10**(-5)],'C':[1,10,20,100,300],"epsilon":[0.1,0.2,0.075],"max_iter":[4000,5000,10000]}
    svm_gs = GridSearchCV(svm,n_jobs=-1,cv=3,param_grid=svm_regression_params)
    svm_gs.fit(X_train,y_train)
    svm_predicted = svm_gs.predict(X_test)
    svm_rmse = math.sqrt(mean_squared_error(y_test,svm_predicted))

    return [svm_gs,svm_rmse]

In [None]:
# SVM regression on the length/entropy features; the result is [grid search, RMSE].
svm_ls = runSVMregession(len_entropy_features,label)
svm_ls

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [None]:
# SVM regression on the count features.
svm_ls2 = runSVMregession(all_count_features,label)

In [None]:
svm_ls2

In [None]:
# SVM regression on the similarity features.
svm_ls3 = runSVMregession(all_similarity_features,label)

In [None]:
svm_ls3

In [None]:
# SVM regression on all numerical features together.
svm_ls4 = runSVMregession(all_num_features,label)

In [None]:
svm_ls4

In [None]:
# Collect the SVM RMSE (second element of each result) for the four feature sets, in the
# order length/entropy, counts, similarity, all numerical.
rmse_svm = [i[1] for i in [svm_ls,svm_ls2,svm_ls3,svm_ls4]]

In [152]:
rmse_svm

[0.5562035877884243,
 0.5254573621899448,
 0.5076691880561738,
 0.5418303047801409]

In [12]:
def runXGBoost(df):
    """
    Train an XGBoost booster and predict on a held-out DMatrix (unfinished draft).

    ref: https://xgboost.readthedocs.io/en/latest/python/python_intro.html
    param: pandas data frame with numerical columns.
    return: nothing yet; the body ends after computing `preds` and `labels`.

    NOTE(review): `df` is never used, and `dtrain` / `dtest` are not defined inside this
    function, so the call only works if they exist as globals. TODO build them from df.
    NOTE(review): 'binary:logistic' is a classification objective, while the other models
    in this file regress on "relevance"; presumably a regression objective is intended. Confirm.
    """
    
    # specify parameters via map, definition are same as c++ version
    param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}

    # specify validations set to watch performance
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    # Number of boosting rounds handed to xgb.train.
    num_round = 2
    bst = xgb.train(param, dtrain, num_round, watchlist)

    # this is prediction
    preds = bst.predict(dtest)
    # Labels stored in dtest, fetched alongside the predictions (not used further yet).
    labels = dtest.get_label()
    
    

## Visualization

In [158]:
import matplotlib.pyplot as plt

In [None]:
# Plot one RMSE line per model across the four feature spaces.

features_ls = ["length&Entropy","Counts features","Similarity scores","All Numerical Features!"]
color_ls = ['green','blue','yellow','orange','red']
model_ls = ["Linear Regression","Lasso","Ridge","Elastic Net","SVM regression"]

# Each runLinearModels result is a dict whose values are the RMSEs in the order
# LinearRegression, Lasso, Ridge, ElasticNet. Regroup them into one list per model,
# with one entry per feature space (same order as features_ls).
linear_results = [linear_family, linear_family2, linear_family3, linear_family4]
rmse_lr, rmse_lasso, rmse_ridge, rmse_elNet = (
    [list(result.values())[k] for result in linear_results] for k in range(4)
)
rmse_ls = [rmse_lr,rmse_lasso,rmse_ridge,rmse_elNet,rmse_svm]

for model_name, model_rmse, line_color in zip(model_ls, rmse_ls, color_ls):
    plt.plot(features_ls, model_rmse, color=line_color, label=model_name)

plt.xlabel('Feature Spaces')
# The y axis shows the error value; the model names go in the legend.
plt.ylabel('RMSE')
plt.title('Line graph of RMSE on Various Feature Spaces')
plt.legend()
plt.show()