In [1]:
import pandas as pd
import pickle
import random
import re
from sklearn.metrics import r2_score

In [2]:
train = pd.read_csv('train.csv')

In [3]:
test = pd.read_csv('test.csv')

In [4]:
random_index = random.choices(list(test.index),k =1)

In [5]:
query_point = test.iloc[random_index].values

### Importing Preprocessing data using pickle

In [6]:
#Columns which need to be removed.
removable_columns = pickle.load(open('Removable_col','rb'))

In [7]:
#Interaction features which need to be added.
interaction_columns = pickle.load(open('Corr-features','rb'))

In [8]:
# Column names which need to be added to make interaction features
corr_columns = []
for i in interaction_columns:
    col = re.sub('_add_',' ',i)
    corr_columns.append(col.split(' '))

In [9]:
#Loading pca
pca = pickle.load(open('models/pca.sav','rb'))
#Loading pca scaler
pca_scaler = pickle.load(open('models/pca-scaler.sav','rb'))



In [10]:
#Loading classes mean used for mean encoding.
default_mean = pickle.load(open('class-means/default_mean','rb'))
train_X0 = pickle.load(open('class-means/mean_X0','rb'))
train_X1 = pickle.load(open('class-means/mean_X1','rb'))
train_X2 = pickle.load(open('class-means/mean_X2','rb'))
train_X3 = pickle.load(open('class-means/mean_X3','rb'))
train_X5 = pickle.load(open('class-means/mean_X5','rb'))
train_X6 = pickle.load(open('class-means/mean_X6','rb'))
train_X8 = pickle.load(open('class-means/mean_X8','rb'))
#Mean scaler
mean_scaler = pickle.load(open('models/mean-scaler.sav','rb'))

### Importing the best model from the models

In [11]:
mean_correlated_extratrees = pickle.load(open('models/mean_corr_pca_et_final.sav','rb'))



In [12]:
def prediction(query):
    
    query = pd.DataFrame(query,columns = test.columns)
    
    #Removing the columns
    query.drop(removable_columns,axis = 1,inplace = True)
    
    #Getting Remaining Numerical columns for pca transformation
    numeric = list(train.select_dtypes(include = 'int64').columns)
    numeric_col = []
    for i in numeric:
        if i in query.columns:
            numeric_col.append(i)
            
    query_numeric = query[numeric_col]
    query_numeric_norm = pca_scaler.transform(query_numeric)
    query_numeric_norm = pd.DataFrame(query_numeric_norm,columns = query_numeric.columns)
    
    #Mean Encoding of categorical variables
    query['X0'] = [train_X0.loc[i] if i in train_X0.index else default_mean for i in query['X0'].values]
    query['X1'] = [train_X1.loc[i] if i in train_X1.index else default_mean for i in query['X1'].values]
    query['X2'] = [train_X2.loc[i] if i in train_X2.index else default_mean for i in query['X2'].values]
    query['X3'] = [train_X3.loc[i] if i in train_X3.index else default_mean for i in query['X3'].values]
    query['X5'] = [train_X5.loc[i] if i in train_X5.index else default_mean for i in query['X5'].values]
    query['X6'] = [train_X6.loc[i] if i in train_X6.index else default_mean for i in query['X6'].values]
    query['X8'] = [train_X8.loc[i] if i in train_X8.index else default_mean for i in query['X8'].values]
    
    #Adding the interaction features
    for cols in corr_columns: 
        if len(cols) == 2:
            query[str(cols[0])+'_add_'+str(cols[1])] = query[cols[0]]+query[cols[1]]
        else:
            query[str(cols[0])+'_add_'+str(cols[1])+'_add_'+str(cols[2])] = query[cols[0]]+query[cols[1]]+query[cols[2]]
    
    query_norm = mean_scaler.transform(query)
    query_norm = pd.DataFrame(query_norm,columns = query.columns)
    
    #Adding PCA features
    query_pca = pca.transform(query_numeric_norm)
    for i in range(1,7):
        query_norm['pca'+str(i)] = query_pca[:,i-1]
        query_norm['pca'+str(i)] = query_pca[:,i-1]
    
    #Predicting the target value
    predicted = mean_correlated_extratrees.predict(query_norm)
    return predicted[0]

In [13]:
target_predicted = prediction(query_point)

In [14]:
print('The predicted target value for given query point is ',target_predicted,' seconds')

The predicted target value for given query point is  75.86102607550015  seconds


#### Calculating the metrics

In [15]:
#Taking 10 sample points from the train dataset to calculate R2 score.
random_index = random.choices(list(train.index),k = 10)

In [16]:
#Getting random points from dataset which will be a numpy array
random_points = train.iloc[random_index].values

In [17]:
def metric(input):
    
    df = pd.DataFrame(input,columns = train.columns)
    y_true = df['y'].values
    df.drop(['y'],axis = 1,inplace = True)
    
    #Removing the columns
    df.drop(removable_columns,axis = 1,inplace = True)
    
    #Getting Remaining Numerical columns for pca transformation
    numeric = list(train.select_dtypes(include = 'int64').columns)
    numeric_col = []
    for i in numeric:
        if i in df.columns:
            numeric_col.append(i)

    df_numeric = df[numeric_col]
    df_numeric_norm = pca_scaler.transform(df_numeric)
    df_numeric_norm = pd.DataFrame(df_numeric_norm,columns = df_numeric.columns)
    
    #Mean Encoding of categorical variables
    df['X0'] = [train_X0.loc[i] for i in df['X0'].values]
    df['X1'] = [train_X1.loc[i] for i in df['X1'].values]
    df['X2'] = [train_X2.loc[i] for i in df['X2'].values]
    df['X3'] = [train_X3.loc[i] for i in df['X3'].values]
    df['X5'] = [train_X5.loc[i] for i in df['X5'].values]
    df['X6'] = [train_X6.loc[i] for i in df['X6'].values]
    df['X8'] = [train_X8.loc[i] for i in df['X8'].values]
    
    #Adding the interaction features
    for cols in corr_columns: 
        if len(cols) == 2:
            df[str(cols[0])+'_add_'+str(cols[1])] = df[cols[0]]+df[cols[1]]
        else:
            df[str(cols[0])+'_add_'+str(cols[1])+'_add_'+str(cols[2])] = df[cols[0]]+df[cols[1]]+df[cols[2]]
    
    df_norm = mean_scaler.transform(df)
    df_norm = pd.DataFrame(df_norm,columns = df.columns)
    
    
    #Adding PCA features
    df_pca = pca.transform(df_numeric_norm)
    for i in range(1,7):
        df_norm['pca'+str(i)] = df_pca[:,i-1]
        df_norm['pca'+str(i)] = df_pca[:,i-1]

        
    #Predicting the target value
    y_pred = mean_correlated_extratrees.predict(df_norm)
    
    #Calculating r2 score:
    R2_score = r2_score(y_true,y_pred)
    
    return R2_score

In [18]:
R2_score = metric(random_points)

In [19]:
print('The R2 score for given points is ',R2_score)

The R2 score for given points is  0.6830123625773366
