In [1]:
# import relevant libraries
import numpy as np
import pandas as pd
import pickle

In [2]:
# for model
with open('model.pkl','rb') as file_1:
    my_model = pickle.load(file_1)

# for capping
with open('capping.pkl', 'rb') as file_2:
    capping = pickle.load(file_2)

# for PCA
with open('pca_function.pkl', 'rb') as file_3:
    pca_fix = pickle.load(file_3)

# for preprocessing
with open('preprocessing.pkl', 'rb') as file_4:
  preprocessing = pickle.load(file_4)

# for pipeline
with open('pipeline.pkl', 'rb') as file_5:
  pipe_line = pickle.load(file_5)

In [3]:
# keep dataset in dataframe
 # load csv file
main_data = pd.read_csv('online_shoppers_intention.csv')

In [8]:
def create_inference_data(dataset):
    # define random seed
    np.random.seed(42)
    # keep nama kolom tiap dataset
    dataset_columns = dataset.columns.tolist()
    # create empty dictionary kosong
    data_output = {}
    # columns containing integer values
    dataset_int_columns = ['Administrative','Informational','ProductRelated','Browser','TrafficType']
    column_drop = ['OperatingSystems','ProductRelated_Duration','SpecialDay','Region','Administrative_Duration','Informational_Duration','Weekend','Revenue']
    for column in dataset_columns:
        if column == 'Month':
            item_value = np.random.choice(['Feb', 'Mar', 'May', 'Oct', 'June', 'Jul', 'Aug', 'Nov', 'Sep','Dec'],20)
            item_value_list = item_value.tolist()
            # insert items into columns
            data_output[column]=item_value_list
        elif column == 'VisitorType':
            item_value = np.random.choice(['Returning_Visitor', 'New_Visitor', 'Other'],20)
            item_value_list = item_value.tolist()
            # insert items into columns            
            data_output[column]=item_value_list
        elif column in column_drop:
            pass
        elif column in dataset_int_columns:
            max_value = dataset[column].max()
            min_value = dataset[column].min()
            item_value = np.random.randint(min_value,max_value,20)
            item_value_list = item_value.tolist()
            # insert items into columns
            data_output[column]=item_value_list
        else:
            # set the range of randomized data (nilai minimum dan maximum)
            max_value = dataset[column].max()
            min_value = dataset[column].min()
            item_value = np.random.uniform(min_value,max_value,20)
            # konversi ke list
            item_value_list = item_value.tolist()
            # masukkan item yang dirandom dalam key columns
            data_output[column]=item_value_list
    return data_output

In [9]:
# call the function
inf_data = create_inference_data(main_data)

# buat dataframe
inf_df = pd.DataFrame(inf_data)

# show values inside the dataframe
inf_df

Unnamed: 0,Administrative,Informational,ProductRelated,BounceRates,ExitRates,PageValues,Month,Browser,TrafficType,VisitorType
0,6,1,484,0.078212,0.193051,134.148598,Oct,9,9,New_Visitor
1,19,23,646,0.036447,0.121407,241.962514,June,11,17,New_Visitor
2,14,11,20,0.151072,0.0552,240.906564,Aug,9,17,Other
3,10,5,166,0.085031,0.059255,213.9101,Aug,4,12,Other
4,7,1,273,0.041588,0.033053,99.384384,Oct,9,7,Other
5,20,20,387,0.11354,0.003127,203.037522,Aug,3,2,Other
6,6,0,600,0.006263,0.08468,138.529059,May,7,3,Returning_Visitor
7,25,11,315,0.168457,0.078976,351.530204,Jul,6,17,Other
8,18,21,13,0.089951,0.058698,307.106242,Mar,8,5,New_Visitor
9,22,11,241,0.07903,0.002816,261.095572,Dec,11,17,Returning_Visitor


In [10]:
# define X inference
X_inf = inf_df

In [11]:
# capping
X_inf_capped =capping.transform(X_inf)

In [13]:
X_inf_capped

Unnamed: 0,Administrative,Informational,ProductRelated,BounceRates,ExitRates,PageValues,Month,Browser,TrafficType,VisitorType
0,6,1,484,0.078212,0.193051,86.65657,Oct,9,9,New_Visitor
1,19,23,646,0.036447,0.121407,86.65657,June,10,17,New_Visitor
2,14,11,20,0.151072,0.0552,86.65657,Aug,9,17,Other
3,10,5,166,0.085031,0.059255,86.65657,Aug,4,12,Other
4,7,1,273,0.041588,0.033053,86.65657,Oct,9,7,Other
5,20,20,387,0.11354,0.003127,86.65657,Aug,3,2,Other
6,6,0,600,0.006263,0.08468,86.65657,May,7,3,Returning_Visitor
7,25,11,315,0.168457,0.078976,86.65657,Jul,6,17,Other
8,18,21,13,0.089951,0.058698,86.65657,Mar,8,5,New_Visitor
9,22,11,241,0.07903,0.002816,86.65657,Dec,10,17,Returning_Visitor


In [18]:
# preprocess and encoding categorical data
X_inf_encoded_scaled = pipe_line.transform(X_inf_capped)

# return back to dataframe
column = ['Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June', 'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep', 'VisitorType_New_Visitor', 'VisitorType_Other', 'VisitorType_Returning_Visitor', 'Administrative', 'Informational', 'ProductRelated', 'BounceRates', 'ExitRates', 'PageValues', 'Browser', 'TrafficType']

# make dataframe
X_inf_encoded_scaled_final = pd.DataFrame(X_inf_encoded_scaled, columns=column)

In [20]:
# prepare for PCA
X_inf_for_pca = X_inf_encoded_scaled_final.iloc[:, 12:]

# drop for replacing with PCA
X_inf_encoded_scaled_pca_final = X_inf_encoded_scaled_final.drop(X_inf_encoded_scaled_final.columns[12:], axis=1)

In [25]:
# PCA
X_inf_scaled_pca = pca_fix.transform(X_inf_for_pca)

# make back into dataframe
pca_inf_df = pd.DataFrame(X_inf_scaled_pca, columns=[f"PC{i+1}" for i in range(X_inf_scaled_pca.shape[1])])

# concantenate all data into one dataframe
X_inf_combined = pd.concat([X_inf_encoded_scaled_pca_final , pca_inf_df], axis=1)

# show the dataset
X_inf_combined 

Unnamed: 0,Month_Aug,Month_Dec,Month_Feb,Month_Jul,Month_June,Month_Mar,Month_May,Month_Nov,Month_Oct,Month_Sep,VisitorType_New_Visitor,VisitorType_Other,PC1,PC2,PC3,PC4,PC5
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.513808,0.957888,0.390443,0.79133,0.825914
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.726467,0.56971,0.869264,0.729033,0.834747
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.650156,0.812516,0.814242,0.643607,0.876133
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.727461,0.588865,0.435135,0.195895,0.917109
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.851664,0.364357,0.365518,0.787969,0.765739
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.802725,0.348426,-0.079826,0.211256,1.01639
6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.064746,-0.116876,0.163361,0.669459,0.843288
7,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.593602,0.853412,0.713097,0.32045,1.010122
8,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.751652,0.522636,0.205838,0.701163,0.882645
9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.00592,-0.089873,0.960728,0.744481,0.874452


In [29]:
# make prediction
predictions = my_model.predict(X_inf_combined)

# convert prediction
converted_predictions = [bool(pred) for pred in predictions]

# make into dataframe
converted_prediction_df = pd.DataFrame(converted_predictions,columns = ['Prediction Result'])

In [30]:
# show prediction result
converted_prediction_df 

Unnamed: 0,Prediction Result
0,False
1,True
2,True
3,True
4,True
5,True
6,True
7,False
8,True
9,True
