In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style as style
import matplotlib.ticker as mtick
from scipy.stats import chi2_contingency
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.pipeline import Pipeline
# from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, precision_recall_curve, auc, classification_report
# from sklearn.metrics import ConfusionMatrixDisplay

style.use('fivethirtyeight')

In [2]:
df = pd.read_csv('Train.csv')
X = df.drop(['ID', 'Reached.on.Time_Y.N'], axis = 1)
y = df['Reached.on.Time_Y.N']

X.head()

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms
0,D,Flight,4,2,177,3,low,F,44,1233
1,F,Flight,4,5,216,2,low,M,59,3088
2,A,Flight,2,2,183,4,low,M,48,3374
3,B,Flight,3,3,176,4,medium,M,10,1177
4,C,Flight,2,2,184,3,medium,F,46,2484


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [4]:
stat_desc = X_train.describe().T
stat_desc.columns = ['count', 'mean', 'std', 'min', 'q1', 'median', 'q3', 'max']
stat_desc['IQR'] = stat_desc['q3'] - stat_desc['q1']
stat_desc

Unnamed: 0,count,mean,std,min,q1,median,q3,max,IQR
Customer_care_calls,8799.0,4.049096,1.138649,2.0,3.0,4.0,5.0,7.0,2.0
Customer_rating,8799.0,2.997386,1.411194,1.0,2.0,3.0,4.0,5.0,2.0
Cost_of_the_Product,8799.0,210.230367,48.051317,96.0,170.0,214.0,251.0,310.0,81.0
Prior_purchases,8799.0,3.581998,1.532722,2.0,3.0,3.0,4.0,10.0,1.0
Discount_offered,8799.0,13.366974,16.147015,1.0,4.0,7.0,10.0,65.0,6.0
Weight_in_gms,8799.0,3631.227867,1634.180615,1001.0,1838.0,4140.0,5045.0,7846.0,3207.0


# 1. Outlier based on IQR

In [5]:
iqr = stat_desc.copy()

# Max right value = Q3 + 1.5 IQR
iqr['max_right'] = iqr['q3'] + 1.5 * iqr['IQR']

# Max left value = Q1 - 1.5 IQR
iqr['max_left'] = iqr['q1'] - 1.5 * iqr['IQR']

iqr['left_outlier'] = iqr['min'] < iqr['max_left']
iqr['right_outlier'] = iqr['max'] > iqr['max_right']

iqr

Unnamed: 0,count,mean,std,min,q1,median,q3,max,IQR,max_right,max_left,left_outlier,right_outlier
Customer_care_calls,8799.0,4.049096,1.138649,2.0,3.0,4.0,5.0,7.0,2.0,8.0,0.0,False,False
Customer_rating,8799.0,2.997386,1.411194,1.0,2.0,3.0,4.0,5.0,2.0,7.0,-1.0,False,False
Cost_of_the_Product,8799.0,210.230367,48.051317,96.0,170.0,214.0,251.0,310.0,81.0,372.5,48.5,False,False
Prior_purchases,8799.0,3.581998,1.532722,2.0,3.0,3.0,4.0,10.0,1.0,5.5,1.5,False,True
Discount_offered,8799.0,13.366974,16.147015,1.0,4.0,7.0,10.0,65.0,6.0,19.0,-5.0,False,True
Weight_in_gms,8799.0,3631.227867,1634.180615,1001.0,1838.0,4140.0,5045.0,7846.0,3207.0,9855.5,-2972.5,False,False


In [6]:
filterprior = df['Prior_purchases'] <= iqr['max_right'].Prior_purchases
filterdisc = df['Discount_offered'] <= iqr['max_right'].Discount_offered

iqrfiltered = X_train[filterprior & filterdisc]
iqrfiltered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6341 entries, 7920 to 7187
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Warehouse_block      6341 non-null   object
 1   Mode_of_Shipment     6341 non-null   object
 2   Customer_care_calls  6341 non-null   int64 
 3   Customer_rating      6341 non-null   int64 
 4   Cost_of_the_Product  6341 non-null   int64 
 5   Prior_purchases      6341 non-null   int64 
 6   Product_importance   6341 non-null   object
 7   Gender               6341 non-null   object
 8   Discount_offered     6341 non-null   int64 
 9   Weight_in_gms        6341 non-null   int64 
dtypes: int64(6), object(4)
memory usage: 544.9+ KB


  after removing the cwd from sys.path.


# 2. Set Cap at 90th Percentile

In [7]:
cap = X_train.copy()
capdesc = cap.describe(percentiles = [.10,.50,.90]).T
capdesc

Unnamed: 0,count,mean,std,min,10%,50%,90%,max
Customer_care_calls,8799.0,4.049096,1.138649,2.0,3.0,4.0,6.0,7.0
Customer_rating,8799.0,2.997386,1.411194,1.0,1.0,3.0,5.0,5.0
Cost_of_the_Product,8799.0,210.230367,48.051317,96.0,144.0,214.0,269.0,310.0
Prior_purchases,8799.0,3.581998,1.532722,2.0,2.0,3.0,5.0,10.0
Discount_offered,8799.0,13.366974,16.147015,1.0,2.0,7.0,43.0,65.0
Weight_in_gms,8799.0,3631.227867,1634.180615,1001.0,1326.8,4140.0,5632.0,7846.0


In [8]:
prior_90 = capdesc['90%'].Prior_purchases
disc_90 = capdesc['90%'].Discount_offered

capfiltered = X_train.copy()

capfiltered['Prior_purchases'] = capfiltered['Prior_purchases'].apply(lambda x: prior_90 if x > prior_90 else x)
capfiltered['Discount_offered'] = capfiltered['Discount_offered'].apply(lambda x: disc_90 if x > disc_90 else x)

capfiltered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8799 entries, 7920 to 7187
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      8799 non-null   object 
 1   Mode_of_Shipment     8799 non-null   object 
 2   Customer_care_calls  8799 non-null   int64  
 3   Customer_rating      8799 non-null   int64  
 4   Cost_of_the_Product  8799 non-null   int64  
 5   Prior_purchases      8799 non-null   float64
 6   Product_importance   8799 non-null   object 
 7   Gender               8799 non-null   object 
 8   Discount_offered     8799 non-null   float64
 9   Weight_in_gms        8799 non-null   int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 756.2+ KB


# To_csv()

In [9]:
print(X_train.columns.tolist())

['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases', 'Product_importance', 'Gender', 'Discount_offered', 'Weight_in_gms']


In [10]:
X_train.to_csv('X_train.csv', index = False)
iqrfiltered.to_csv('Train_IQR_filtered.csv', index = False)
capfiltered.to_csv('Train_capfiltered.csv', index = False)

# Tambahkan lagi y_train ke data untuk imbalance

In [11]:
capfiltered['capfiltered_class'] = y_train.copy().reindex(capfiltered.index)
capfiltered['capfiltered_class'].value_counts()

1    5250
0    3549
Name: capfiltered_class, dtype: int64

In [12]:
iqrfiltered['iqr_filtered_class'] = y_train.copy().reindex(iqrfiltered.index)
iqrfiltered['iqr_filtered_class'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


0    3200
1    3141
Name: iqr_filtered_class, dtype: int64

In [13]:
X_train['X_train_class'] = y_train.copy().reindex(X_train.index)
X_train['X_train_class'].value_counts()

1    5250
0    3549
Name: X_train_class, dtype: int64