# Data Slinging Slasher

**DATA PREPROCESSING**



1. I Putu Agastya Harta P. (Feature Encoding)
2. Arsya Chairani (Feature Engineering)
3. Almas Rausan F. (Feature Engineering & Handling Outliers)
4. Ahdan A.R. (Feature Transformation)
5. Ahmad Afif M. (Feature Engineering)
6. Alfredo M. (Feature Transformation)
7. Riyandhika W. (Handling Class Imbalance)
		


In [1]:
import warnings
warnings.filterwarnings('ignore')

# Library for EDA & Visualisasi
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style as style
import matplotlib.ticker as mtick
style.use('fivethirtyeight')

# Library Preprocessing
from scipy.stats import chi2_contingency
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from imblearn import over_sampling

# Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, precision_recall_curve, auc, classification_report
from sklearn.metrics import ConfusionMatrixDisplay



In [2]:
# Mount files (Google Colab only) -- disabled, kept for reference.
# The snippet was previously stored in a bare string literal; it is now a
# real comment block so the interpreter no longer evaluates it.
#
# from pydrive.auth import GoogleAuth
# from google.colab import drive
# from pydrive.drive import GoogleDrive
# from google.colab import auth
# from oauth2client.client import GoogleCredentials
#
# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# drive = GoogleDrive(gauth)
#
# file_id = '1yvAnU5w-ybsZ9T-p78ObLRgH6ZnK75Lx'
# download = drive.CreateFile({'id': file_id})

# Read the raw data from a CSV file in the working directory
df = pd.read_csv('Train.csv')

In [3]:
# Separate the target variable from the features; the 'ID' column is
# dropped from the features as well.
y = df['Reached.on.Time_Y.N']
X = df.drop(columns=['ID', 'Reached.on.Time_Y.N'])

X.head()

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms
0,D,Flight,4,2,177,3,low,F,44,1233
1,F,Flight,4,5,216,2,low,M,59,3088
2,A,Flight,2,2,183,4,low,M,48,3374
3,B,Flight,3,3,176,4,medium,M,10,1177
4,C,Flight,2,2,184,3,medium,F,46,2484


# Feature Engineering

## 1. Profit

The dataset description states: "The company sells electronic products".

According to Investopedia, the average profit margin of an electronics store is about 3.5%, so that rate is used below to estimate the profit on each product: https://www.investopedia.com/ask/answers/051215/what-profit-margin-average-company-electronics-sector.asp

In [4]:
def profit_fe(data, profit_margin=0.035):
    """Add estimated profit features to a copy of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Cost_of_the_Product`` and
        ``Discount_offered``. The input frame is not modified.
    profit_margin : float, default 0.035
        Fraction of the product cost counted as profit.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with two extra columns:

        * ``profit`` -- ``Cost_of_the_Product * profit_margin``
        * ``profit_minus_discount`` -- ``profit`` minus
          ``Cost_of_the_Product * Discount_offered / 100``
          (the discount is divided by 100, i.e. treated as a percentage).
    """
    out = data.copy()
    cost = out['Cost_of_the_Product']
    out['profit'] = cost * profit_margin
    out['profit_minus_discount'] = out['profit'] - cost * out['Discount_offered'] / 100
    return out

## 2. Shipping Cost

https://www.worldbank.org/en/topic/transport/publication/air-freight-study#markets

Air freight rates generally range from $1.50–$4.50 per kilogram. The demand for air freight is limited by cost, typically priced 4–5 times that of road transport and 12–16 times that of sea transport. 

Assumptions derived from the figures above (rates in USD per kilogram):
1. Air freight generally costs 1.50–4.50 per kilogram -> use the midpoint, 3.00.
2. Air freight is priced 4–5 times road transport -> road transport = air freight / 4 = 0.75.
3. Air freight is priced 12–16 times sea transport -> sea transport = air freight / 15 = 0.20.

In [5]:
def shipping_cost(x):
    """Return the assumed freight rate per kilogram for shipment mode ``x``.

    'Flight' maps to 3.0 and 'Road' to 0.75; every other value falls back
    to 0.20.
    """
    known_rates = (('Flight', 3.0), ('Road', 0.75))
    for mode, rate in known_rates:
        if x == mode:
            return rate
    # Anything that is neither 'Flight' nor 'Road' gets the fallback rate.
    return 0.20

def shipping_cost_fe(data):
    """Return a copy of ``data`` with an added ``shipping_cost`` column.

    The cost is the per-kilogram rate of ``Mode_of_Shipment`` multiplied by
    ``Weight_in_gms`` and divided by 1000. The input frame is not modified.
    """
    out = data.copy()
    rate_per_kg = out['Mode_of_Shipment'].apply(shipping_cost)
    out['shipping_cost'] = rate_per_kg * out['Weight_in_gms'] / 1000
    return out

## 3. Expensive and important

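**TODO:** explain the rationale for this feature. As implemented below, a product is flagged as expensive when its cost is above the 75th percentile of `Cost_of_the_Product`, as important when `Product_importance` is `high`, and `expensive_important` marks products that are both.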

In [6]:
def expensive_important_fe(data):
    """Add 0/1 flags for expensive and important products to a copy of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Cost_of_the_Product`` and ``Product_importance``.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with three extra integer columns:

        * ``is_expensive`` -- 1 when the cost is strictly above the 75th
          percentile of ``Cost_of_the_Product`` in ``data``, else 0.
        * ``is_important`` -- 1 when ``Product_importance`` equals
          ``'high'``, else 0.
        * ``expensive_important`` -- 1 when both flags are 1, else 0.

    Notes
    -----
    The price threshold is computed from the frame that is passed in, so it
    differs between frames with different price distributions.
    """
    out = data.copy()
    q3_price = out['Cost_of_the_Product'].quantile(0.75)
    # Vectorised comparisons replace the per-row lambdas: same 0/1 result,
    # no Python-level loop, and an empty frame is handled without error.
    out['is_expensive'] = (out['Cost_of_the_Product'] > q3_price).astype('int64')
    out['is_important'] = (out['Product_importance'] == 'high').astype('int64')
    # Both operands are 0/1 integers, so bitwise AND is a logical AND here.
    out['expensive_important'] = out['is_expensive'] & out['is_important']
    return out

## 4. Repeat order

In [7]:
# Count how many rows have each Prior_purchases value
df['Prior_purchases'].value_counts()

3     3955
2     2599
4     2155
5     1287
6      561
10     178
7      136
8      128
Name: Prior_purchases, dtype: int64

In [8]:
def frequent_fe(data):
    """Return a copy of ``data`` with an integer ``cust_frequency`` bucket.

    ``Prior_purchases`` is cut at its own 25th and 75th percentiles:
    values up to the 25th percentile map to 0, values above it and up to
    the 75th percentile map to 1, and larger values map to 2.
    """
    out = data.copy()
    lower, upper = out['Prior_purchases'].quantile([0.25, 0.75])
    edges = [-np.inf, lower, upper, np.inf]
    buckets = pd.cut(out['Prior_purchases'], bins=edges, labels=[0, 1, 2])
    # pd.cut returns a categorical; store the labels as plain integers.
    out['cust_frequency'] = buckets.astype(int)
    return out

In [9]:
# Run the feature-engineering steps in order; each step returns a new frame.
X = (
    X
    .pipe(profit_fe)
    .pipe(shipping_cost_fe)
    .pipe(expensive_important_fe)
    .pipe(frequent_fe)
)

In [10]:
# Show column dtypes and non-null counts after feature engineering
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Warehouse_block        10999 non-null  object 
 1   Mode_of_Shipment       10999 non-null  object 
 2   Customer_care_calls    10999 non-null  int64  
 3   Customer_rating        10999 non-null  int64  
 4   Cost_of_the_Product    10999 non-null  int64  
 5   Prior_purchases        10999 non-null  int64  
 6   Product_importance     10999 non-null  object 
 7   Gender                 10999 non-null  object 
 8   Discount_offered       10999 non-null  int64  
 9   Weight_in_gms          10999 non-null  int64  
 10  profit                 10999 non-null  float64
 11  profit_minus_discount  10999 non-null  float64
 12  shipping_cost          10999 non-null  float64
 13  is_expensive           10999 non-null  int64  
 14  is_important           10999 non-null  int64  
 15  ex

In [11]:
# List all available columns before choosing the ones used for modelling
X.columns.tolist()

['Warehouse_block',
 'Mode_of_Shipment',
 'Customer_care_calls',
 'Customer_rating',
 'Cost_of_the_Product',
 'Prior_purchases',
 'Product_importance',
 'Gender',
 'Discount_offered',
 'Weight_in_gms',
 'profit',
 'profit_minus_discount',
 'shipping_cost',
 'is_expensive',
 'is_important',
 'expensive_important',
 'cust_frequency']

In [12]:
# Columns kept for modelling. 'expensive_important' is intentionally left out.
selected_cols = (
    # original dataset columns
    ['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls', 'Gender']
    + ['Discount_offered', 'Weight_in_gms']
    # engineered columns
    + ['profit', 'profit_minus_discount', 'shipping_cost']
    + ['is_expensive', 'is_important', 'cust_frequency']
)

In [13]:
# Keep only the selected columns, in the order given by selected_cols
X = X.loc[:, selected_cols]

In [14]:
# Train & test pisah
# Split into train and test sets: 20% held out, stratified on the target,
# with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Preprocessing

Scaling the numeric features (the scaler is fitted on the training split only)

In [15]:
# Numeric columns to standardise
num_cols = ['Customer_care_calls', 'Discount_offered', 'Weight_in_gms', 'profit', 'profit_minus_discount', 'shipping_cost']
sc = StandardScaler()

# X_train comes out of train_test_split as a selection of rows from X.
# Take an explicit copy before assigning into it so the write is not a
# chained assignment on a slice (the resulting pandas warning would be
# hidden here because warnings are globally ignored).
X_train = X_train.copy()
# Fit the scaler on the training split; `sc` is later reused via
# sc.transform in preprocessing().
X_train[num_cols] = sc.fit_transform(X_train[num_cols])

Encoding categorical

In [16]:
# Preview the object-dtype (categorical) columns that still need encoding
X_train.select_dtypes('O').head()

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Gender
7920,D,Road,F
1529,F,Ship,F
10521,B,Ship,M
9558,D,Ship,F
968,A,Flight,M


In [17]:
# Expected column layout after one-hot encoding. The first level of each
# categorical column is dropped, and the single Gender dummy is exposed
# under the plain name 'Gender'.
final_cols = ['Customer_care_calls',
 'Discount_offered',
 'Weight_in_gms',
 'profit',
 'profit_minus_discount',
 'shipping_cost',
 'is_expensive',
 'is_important',
 'cust_frequency',
 'Warehouse_block:B',
 'Warehouse_block:C',
 'Warehouse_block:D',
 'Warehouse_block:F',
 'Mode_of_Shipment:Road',
 'Mode_of_Shipment:Ship',
 'Gender']

X_train = pd.get_dummies(X_train, prefix_sep=':', drop_first=True)
# Rename only the Gender dummy instead of overwriting every column name by
# position: a positional overwrite would silently mislabel columns if the
# encoded layout ever differed from final_cols.
X_train = X_train.rename(columns={'Gender:M': 'Gender'})
assert X_train.columns.tolist() == final_cols, (
    'Encoded training columns do not match final_cols: '
    f'{X_train.columns.tolist()}'
)
X_train

Unnamed: 0,Customer_care_calls,Discount_offered,Weight_in_gms,profit,profit_minus_discount,shipping_cost,is_expensive,is_important,cust_frequency,Warehouse_block:B,Warehouse_block:C,Warehouse_block:D,Warehouse_block:F,Mode_of_Shipment:Road,Mode_of_Shipment:Ship,Gender
7920,-0.043121,-0.270466,0.322365,1.056631,0.162057,0.108009,1,0,0,0,0,1,0,1,0,0
1529,-0.921405,0.782420,-1.206929,-1.003783,-0.509747,-0.562238,0,0,0,0,0,0,1,0,1,0
10521,-0.921405,-0.270466,0.510849,0.224140,0.228949,-0.427212,0,0,0,1,0,0,0,0,1,1
9558,-0.921405,-0.704008,0.617330,-1.107845,0.670129,-0.418842,0,0,1,0,0,1,0,0,1,0
968,-1.799688,0.658551,0.004144,1.285566,-1.096872,1.982955,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,-0.043121,-0.270466,0.688930,-0.129668,0.257378,0.216061,0,0,0,1,0,0,0,1,0,0
4876,-0.043121,-0.394335,0.706065,0.557137,0.346313,-0.411867,0,0,0,0,1,0,0,0,1,0
3708,-0.043121,-0.456270,0.878638,-1.461653,0.492106,3.014049,0,0,1,0,0,1,0,0,0,1
6201,-0.043121,-0.456270,0.455772,0.848508,0.407732,-0.431542,0,0,0,1,0,0,0,0,1,0


In [18]:
def preprocessing(data):
    """Apply the fitted scaling and one-hot encoding to new data.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the same columns as the un-encoded training features
        (the ``num_cols`` columns plus the categorical columns). The input
        frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame whose columns are exactly ``final_cols``, in that order.

    Notes
    -----
    Uses the module-level ``sc`` (already fitted), ``num_cols`` and
    ``final_cols``. Dummy columns are aligned by name rather than by
    position, so the result is correct even when ``data`` does not contain
    every category level (for example a small batch or a single row).
    Levels that have no column in ``final_cols`` are dropped, and levels
    absent from ``data`` are filled with 0.
    """
    df = data.copy()
    df[num_cols] = sc.transform(df[num_cols])

    # Encode every level that is present. drop_first is not used because
    # which level gets dropped would depend on the levels found in `data`.
    df = pd.get_dummies(df, prefix_sep=':')
    # final_cols exposes the 'Gender:M' dummy under the plain name 'Gender'.
    df = df.rename(columns={'Gender:M': 'Gender'})
    # Align to the training layout: reference levels are discarded and
    # missing dummy columns are added as all-zero columns.
    df = df.reindex(columns=final_cols, fill_value=0)
    return df

In [19]:
# NOTE(review): the export step below is disabled -- it is wrapped in a
# string literal, so none of it executes. If it were enabled it would
# preprocess X_test, attach the target as 'arrived_late' and write
# 'trainfe.csv' / 'testfe.csv'. Confirm whether later cells expect those files.
'''X_train_exp = X_train.copy()

X_train_exp['arrived_late'] = y_train.copy()

X_test = preprocessing(X_test)
X_test_exp = X_test.copy()
X_test_exp['arrived_late'] = y_test.copy()

X_train_exp.to_csv('trainfe.csv', index = False)
X_test_exp.to_csv('testfe.csv', index = False)'''

# Use this data for modelling