In [None]:
import pandas as pd
import matplotlib
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the order export into the working frame `df`.
# The path is relative, so the file must sit next to wherever the kernel runs.
df = pd.read_csv("JD_order_data.csv")

In [None]:
# Show the first five rows of the loaded frame as a quick sanity check.
df.head(n=5)

Order Data:


Unnamed: 0,order_ID,user_ID,sku_ID,order_date,order_time,quantity,type,promise,original_unit_price,final_unit_price,direct_discount_per_unit,quantity_discount_per_unit,bundle_discount_per_unit,coupon_discount_per_unit,gift_item,dc_ori,dc_des
0,d0cf5cc6db,0abe9ef2ce,581d5b54c1,2018-03-01,2018-03-01 17:14:25.0,1,2,-,89.0,79.0,0.0,10.0,0.0,0.0,0,4,28
1,7444318d01,33a9e56257,067b673f2b,2018-03-01,2018-03-01 11:10:40.0,1,1,2,99.9,53.9,5.0,41.0,0.0,0.0,0,28,28
2,f973b01694,4ea3cf408f,623d0a582a,2018-03-01,2018-03-01 09:13:26.0,1,1,2,78.0,58.5,19.5,0.0,0.0,0.0,0,28,28
3,8c1cec8d4b,b87cb736cb,fc5289b139,2018-03-01,2018-03-01 21:29:50.0,1,1,2,61.0,35.0,0.0,26.0,0.0,0.0,0,4,28
4,d43a33c38a,4829223b6f,623d0a582a,2018-03-01,2018-03-01 19:13:37.0,1,1,1,78.0,53.0,19.0,0.0,0.0,6.0,0,3,16


In [None]:
# Column groups consumed by the preprocessing and modelling cells below.
# NOTE(review): none of these names appear in the order-data preview shown in
# this notebook (order_ID, user_ID, sku_ID, ...) -- confirm that the CSV being
# loaded is the one these lists were written for.

# Features that get standard-scaled.
num_cols = [
    "Age",
    "Annual_Income",
    "Monthly_Inhand_Salary",
    "Num_Bank_Accounts",
    "Num_Credit_Card",
    "Interest_Rate",
    "Num_of_Loan",
    "Delay_from_due_date",
    "Num_of_Delayed_Payment",
    "Changed_Credit_Limit",
    "Num_Credit_Inquiries",
    "Outstanding_Debt",
    "Credit_Utilization_Ratio",
    "Total_EMI_per_month",
    "Amount_invested_monthly",
    "Monthly_Balance",
    "Credit_History_Age",
]

# Features that get one-hot encoded.
cat_cols = [
    "Occupation",
    "Credit_Mix",
    "Spending_Level",
    "Payment_Value",
    "Payment_of_Min_Amount",
]

# Kept as a one-element list so it can be passed straight to `columns=`.
target_col = ["Credit_Score"]


SKU Data:


Unnamed: 0,sku_ID,type,brand_ID,attribute1,attribute2,activate_date,deactivate_date
0,a234e08c57,1,c3ab4bf4d9,3.0,60.0,,
1,6449e1fd87,1,1d8b4b4c63,2.0,50.0,,
2,09b70fcd83,2,eb7d2a675a,3.0,70.0,,
3,acad9fed04,2,9b0d3a5fc6,3.0,70.0,,
4,2fa77e3b4d,2,b681299668,-,-,,


In [None]:
# Replace the Credit_Score labels with integer codes.
# The column is overwritten in place, so every later cell that reads
# df["Credit_Score"] sees the encoded values; `label_encoder` keeps the fitted
# mapping.
label_encoder = LabelEncoder()
df["Credit_Score"] = label_encoder.fit(df["Credit_Score"]).transform(df["Credit_Score"])


User Data:


Unnamed: 0,user_ID,user_level,first_order_month,plus,gender,age,marital_status,education,city_level,purchase_power
0,000089d6a6,1,2017-08,0,F,26-35,S,3,4,3
1,0000babd1f,1,2018-03,0,U,U,U,-1,-1,-1
2,0000bc018b,3,2016-06,0,F,>=56,M,3,2,3
3,0000d0e5ab,3,2014-06,0,M,26-35,M,3,2,2
4,0000dce472,3,2012-08,1,U,U,U,-1,-1,-1


In [None]:
# Preprocessing step: each column group is routed to its own transformer.
#   "num" -> StandardScaler on the numeric columns
#   "cat" -> OneHotEncoder on the categorical columns; handle_unknown="ignore"
#            is passed so categories unseen during fit do not raise at
#            transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)


Click Data:


Unnamed: 0.1,Unnamed: 0,sku_ID,user_ID,request_time
0,0,"['d829f03a28', 'd829f03a28', '725a03cfc9', 'ae...",e4e711238a,"['2018-02-28 23:59:01', '2018-02-28 23:59:47',..."
1,1,"['a0e49f9966', 'cdee05b50c', 'cdee05b50c', 'cd...",7a89b29ba5,"['2018-02-28 23:59:01', '2018-02-28 23:59:29',..."
2,2,"['6a0f1004bb', 'aad5dee442', 'fbce41fd82', 'b1...",e03f8c6d4e,"['2018-02-28 23:59:02', '2018-03-01 00:01:15',..."
3,3,"['2f268cf558', '5c15aed2ae', '5c15aed2ae', '5c...",ba189a22b7,"['2018-02-28 23:59:02', '2018-03-01 00:01:31',..."
4,4,"['f8732ae527', 'f8732ae527', 'f8732ae527']",06437cb07a,"['2018-02-28 23:59:02', '2018-03-01 00:01:02',..."


In [None]:
# Fit the preprocessor on every column except the target and wrap the result
# in a DataFrame whose columns are the generated feature names.
X_transformed = preprocessor.fit_transform(df.drop(columns=target_col))

# ColumnTransformer may hand back a SciPy sparse matrix when the one-hot block
# makes the stacked output sparse enough (see its `sparse_threshold` option).
# pd.DataFrame cannot unpack that into named columns, so densify first.
if hasattr(X_transformed, "toarray"):
    X_transformed = X_transformed.toarray()

# Reuse df's index so X stays row-aligned with y = df["Credit_Score"].
X = pd.DataFrame(
    X_transformed,
    columns=preprocessor.get_feature_names_out(),
    index=df.index,
)


In [None]:
# Target vector: the Credit_Score column of df as a Series.
y = df.loc[:, "Credit_Score"]

In [None]:
# Random forest used both as the RFE estimator and for the importance ranking
# further down; the fixed seed keeps runs repeatable.
model1 = RandomForestClassifier(
    random_state=42,
)

In [None]:
# Recursive Feature Elimination driven by model1, keeping two features.
# `rfe` holds the fitted selector; `X_selected` is X reduced to the kept columns.
rfe = RFE(estimator=model1, n_features_to_select=2)
X_selected = rfe.fit(X, y).transform(X)

In [None]:
# Map the RFE boolean mask back onto X's column labels and report them.
selected_features = X.columns[rfe.get_support()]
print(f"Selected Features: {selected_features.tolist()}")

In [None]:
# Second random forest with the same fixed seed as model1.
# NOTE(review): no later cell visible in this notebook references model2 --
# confirm whether the training cell below was meant to use it.
model2 = RandomForestClassifier(
    random_state=42,
)

In [None]:
# Fit model1 on the full transformed feature matrix and the encoded target.
# Left as the cell's last expression so the fitted estimator is displayed.
model1.fit(X=X, y=y)

In [None]:
# Pull the per-feature importance scores from the fitted forest ...
feature_importances = model1.feature_importances_

# ... label them with X's column names and order them from highest to lowest.
feature_ranking = (
    pd.Series(data=feature_importances, index=X.columns)
    .sort_values(ascending=False)
)

In [None]:
# Print a header line followed by the full importance ranking.
print("Feature Importances:", feature_ranking, sep="\n")