In [1]:
import pandas as pd
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from operator import itemgetter
import statsmodels.api as sm
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [2]:
data1 = pd.read_csv('only_label.csv')

In [3]:
data1.columns.to_list()

['Customer_care_calls',
 'Customer_rating',
 'Cost_of_the_Product',
 'Prior_purchases',
 'Discount_offered',
 'Weight_in_gms',
 'Product_importance_labeled',
 'Warehouse_block_A',
 'Warehouse_block_B',
 'Warehouse_block_C',
 'Warehouse_block_D',
 'Warehouse_block_F',
 'Gender_F',
 'Gender_M',
 'Mode_of_Shipment_Flight',
 'Mode_of_Shipment_Road',
 'Mode_of_Shipment_Ship',
 'is_shipping_late']

# pemisahan data target dan fitur

In [4]:
X = data1.drop(columns='is_shipping_late')
y = data1['is_shipping_late']

# split data dan deklarasi model

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)
logreg = LogisticRegression(random_state=1, max_iter=150)

# feature selection

In [12]:
n_features_to_select = 10
rfe = RFE(logreg, n_features_to_select=n_features_to_select)
rfe.fit(X_train, y_train)

from operator import itemgetter
features = X_train.columns.to_list()
for i, j in (sorted(zip(rfe.ranking_ , features), key=itemgetter(0))):
    print(i, j)

1 Customer_care_calls
1 Discount_offered
1 Product_importance_labeled
1 Warehouse_block_A
1 Warehouse_block_B
1 Warehouse_block_D
1 Gender_M
1 Mode_of_Shipment_Flight
1 Mode_of_Shipment_Road
1 Mode_of_Shipment_Ship


In [13]:
temp = ['Customer_care_calls','Discount_offered','Product_importance_labeled',
        'Warehouse_block_A','Warehouse_block_B','Warehouse_block_D','Gender_M',
        'Mode_of_Shipment_Flight','Mode_of_Shipment_Road','Mode_of_Shipment_Ship']
X = data1[temp]
y = data1.is_shipping_late

In [14]:
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.556884
         Iterations 8
                              Results: Logit
Model:                 Logit               Pseudo R-squared:    0.174     
Dependent Variable:    is_shipping_late    AIC:                 12270.3412
Date:                  2022-02-21 15:47    BIC:                 12343.3968
No. Observations:      10999               Log-Likelihood:      -6125.2   
Df Model:              9                   LL-Null:             -7417.0   
Df Residuals:          10989               LLR p-value:         0.0000    
Converged:             1.0000              Scale:               1.0000    
No. Iterations:        8.0000                                             
--------------------------------------------------------------------------
                            Coef.  Std.Err.    z    P>|z|   [0.025  0.975]
--------------------------------------------------------------------------
Customer_care_calls        -

In [15]:
temp = ['Discount_offered','Mode_of_Shipment_Flight','Mode_of_Shipment_Road',
        'Mode_of_Shipment_Ship']
final_X = data1[temp]
final_y = data1.is_shipping_late

In [16]:
logit_model=sm.Logit(final_y,final_X)
result=logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.557109
         Iterations 8
                             Results: Logit
Model:                 Logit              Pseudo R-squared:   0.174     
Dependent Variable:    is_shipping_late   AIC:                12263.2838
Date:                  2022-02-21 15:48   BIC:                12292.5061
No. Observations:      10999              Log-Likelihood:     -6127.6   
Df Model:              3                  LL-Null:            -7417.0   
Df Residuals:          10995              LLR p-value:        0.0000    
Converged:             1.0000             Scale:              1.0000    
No. Iterations:        8.0000                                           
------------------------------------------------------------------------
                         Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
------------------------------------------------------------------------
Discount_offered         0.1257   0.0043  29.0741 0

In [17]:
X_train, X_test, y_train, y_test = train_test_split(final_X, final_y, test_size=.3,  random_state=0)
logreg.fit(X_train, y_train)

LogisticRegression(max_iter=150, random_state=1)

In [18]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [53]:
# define models and parameters
model = LogisticRegression(random_state=1)
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l1', 'l2', 'elasticnet']
c_values = [100, 10, 1.0, 0.1, 0.01]

In [54]:
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.623503 using {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
0.000000 (0.000000) with: {'C': 100, 'penalty': 'l1', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 100, 'penalty': 'l1', 'solver': 'lbfgs'}
0.623503 (0.015158) with: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
0.623503 (0.015158) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.623503 (0.015158) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.623503 (0.015158) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.000000 (0.000000) with: {'C': 100, 'penalty': 'elasticnet', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 100, 'penalty': 'elasticnet', 'solver': 'lbfgs'}
0.000000 (0.000000) with: {'C': 100, 'penalty': 'elasticnet', 'solver': 'liblinear'}
0.000000 (0.000000) with: {'C': 10, 'penalty': 'l1', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 10, 'penalty': 'l1', 'solver': 'lbfgs'}
0.623503 (0.015158) with: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}

750 fits failed out of a total of 1350.
The score on these train-test partitions for these parameters will be set to 0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\zakik\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\zakik\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\zakik\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

----------------------

In [55]:
grid_result.best_params_

{'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}

In [56]:
grid_result.best_estimator_

LogisticRegression(C=100, penalty='l1', random_state=1, solver='liblinear')

In [59]:
final_model = LogisticRegression(C=100, penalty='l1', random_state=1, solver='liblinear')
final_model.fit(X_train, y_train)

LogisticRegression(C=100, penalty='l1', random_state=1, solver='liblinear')

In [60]:
predict = final_model.predict(X_train)

In [61]:
accuracy_score(y_train, predict)

0.6235874788933627

In [62]:
predict = final_model.predict(X_test)

In [63]:
accuracy_score(y_test, predict)

0.603030303030303