In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import plotly.graph_objs as go
from xgboost import XGBClassifier
from sklearn.svm import SVC

In [54]:
df = pd.read_csv('full_weaving_dataset.csv').dropna(
    subset=['warp_count', 'weft_count', 'epi', 'ppi', 'Req_Finish_Fabrics', 'Total_pdn_per_order']
)

In [55]:
df.head(60)

Unnamed: 0,ID,Month,Construction,Req_Finish_Fabrics,Fabric_Allowance,Rec_Beam_length(yds),Shrink_allow,act_shrink%,Previous_pdn,Req_grey_fabric,Req_beam_length(yds),Total_pdn_m/c,Rej_and_cut_Piece,Total_pdn_per_order,warp_count,weft_count,epi,ppi
0,12207-8,January,40+40/2/40/110x80,31300,6.0,5752.336,12.5,12.26173158,5047,33297.87234,34797.6511,5047.0,0,0,double,80,110,80
1,12207-8,January,40+40/2/40/110x80,31300,6.0,5883.568,12.5,64.12041129,1952,33297.87234,34797.6511,2111.0,0,0,double,80,110,80
2,12207-8,January,40+40/2/40/110x80,31300,6.0,3094.888,12.5,24.13295732,2207,33297.87234,34797.6511,2348.0,0,0,double,80,110,80
3,12207-8,January,40+40/2/40/110x80,31300,6.0,5894.504,12.5,14.734132,5026,33297.87234,34797.6511,5026.0,0,0,double,80,110,80
4,12207-8,January,40+40/2/40/110x80,31300,6.0,5850.76,12.5,21.46319453,4391,33297.87234,34797.6511,4595.0,0,0,double,80,110,80
5,12207-8,January,40+40/2/40/110x80,31300,6.0,5905.44,12.5,22.69839335,4340,33297.87234,34797.6511,4565.0,0,0,double,80,110,80
6,12207-8,January,40+40/2/40/110x80,31300,6.0,5905.44,12.5,33.04478582,3754,33297.87234,34797.6511,3954.0,0,0,double,80,110,80
7,12207-8,January,40+40/2/40/110x80,31300,6.0,38286.936,12.5,27.79260268,TOTAL,33297.87234,34797.6511,27646.0,285,27982,double,80,110,80
8,12228-1,January,40x40/110x90,10450,7.0,1443.552,14.5,9.80581233,1302,11236.55914,12017.35043,1302.0,0,0,40,40,110,90
9,12228-1,January,40x40/110x90,10450,7.0,1749.76,14.5,16.33138259,1464,11236.55914,12017.35043,1464.0,0,0,40,40,110,90


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121148 entries, 0 to 121147
Data columns (total 18 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   ID                    121148 non-null  object 
 1   Month                 121148 non-null  object 
 2   Construction          121148 non-null  object 
 3   Req_Finish_Fabrics    121148 non-null  int64  
 4   Fabric_Allowance      121148 non-null  float64
 5   Rec_Beam_length(yds)  121148 non-null  float64
 6   Shrink_allow          121148 non-null  float64
 7   act_shrink%           121148 non-null  object 
 8   Previous_pdn          107511 non-null  object 
 9   Req_grey_fabric       121148 non-null  float64
 10  Req_beam_length(yds)  121148 non-null  float64
 11  Total_pdn_m/c         107518 non-null  float64
 12  Rej_and_cut_Piece     121148 non-null  int64  
 13  Total_pdn_per_order   121148 non-null  int64  
 14  warp_count            121148 non-null  object 
 15  

In [57]:
# Drop rows with invalid or missing values in key fields
# The 'act_shrink%' column has string 'na' for missing values, drop those rows
df = df[df['act_shrink%'] != 'na'].copy()
df['act_shrink%'] = df['act_shrink%'].astype(float)

In [58]:
# Drop remaining rows with any missing numeric fields
df = df.dropna(subset=['Previous_pdn', 'Total_pdn_m/c'])

In [102]:
df['Rejected'] = (
    (df['act_shrink%'] > 20) | 
    (df['Rej_and_cut_Piece'] > df['Total_pdn_per_order']*0.10)
).astype(int)

In [104]:
df.tail(60)

Unnamed: 0,ID,Month,Construction,Req_Finish_Fabrics,Fabric_Allowance,Rec_Beam_length(yds),Shrink_allow,act_shrink%,Previous_pdn,Req_grey_fabric,Req_beam_length(yds),Total_pdn_m/c,Rej_and_cut_Piece,Total_pdn_per_order,warp_count,weft_count,epi,ppi,Rejected
121046,SF-13244,September,40x40/130x100,83449,7.0,109.36,10.2,35.076811,71,89730.10753,91369.94215,71.0,0,0,40,40,130,100,1
121048,SF-13244,September,40x40/130x100,83449,7.0,109.36,10.2,35.076811,TOTAL,89730.10753,91369.94215,71.0,0,70,40,40,130,100,1
121049,SF-13245,September,40x40/130x100,83449,7.0,109.36,10.2,37.820044,68,89730.10753,91369.94215,68.0,0,0,40,40,130,100,1
121051,SF-13245,September,40x40/130x100,83449,7.0,109.36,10.2,37.820044,TOTAL,89730.10753,91369.94215,68.0,0,68,40,40,130,100,1
121052,SF-13246,September,40x40/130x90,83449,7.0,114.828,10.2,32.072317,78,89730.10753,91369.94215,78.0,0,0,40,40,130,90,1
121054,SF-13246,September,40x40/130x90,83449,7.0,114.828,10.2,32.072317,TOTAL,89730.10753,91369.94215,78.0,2,71,40,40,130,90,1
121055,SF-13248,September,40x40/130x100,83449,7.0,103.892,10.2,34.547415,68,89730.10753,91369.94215,68.0,0,0,40,40,130,100,1
121057,SF-13248,September,40x40/130x100,83449,7.0,103.892,10.2,34.547415,TOTAL,89730.10753,91369.94215,68.0,0,66,40,40,130,100,1
121058,SF-13249,September,40x40/130x100,83449,7.0,109.36,10.2,36.905633,69,89730.10753,91369.94215,69.0,0,0,40,40,130,100,1
121060,SF-13249,September,40x40/130x100,83449,7.0,109.36,10.2,36.905633,TOTAL,89730.10753,91369.94215,69.0,0,70,40,40,130,100,1


In [105]:
# The dataset contains multiple rows per batch (ID + Month), with 'TOTAL' row summing each batch.
# We focus on one row per batch by selecting the rows where Previous_pdn == 'TOTAL' (aggregated totals).
df_batches = df[df['Previous_pdn'] == 'TOTAL'].copy()

In [106]:
# Drop unused or redundant columns: ID, Previous_pdn (all 'TOTAL'), and the raw Rej_and_cut_Piece
df_batches = df_batches.drop(columns=['ID','Previous_pdn','Rej_and_cut_Piece'])

In [107]:
# 2. Feature engineering
# At this stage, the dataset has one row per batch. We already have numeric features:
# 'Req_Finish_Fabrics', 'Fabric_Allowance', 'Rec_Beam_length(yds)', 'Shrink_allow', 'act_shrink%', 
# 'Req_grey_fabric', 'Req_beam_length(yds)', 'Total_pdn_m/c', 'Total_pdn_per_order', 'warp_count',
# 'weft_count', 'epi', 'ppi', and the binary 'rejected' label.

# No new features engineered here beyond the preprocessing step above.

# 3. Encode categorical features and prepare training data
# One-hot encode 'Month', 'warp_count', 'weft_count', and 'Construction'
# and drop them after encoding. Also convert numeric columns to proper dtype.
df_batches['act_shrink%'] = df_batches['act_shrink%'].astype(float)

In [108]:
# One-hot encode Month
month_dummies = pd.get_dummies(df_batches['Month'], prefix='Month')
df_batches = pd.concat([df_batches, month_dummies], axis=1)
df_batches = df_batches.drop(columns=['Month'])

In [109]:
# One-hot encode warp_count and weft_count
warp_dummies = pd.get_dummies(df_batches['warp_count'], prefix='warp')
weft_dummies = pd.get_dummies(df_batches['weft_count'], prefix='weft')
df_batches = pd.concat([df_batches, warp_dummies, weft_dummies], axis=1)
df_batches = df_batches.drop(columns=['warp_count','weft_count'])

In [110]:
 #One-hot encode Construction
const_dummies = pd.get_dummies(df_batches['Construction'], prefix='Const')
df_batches = pd.concat([df_batches, const_dummies], axis=1)
df_batches = df_batches.drop(columns=['Construction'])

In [111]:
# Separate features X and target y
X = df_batches.drop(columns=['Rejected'])
y = df_batches['Rejected']

# 4. Train/test split for model evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [112]:
# (a) Logistic Regression
lr = LogisticRegression(max_iter=1000, class_weight='balanced', solver='liblinear')
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
y_proba_lr = lr.predict_proba(X_test)[:,1]

# (b) Random Forest
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:,1]

# (c) XGBoost Classifier
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=1, random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
y_proba_xgb = xgb.predict_proba(X_test)[:,1]

Parameters: { "use_label_encoder" } are not used.



In [113]:
# Print evaluation metrics
print("Logistic Regression - Accuracy: {:.3f}, F1: {:.3f}, AUC: {:.3f}".format(
    accuracy_score(y_test, y_pred_lr), 
    f1_score(y_test, y_pred_lr), 
    roc_auc_score(y_test, y_proba_lr)
))
print("Random Forest       - Accuracy: {:.3f}, F1: {:.3f}, AUC: {:.3f}".format(
    accuracy_score(y_test, y_pred_rf), 
    f1_score(y_test, y_pred_rf), 
    roc_auc_score(y_test, y_proba_rf)
))
print("XGBoost             - Accuracy: {:.3f}, F1: {:.3f}, AUC: {:.3f}".format(
    accuracy_score(y_test, y_pred_xgb), 
    f1_score(y_test, y_pred_xgb), 
    roc_auc_score(y_test, y_proba_xgb)
))


Logistic Regression - Accuracy: 0.992, F1: 0.992, AUC: 0.999
Random Forest       - Accuracy: 0.999, F1: 0.999, AUC: 1.000
XGBoost             - Accuracy: 0.999, F1: 0.999, AUC: 1.000


In [114]:
# Choose the best-performing model (here Random Forest) for predictions
best_model = rf

# 6. Apply the best model to the full batch dataset and store predictions
# We re-fit the model on all available data for final use (optional, but done for completeness)
best_model.fit(X, y)
df_batches_orig = df[df['Previous_pdn'] == 'TOTAL'].copy().reset_index(drop=True)
df_batches_orig = df_batches_orig.drop(columns=['ID','Previous_pdn','Rej_and_cut_Piece'])

In [115]:
# Predict rejection probabilities on the full set of batches
proba_all = best_model.predict_proba(X)[:,1]
pred_all = best_model.predict(X)
df_batches_orig['pred_proba'] = proba_all
df_batches_orig['pred_rejected'] = pred_all

In [116]:
df_batches_orig.head(50)


Unnamed: 0,Month,Construction,Req_Finish_Fabrics,Fabric_Allowance,Rec_Beam_length(yds),Shrink_allow,act_shrink%,Req_grey_fabric,Req_beam_length(yds),Total_pdn_m/c,Total_pdn_per_order,warp_count,weft_count,epi,ppi,Rejected,pred_proba,pred_rejected
0,January,40+40/2/40/110x80,31300,6.0,38286.936,12.5,27.792603,33297.87234,34797.6511,27646.0,27982,double,80,110,80,1,1.0,1
1,January,40x40/110x90,10450,7.0,13057.584,14.5,15.61226,11236.55914,12017.35043,11019.0,10659,40,40,110,90,0,0.0,0
2,January,40x40/110x80,900,11.5,1902.864,12.5,19.699989,1016.949153,1062.753843,1528.0,1389,40,40,110,80,0,0.0,0
3,January,40x40/130x80,8000,8.0,10093.928,14.0,11.392275,8695.652174,9245.815159,8944.0,8795,40,40,130,80,0,0.0,0
4,January,50x50/140x70,3500,8.1,69146.1408,9.1,14.142714,3808.487486,3831.158751,59367.0,61001,50,50,140,70,0,0.0,0
5,January,40x40/130x80,1300,11.5,1651.336,14.0,15.583503,1468.926554,1561.863691,1394.0,1326,40,40,130,80,0,0.0,0
6,January,40x40/120x70,1240,13.5,13320.048,11.5,15.946249,1433.526012,1481.166243,11196.0,10997,40,40,120,70,0,0.0,0
7,January,40x40/130x90,264,20.0,2876.168,14.5,20.380173,330.0,352.930607,2290.0,2290,40,40,130,90,1,1.0,1
8,January,40x40/130x80,356,19.0,2477.004,14.5,18.005784,439.506173,470.046002,2031.0,2052,40,40,130,80,0,0.0,0
9,January,40x40/110x90,62800,7.0,291351.444,14.5,17.635555,67526.88172,72219.10116,239970.0,249732,40,40,110,90,0,0.0,0


In [117]:
df_batches_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22021 entries, 0 to 22020
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Month                 22021 non-null  object 
 1   Construction          22021 non-null  object 
 2   Req_Finish_Fabrics    22021 non-null  int64  
 3   Fabric_Allowance      22021 non-null  float64
 4   Rec_Beam_length(yds)  22021 non-null  float64
 5   Shrink_allow          22021 non-null  float64
 6   act_shrink%           22021 non-null  float64
 7   Req_grey_fabric       22021 non-null  float64
 8   Req_beam_length(yds)  22021 non-null  float64
 9   Total_pdn_m/c         22021 non-null  float64
 10  Total_pdn_per_order   22021 non-null  int64  
 11  warp_count            22021 non-null  object 
 12  weft_count            22021 non-null  int64  
 13  epi                   22021 non-null  int64  
 14  ppi                   22021 non-null  int64  
 15  Rejected           