# Loading the Important Libaries 

In [20]:
import pandas as pd
import numpy as np 
import warnings
warnings.filterwarnings('ignore')

# For Model Building
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier

# Loading Datasets

In [21]:
# Loading the data into the Environment# Loading Data 

train_data = pd.read_csv("train_data.csv")

test_data = pd.read_csv("test_data.csv")

assembly_line_info = pd.read_csv("assembly_line_info.csv")

issue_info = pd.read_csv("issue_info.csv")

log_report_type_data = pd.read_csv("log_report_type_data.csv")

car_variant_data = pd.read_csv("car_variant_data.csv")

## Merging Datasets

In [22]:
# Merging different dataset and creating the final train dataset.

train_data= train_data.drop_duplicates(subset=['id'])
assembly_line_info= assembly_line_info.drop_duplicates(subset=['id'])

# Merging train data and assembly data
df1 = pd.merge(train_data,assembly_line_info, how = 'left',on = 'id') 
issue_info= issue_info.drop_duplicates(subset=['id'])

# Merging issue info
df2 = pd.merge(df1,issue_info, how='left', on = 'id')
log_report_type_data= log_report_type_data.drop_duplicates(subset=['id'])

# Merging log report type
df3 = pd.merge(df2,log_report_type_data, how='left',on = 'id')
car_variant_data= car_variant_data.drop_duplicates(subset=['id'])

# Merging car variant 
data = pd.merge(df3, car_variant_data, how='left',on = 'id')

## Keeping only the numeric part of the categorical attributes.

In [23]:
def df_numeric(df):
    for c in df.columns:
        if df[c].dtype == 'object':
            df[c] = df[c].str.extract('([0-9]+)',expand=False)
    return df

df_numeric(data)

data.head()

Unnamed: 0,id,factory_number,downtime_duration,assembly_line_type,issue_type,log_report_type,volume,car_variant
0,13366,415,1,2,4,312,1,35
1,6783,474,0,2,2,312,2,35
2,9519,931,1,8,2,203,3,15
3,10202,700,1,8,1,54,1,11
4,4555,600,2,8,2,82,30,15


In [24]:
# Splitting my target and independent features 

X = data.drop(['downtime_duration','id'],axis = 1)
y = data['downtime_duration']

## Encoding Categorical Features with Target Encoding

In [25]:
import category_encoders as TR
New_X = TR.TargetEncoder().fit_transform(X,y)

In [26]:
New_X.head()

Unnamed: 0,factory_number,assembly_line_type,issue_type,log_report_type,volume,car_variant
0,0.793708,0.215511,0.127451,0.158621,1,0.136528
1,2e-05,0.215511,0.328696,0.158621,2,0.136528
2,0.482759,0.65059,0.328696,1.269481,3,0.868077
3,0.793708,0.65059,0.616296,0.6,1,0.526923
4,1.090909,0.65059,0.328696,0.880808,30,0.868077


## Standardizing 'volume' feature

In [27]:
New_X = New_X.values 
std = StandardScaler()
New_X = std.fit_transform(New_X)
New_X = pd.DataFrame(New_X)

## Train/Test Split

In [28]:
X_train,X_test,y_train,y_test = train_test_split(New_X,y,test_size = 0.3,random_state = 123) 

In [29]:
y_train = y_train.astype('category')
y_test = y_test.astype('category')

## Model Building

In [30]:
# Decision Tree Classifier

DecisionTree = DecisionTreeClassifier()

# SVM

SVM = SVC()

# Random Forest Classifier

RandomForest = RandomForestClassifier()

# Grdient Boosting

GradientBoosting = GradientBoostingClassifier()

# Ada Boosting 

AdaBoosting = AdaBoostClassifier()

#XGBoost 

XGB_Classifier = XGBClassifier()



models = [DecisionTree,SVM,RandomForest,GradientBoosting,AdaBoosting,XGB_Classifier]

lf = pd.DataFrame(columns=["MODEL","Train-F1SCORE", "Test-F1SCORE"])
for i in range(len(models)):
    

    clf = models[i]
    clf = clf.fit(X_train, y_train)
    y_train_predicted = clf.predict(X_train)
    y_test_predicted = clf.predict(X_test)
    f1_train =f1_score(y_train, y_train_predicted,average = 'macro')
    f1_test =f1_score(y_test, y_test_predicted,average = 'macro')
       

    list1 = [clf.__class__.__name__,f1_train,f1_test]

    lf.loc[i] = list1



In [31]:
lf

Unnamed: 0,MODEL,Train-F1SCORE,Test-F1SCORE
0,DecisionTreeClassifier,0.955897,0.577753
1,SVC,0.625346,0.652082
2,RandomForestClassifier,0.95669,0.635143
3,GradientBoostingClassifier,0.694095,0.665945
4,AdaBoostClassifier,0.612052,0.611665
5,XGBClassifier,0.876292,0.640814


# Random Forest Grid Search 

In [32]:
gkf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42).split(X=X_train, y=y_train)

param_grid = {"n_estimators" : [50, 100,200,500],
              "max_depth" : [1,5,6,7],
              "max_features" : [3, 5],
              "min_samples_leaf" : [1, 2, 4]}

RFmodel = RandomForestClassifier()

rf_grid = GridSearchCV(RFmodel, param_grid, cv=gkf,verbose = 0, n_jobs = -1)

In [33]:
rf_grid.fit(X_train, y_train)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x000001DEE97E5CF0>,
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                             

In [34]:
train_pred = rf_grid.predict(X_train)
test_pred = rf_grid.predict(X_test)

f1_train =f1_score(y_train, train_pred,average = 'macro')
f1_test =f1_score(y_test, test_pred,average = 'macro')

print(f1_train)
print(f1_test)

0.6999877086385822
0.6586018174006973


# XG Boost Grid Search 

In [35]:
gkf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42).split(X=X_train, y=y_train)


param_grid = {
        'learning_rate': [0.01, 0.1,0.001],
        'max_depth': [3, 5, 7, 10],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.5, 0.7],
        'colsample_bytree': [0.5, 0.7],
        'n_estimators' : [100, 200, 500],
    }

In [36]:
XG = XGBClassifier(objective= 'multi:softmax')

xg_grid = GridSearchCV(XG,param_grid, cv=gkf,verbose = 0,n_jobs = -1)

In [37]:
xg_grid.fit(X_train, y_train)



GridSearchCV(cv=<generator object _BaseKFold.split at 0x000001DEE9D13EB0>,
             error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weigh...
                                     use_label_encoder=True,
                                     validate_parameters=None, verbosity=None),
             iid='deprecated', n_jobs=-1,
             param_grid={'colsample_bytree': [0.5, 0.7],
                         'learning_rate': [0.01, 0.1, 0.001],
                         'max_depth': [3,

In [38]:
train_pred = xg_grid.predict(X_train)
test_pred = xg_grid.predict(X_test)


f1_train =f1_score(y_train, train_pred,average = 'macro')
f1_test =f1_score(y_test, test_pred,average = 'macro')

print(f1_train)
print(f1_test)

0.7104316814721017
0.6503132655233907


# Gradient Boosting Grid Search 

In [39]:
gkf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42).split(X=X_train, y=y_train)


param_grid = {
        'learning_rate': [ 0.1],
        'max_depth': [3, 5, 6,7,8, 10],
        'subsample': [0.5, 0.7],
        'n_estimators' : [100, 200, 500,1000],
    }

In [40]:
GB = GradientBoostingClassifier()

gb_grid = GridSearchCV(GB,param_grid, cv=2,verbose = 0,n_jobs = -1)

In [41]:
gb_grid.fit(X_train, y_train)

GridSearchCV(cv=2, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_c...=None,
           

In [42]:
train_pred = gb_grid.predict(X_train)
test_pred = gb_grid.predict(X_test)


f1_train =f1_score(y_train, train_pred,average = 'macro')
f1_test =f1_score(y_test, test_pred,average = 'macro')

print(f1_train)
print(f1_test)

0.6993212328134302
0.6632401647301333
