In [2]:
import re
import warnings
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from fuzzywuzzy import fuzz
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score, log_loss
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from xgboost import XGBClassifier



# Model Training

Given the large size of the dataset along with computational and time constraints, we will follow a phased approach for model training:

1. **Initial Training with 100,000 Samples:**
   - **Objective:** Identify the best performing models and narrow down the candidate models for further tuning.
   - **Models Trained:**
     - **Linear Model:** Provides a simple baseline to compare more complex models against.
     - **Random Forest:** Robust to overfitting and effective with mixed types of features.
     - **Gradient Boosting:** Efficient at handling large datasets and provides strong predictive performance.
     - **XGBoost:** Known for its speed and performance, particularly in large datasets.
     - **Neural Network with TensorFlow:** Capable of capturing complex patterns in the data.
     - **Support Vector Machine (SVM):** Effective in high-dimensional spaces and for binary classification tasks.
     
     

2. **Intermediate Training with 200,000 Samples:**
   - **Objective:** Conduct hyperparameter tuning on the best performing models from the initial phase.
   - **Models Trained:**
     - **XGBoost:**
     - **Optimized Simple Neural Network:**
     - **Random Forest:** (Note: Training terminated as it exceeds 1.5 hour.)
     
     

3. **Final Training with Complete Data (404,290 Samples):**
   - **Objective:** Train the best model with optimized hyperparameters on the full dataset to achieve the highest possible accuracy.
   - **Models Trained:**
     - **XGBoost (optimized):**
     - **Neural Network (hypertuned with KerasTuner):**


In [3]:
df_train = pd.read_csv(r"train_cleaned.csv")


In [8]:
df_train = df.drop(columns=["first_word_match", "id","common_stopwords","stopword_ratio","stopword_ratio","len_diff","common_words","total_words","common_chars","char_overlap"])

In [9]:
#there seemes to be a issue when savings as csv
df_train['combined_vec'] = df_train['combined_vec'].apply(lambda x: np.fromstring(x.strip('[]'), sep=' '))

  df_train['combined_vec'] = df_train['combined_vec'].apply(lambda x: np.fromstring(x.strip('[]'), sep=' '))


# Tranning the model

In [54]:
#df_train1 = df_train.head(10000)
df_train1 = df_train

In [55]:
# Preparing the data
X_combined_vec = np.vstack(df_train1['combined_vec'].values)
X_features = df_train1[["last_word_match", "fuzz_ratio"]].values
X = np.hstack([X_combined_vec, X_features])
y = df_train1['is_duplicate'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


###  Function to evaluate and display model performance


In [56]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    loss = log_loss(y_test, y_pred_proba)
    report = classification_report(y_test, y_pred)
    
    print(f'{model.__class__.__name__} Accuracy: {accuracy:.2f}')
    print(f'{model.__class__.__name__} F1 Score: {f1:.2f}')
    print(f'{model.__class__.__name__} Log Loss: {loss:.2f}')
    print(f'{model.__class__.__name__} Classification Report:')
    print(report)
    print('-' * 60)

### Baseline Linear Model 

In [26]:
logistic_regression = LogisticRegression(max_iter=5000, random_state=42)
evaluate_model(logistic_regression, X_train, y_train, X_test, y_test)

LogisticRegression Accuracy: 0.73
LogisticRegression F1 Score: 0.73
LogisticRegression Log Loss: 0.52
LogisticRegression Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.83      0.79     12456
           1       0.67      0.57      0.62      7544

    accuracy                           0.73     20000
   macro avg       0.72      0.70      0.71     20000
weighted avg       0.73      0.73      0.73     20000

------------------------------------------------------------


### Random Forest :

In [18]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
evaluate_model(rf_classifier, X_train, y_train, X_test, y_test)

RandomForestClassifier Accuracy: 0.77
RandomForestClassifier F1 Score: 0.76
RandomForestClassifier Log Loss: 0.47
RandomForestClassifier Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.90      0.83     12456
           1       0.77      0.56      0.65      7544

    accuracy                           0.77     20000
   macro avg       0.77      0.73      0.74     20000
weighted avg       0.77      0.77      0.76     20000

------------------------------------------------------------


In [28]:
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
evaluate_model(gb_classifier, X_train, y_train, X_test, y_test)

GradientBoostingClassifier Accuracy: 0.74
GradientBoostingClassifier F1 Score: 0.74
GradientBoostingClassifier Log Loss: 0.51
GradientBoostingClassifier Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.82      0.80     12456
           1       0.67      0.61      0.64      7544

    accuracy                           0.74     20000
   macro avg       0.72      0.71      0.72     20000
weighted avg       0.74      0.74      0.74     20000

------------------------------------------------------------


In [17]:
xgb_classifier = XGBClassifier(n_estimators=100, random_state=42, tree_method='gpu_hist')
evaluate_model(xgb_classifier, X_train, y_train, X_test, y_test)


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



XGBClassifier Accuracy: 0.77
XGBClassifier F1 Score: 0.77
XGBClassifier Log Loss: 0.46
XGBClassifier Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.83      0.82     12456
           1       0.71      0.67      0.69      7544

    accuracy                           0.77     20000
   macro avg       0.76      0.75      0.75     20000
weighted avg       0.77      0.77      0.77     20000

------------------------------------------------------------


In [None]:
svm_classifier = SVC(kernel='rbf', probability=True, random_state=42)
evaluate_model(svm_classifier, X_train, y_train, X_test, y_test)

### Simple Nuaral Network with tensorflow 

In [24]:
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m 106/8086[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m11s[0m 1ms/step - accuracy: 0.5851 - loss: 0.6361 

I0000 00:00:1719075275.598689     318 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m8086/8086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - accuracy: 0.6633 - loss: 0.5591 - val_accuracy: 0.6723 - val_loss: 0.5523
Epoch 2/10
[1m8086/8086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - accuracy: 0.6675 - loss: 0.5537 - val_accuracy: 0.6723 - val_loss: 0.5521
Epoch 3/10
[1m8086/8086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.6682 - loss: 0.5536 - val_accuracy: 0.6724 - val_loss: 0.5514
Epoch 4/10
[1m8086/8086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.6692 - loss: 0.5522 - val_accuracy: 0.6724 - val_loss: 0.5522
Epoch 5/10
[1m8086/8086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.6674 - loss: 0.5533 - val_accuracy: 0.6724 - val_loss: 0.5525
Epoch 6/10
[1m8086/8086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.6677 - loss: 0.5534 - val_accuracy: 0.6724 - val_loss: 0.5515
Epoch 7/10
[1m8086/8

<keras.src.callbacks.history.History at 0x7f24918e31f0>

In [37]:
loss, accuracy = model.evaluate(X_test, y_test)

y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

f1 = f1_score(y_test, y_pred)
logloss = log_loss(y_test, y_pred_prob)

print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Log Loss: {logloss:.2f}")

[1m2527/2527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.7764 - loss: 0.4508
[1m2527/2527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
Accuracy: 0.78
F1 Score: 0.70
Log Loss: 0.45


We will Start Hypertunning and train XGBClassifier, Random Forest and simple nural network

In [41]:
xgb_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

xgb_classifier = XGBClassifier(tree_method='gpu_hist', random_state=42, use_label_encoder=False)

xgb_cv = GridSearchCV(xgb_classifier, xgb_params, cv=5, scoring='accuracy', n_jobs=-1)
xgb_cv.fit(X_train, y_train)

best_xgb = xgb_cv.best_estimator_

y_pred_xgb = best_xgb.predict(X_test)
y_pred_proba_xgb = best_xgb.predict_proba(X_test)

xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)
xgb_loss = log_loss(y_test, y_pred_proba_xgb)

# Display results
print("XGBoost Classifier with Hyperparameter Tuning")
print(f"Best Hyperparameters: {xgb_cv.best_params_}")
print(f"Accuracy: {xgb_accuracy:.2f}")
print(f"F1 Score: {xgb_f1:.2f}")
print(f"Log Loss: {xgb_loss:.2f}")
print('Classification Report:')
print(classification_report(y_test, y_pred_xgb))


  pid = os.fork()
  pid = os.fork()

    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the devic

XGBoost Classifier with Hyperparameter Tuning
Best Hyperparameters: {'colsample_bytree': 0.7, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}
Accuracy: 0.77
F1 Score: 0.69
Log Loss: 0.46
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.84      0.82     12456
           1       0.71      0.66      0.69      7544

    accuracy                           0.77     20000
   macro avg       0.76      0.75      0.75     20000
weighted avg       0.77      0.77      0.77     20000



In [42]:
df_train1 = df_train

Performed the testing on the complete dataset

The Randomn forster model gave not result even after a hour hence was disused, where the neural network keeps giving worse result with the complete and more advance code.

In [41]:

# Define a simple neural network
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.0001)

history = model.fit(
    X_train, y_train,
    epochs=1000,  
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr]
)

loss, accuracy = model.evaluate(X_test, y_test)
y_pred = (model.predict(X_test) > 0.5).astype("int32")
f1 = f1_score(y_test, y_pred)

print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')
print(f'Test F1 Score: {f1}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m4043/4043[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 3ms/step - accuracy: 0.6673 - loss: 0.6188 - val_accuracy: 0.7323 - val_loss: 0.5078 - learning_rate: 0.0010
Epoch 2/100
[1m4043/4043[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.7237 - loss: 0.5243 - val_accuracy: 0.7426 - val_loss: 0.5007 - learning_rate: 0.0010
Epoch 3/100
[1m4043/4043[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.7271 - loss: 0.5161 - val_accuracy: 0.7513 - val_loss: 0.4977 - learning_rate: 0.0010
Epoch 4/100
[1m4043/4043[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.7312 - loss: 0.5118 - val_accuracy: 0.7511 - val_loss: 0.5075 - learning_rate: 0.0010
Epoch 5/100
[1m4043/4043[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.7336 - loss: 0.5072 - val_accuracy: 0.7511 - val_loss: 0.5019 - learning_rate: 0.0010
Epoch 6/100
[1m4043/4043[0m [32m━━━━━━━━━━━━━━━━━━━


KeyboardInterrupt



In [45]:
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.0001)

history = model.fit(
    X_train, y_train,
    epochs=150,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr]
)


Epoch 1/150
[1m4043/4043[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 3ms/step - accuracy: 0.6797 - loss: 0.5889 - val_accuracy: 0.7491 - val_loss: 0.4921 - learning_rate: 0.0010
Epoch 2/150
[1m4043/4043[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.7374 - loss: 0.5083 - val_accuracy: 0.7547 - val_loss: 0.4851 - learning_rate: 0.0010
Epoch 3/150
[1m4043/4043[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.7454 - loss: 0.4985 - val_accuracy: 0.7566 - val_loss: 0.4785 - learning_rate: 0.0010
Epoch 4/150
[1m4043/4043[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.7496 - loss: 0.4925 - val_accuracy: 0.7609 - val_loss: 0.4776 - learning_rate: 0.0010
Epoch 5/150
[1m4043/4043[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.7529 - loss: 0.4872 - val_accuracy: 0.7657 - val_loss: 0.4750 - learning_rate: 0.0010
Epoch 6/150
[1m4043/4043[0m [32m━━━━━━━━━━━━━━━━━━━

In [47]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')


[1m2527/2527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7804 - loss: 0.4509
Test Loss: 0.45165759325027466
Test Accuracy: 0.7794528603553772


In [48]:
from kerastuner import HyperModel
from kerastuner.tuners import RandomSearch

class MyHyperModel(HyperModel):
    def build(self, hp):
        model = Sequential()
        model.add(Dense(
            units=hp.Int('units1', min_value=32, max_value=256, step=32),
            activation='relu',
            input_shape=(X_train.shape[1],)
        ))
        model.add(Dropout(hp.Float('dropout1', 0.1, 0.5, step=0.1)))
        
        model.add(Dense(
            units=hp.Int('units2', min_value=32, max_value=256, step=32),
            activation='relu'
        ))
        model.add(Dropout(hp.Float('dropout2', 0.1, 0.5, step=0.1)))

        model.add(Dense(
            units=hp.Int('units3', min_value=32, max_value=256, step=32),
            activation='relu'
        ))
        model.add(Dropout(hp.Float('dropout3', 0.1, 0.5, step=0.1)))

        model.add(Dense(1, activation='sigmoid'))

        model.compile(
            optimizer=tf.keras.optimizers.Adam(
                hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
            ),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )
        return model

hypermodel = MyHyperModel()


  from kerastuner import HyperModel


In [49]:
tuner = RandomSearch(
    hypermodel,
    objective='val_accuracy',
    max_trials=10,  
    executions_per_trial=2,  
    directory='my_dir',
    project_name='hyperparameter_tuning'
)

tuner.search_space_summary()

tuner.search(X_train, y_train,
             epochs=50,
             batch_size=64,
             validation_split=0.2,
             callbacks=[early_stopping, reduce_lr])

best_model = tuner.get_best_models(num_models=1)[0]

loss, accuracy = best_model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

best_hyperparameters = tuner.get_best_hyperparameters(1)[0]
print(best_hyperparameters.values)


Trial 10 Complete [00h 01m 49s]
val_accuracy: 0.7585217952728271

Best val_accuracy So Far: 0.77070352435112
Total elapsed time: 00h 18m 00s


  saveable.load_own_variables(weights_store.get(inner_path))


[1m  96/2527[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3s[0m 2ms/step - accuracy: 0.7778 - loss: 0.4468 

W0000 00:00:1719078710.274604     317 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m2527/2527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.7699 - loss: 0.4578
Test Loss: 0.45768284797668457
Test Accuracy: 0.7697197794914246
{'units1': 224, 'dropout1': 0.1, 'units2': 96, 'dropout2': 0.1, 'units3': 64, 'dropout3': 0.30000000000000004, 'learning_rate': 0.001}


# This is the Final model we have chosen to use to perform on test.csv

In [44]:
best_params = {
    'colsample_bytree': 0.7,
    'learning_rate': 0.2,
    'max_depth': 5,
    'n_estimators': 200,
    'subsample': 1.0
}

xgb_classifier = XGBClassifier(
    colsample_bytree=best_params['colsample_bytree'],
    learning_rate=best_params['learning_rate'],
    max_depth=best_params['max_depth'],
    n_estimators=best_params['n_estimators'],
    subsample=best_params['subsample'],
    tree_method='gpu_hist',
    random_state=42,
    use_label_encoder=False
)

xgb_classifier.fit(X_train, y_train)

y_pred_xgb = xgb_classifier.predict(X_test)
y_pred_proba_xgb = xgb_classifier.predict_proba(X_test)

# Evaluate the model
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)
xgb_loss = log_loss(y_test, y_pred_proba_xgb)


model_filename = 'best.pkl'
joblib.dump(xgb_classifier, model_filename)



evaluate_model(xgb_classifier, X_train, y_train, X_test, y_test)



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



XGBClassifier Accuracy: 0.79
XGBClassifier F1 Score: 0.78
XGBClassifier Log Loss: 0.44
XGBClassifier Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.85      0.83     50803
           1       0.72      0.68      0.70     30055

    accuracy                           0.79     80858
   macro avg       0.77      0.76      0.77     80858
weighted avg       0.78      0.79      0.78     80858

------------------------------------------------------------
