In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
import xgboost as xgb
from sklearn.cluster import AgglomerativeClustering
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
def read_csv(path="../../ts-feature-engineering/data/output/features.csv"):
    """Load the engineered-features CSV and derive binary match labels.

    The text ``label`` column is lower-cased and stripped, then the phrases
    "easy match" / "hard match" are rewritten to "1" and "easy non-match" /
    "hard non-match" to "0". The replacements are substring based, so any
    text surrounding those phrases is left in place and will fail the integer
    conversion below.

    Args:
        path: Location of the CSV to read. Defaults to the feature-engineering
            output file this notebook was originally hard-wired to.

    Returns:
        The DataFrame read from ``path`` with two added columns:
        ``normalized_label`` (the rewritten label text) and ``label_numeric``
        (the same values converted to int).

    Raises:
        ValueError: if any normalized label cannot be converted to an integer
            (for example an unknown label text or a missing value). The
            message lists the offending values.
    """
    df = pd.read_csv(path)

    normalized = (
        df['label']
        .str.lower()
        .str.strip()
        .str.replace(r"(easy match|hard match)", "1", regex=True)
        .str.replace(r"(easy non-match|hard non-match)", "0", regex=True)
    )
    df['normalized_label'] = normalized

    try:
        df.loc[:, "label_numeric"] = normalized.astype(int)
    except ValueError as exc:
        # The raw conversion error does not say which label was the problem;
        # report every value that is not one of the two expected outcomes.
        unexpected = sorted(map(repr, normalized[~normalized.isin(["0", "1"])].unique()))
        raise ValueError(
            "Could not convert normalized labels to integers; "
            f"unrecognised value(s): {', '.join(unexpected)}"
        ) from exc
    return df

def prepare_data_for_model(df):
    """Split a labelled feature frame into model inputs and targets.

    Args:
        df: DataFrame containing the six similarity / length-difference
            feature columns selected below plus a ``label_numeric`` column.

    Returns:
        A ``(features, labels)`` tuple: a DataFrame holding only the six
        feature columns (in the order listed) and the ``label_numeric`` Series.
    """
    feature_columns = [
        "first_name_similarity",
        'last_name_similarity',
        "first_name_length_diff",
        "last_name_length_diff",
        "role_similarity",
        "context_similarity",
    ]
    return df[feature_columns], df['label_numeric']

def train_xgboost_model(X_train, y_train):
    """Tune an XGBoost classifier with an exhaustive grid search.

    The grid below has 3 * 3 * 3 * 2 * 2 = 108 parameter combinations, each
    scored by accuracy with 5-fold cross-validation (540 fits in total).

    Args:
        X_train: Training feature matrix passed to ``GridSearchCV.fit``.
        y_train: Training labels passed to ``GridSearchCV.fit``.

    Returns:
        The search's ``best_estimator_``. The winning parameter combination
        is printed before returning.
    """
    search_space = dict(
        n_estimators=[100, 200, 300],
        max_depth=[3, 6, 9],
        learning_rate=[0.01, 0.1, 0.2],
        subsample=[0.8, 1],
        colsample_bytree=[0.8, 1],
    )
    base_classifier = xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
    )
    search = GridSearchCV(
        estimator=base_classifier,
        param_grid=search_space,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=2,
    )
    search.fit(X_train, y_train)

    print("Best Parameters:", search.best_params_)

    return search.best_estimator_

### random forest
def train_rf_model(X_train, y_train):
    """Tune a random-forest classifier with an exhaustive grid search.

    The grid below has 3 * 4 * 3 * 3 = 108 parameter combinations, each
    scored by accuracy with 5-fold cross-validation (540 fits in total).

    Args:
        X_train: Training feature matrix passed to ``GridSearchCV.fit``.
        y_train: Training labels passed to ``GridSearchCV.fit``.

    Returns:
        The search's ``best_estimator_``. The winning parameter combination
        is printed before returning.
    """
    search_space = dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20, 30],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    )
    search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=search_space,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=2,
    )
    search.fit(X_train, y_train)

    print("Best Parameters:", search.best_params_)

    return search.best_estimator_

## logistic regression
def train_lr_model(X_train, y_train):
    """Tune a logistic-regression classifier with an exhaustive grid search.

    The grid covers regularisation strength ``C``, the penalty norm and the
    solver: 5 * 2 * 2 = 20 combinations, each scored by accuracy with 5-fold
    cross-validation (100 fits in total).

    Args:
        X_train: Training feature matrix passed to ``GridSearchCV.fit``.
        y_train: Training labels passed to ``GridSearchCV.fit``.

    Returns:
        The search's ``best_estimator_``. The winning parameter combination
        is printed before returning.
    """
    search_space = dict(
        C=[0.01, 0.1, 1, 10, 100],       # regularisation strength
        penalty=['l1', 'l2'],            # norm used for the penalty term
        solver=['liblinear', 'saga'],    # optimisation algorithm
    )
    # max_iter is raised to 1000 to give the solvers room to converge.
    base_classifier = LogisticRegression(random_state=42, max_iter=1000)
    search = GridSearchCV(
        estimator=base_classifier,
        param_grid=search_space,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=2,
    )
    search.fit(X_train, y_train)

    print("Best Parameters:", search.best_params_)

    return search.best_estimator_

def evaluate_model(model, X_test, y_test):
    """Print test-set accuracy and a classification report for ``model``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels the predictions are compared against.

    Returns:
        None. Both metrics are written to stdout.
    """
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy on Test Set:", accuracy)

    report = classification_report(y_test, predictions)
    print("Classification Report:\n", report)


# Main workflow: load the labelled features, hold out 20% of the rows for
# testing, tune and evaluate a logistic-regression model, then persist it.
df = read_csv()
X, y = prepare_data_for_model(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

trained_model = train_lr_model(X_train, y_train)
evaluate_model(trained_model, X_test, y_test)

# Create the output directory first so a fresh checkout does not lose the
# fitted model to a FileNotFoundError after the whole grid search has run.
model_path = Path('../data/output/trained_lr_model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f:
    pickle.dump(trained_model, f)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END ...............C=0.01, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ...............C=0.01, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ...............C=0.01, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ...............C=0.01, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ....................C=0.01, penalty=l2, solver=saga; total time=   0.0s
[CV] END ....................C=0.01, penalty=l2, solver=saga; total time=   0.0s
[CV] END ....................C=0.01, penalty=l2, solver=saga; total time=   0.0s
[CV] END ....................C=0.01, penalty=l1



In [3]:
# import pandas as pd
# import tensorflow as tf
# from tensorflow.keras.layers import Input, Dense, concatenate
# from tensorflow.keras.models import Model
# from sklearn.model_selection import train_test_split

# def read_csv():
#     df = pd.read_csv("../../ts-feature-engineering/data/output/features.csv")

#     df['normalized_label'] = (df['label']
#                               .str.lower()
#                               .str.strip()
#                               .str.replace(r"(easy match|hard match)", "1", regex=True)
#                               .str.replace(r"(easy non-match|hard non-match)", "0", regex=True)
#                              )

#     df.loc[:, "label_numeric"] = df["normalized_label"].astype(int)
#     return df 

# def prepare_data_for_model(df):
#     features = df[["first_name_length_diff", "last_name_length_diff", "role_similarity", "context_similarity"]]
#     additional_features = df[["first_name_similarity", "last_name_similarity"]]
#     labels = df['label_numeric']
#     return features, additional_features, labels

# def build_model(input_shape, additional_input_shape):
#     # Main feature input
#     feature_inputs = Input(shape=(input_shape,), name='features')
#     # Additional inputs for first_name_similarity and last_name_similarity
#     first_name_input = Input(shape=(1,), name='first_name_similarity')
#     last_name_input = Input(shape=(1,), name='last_name_similarity')

#     # Combine all inputs
#     combined_inputs = concatenate([feature_inputs, first_name_input, last_name_input])

#     # Follow your model architecture
#     x = Dense(64, activation='relu')(combined_inputs)
#     x = Dense(64, activation='relu')(x)
#     outputs = Dense(1, activation='sigmoid')(x)

#     model = Model(inputs=[feature_inputs, first_name_input, last_name_input], outputs=outputs)
#     model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#     return model

# def scale_additional_features(additional_features, first_name_weight=1.0, last_name_weight=2.0):
#     # Apply custom weights
#     additional_features['first_name_similarity'] *= first_name_weight
#     additional_features['last_name_similarity'] *= last_name_weight
#     return additional_features

# # Main workflow
# df = read_csv()

# features, additional_features, labels = prepare_data_for_model(df)

# additional_features_scaled = scale_additional_features(additional_features.copy())

# # Splitting the data
# X_train, X_test, additional_train, additional_test, y_train, y_test = train_test_split(
#     features, additional_features_scaled, labels, test_size=0.2, random_state=42)

# # Combine the features and additional features for training and testing
# X_train_combined = [X_train, additional_train.iloc[:, 0], additional_train.iloc[:, 1]]
# X_test_combined = [X_test, additional_test.iloc[:, 0], additional_test.iloc[:, 1]]

# accuracy_scores = []
# for i in range(10):  # Run the model 10 times
#     model = build_model(input_shape=features.shape[1], additional_input_shape=1)
#     model.fit(X_train_combined, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=0)  # Set verbose to 0 to reduce output
#     test_loss, test_acc = model.evaluate(X_test_combined, y_test, verbose=0)  # Set verbose to 0 to reduce output
#     accuracy_scores.append(test_acc)
#     print(f"Run {i+1}, Test Accuracy: {test_acc}")

# average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
# print(f"Average Test Accuracy over 10 runs: {average_accuracy}")

# # Save the model
# model.save('../data/output/trained_tensor_model')