## Notebook to train the models and store them in the models folder

In [29]:
import os

from Utils import *

In [20]:
# Path to the models directory (TODO: change to the teacher's requirement).
# Python string literals do not expand shell variables, so "$HOME" is resolved
# explicitly here; a path that contains no variables passes through unchanged.
models_path = os.path.expandvars('$HOME/Datasets/QuoraQuestionPairs/models')

In [21]:
# Load the dataset and create the training, validation and test partitions.
# "$HOME" is a shell variable that pandas does not expand, so resolve it first.
quora_df = pd.read_csv(os.path.expandvars("$HOME/Datasets/QuoraQuestionPairs/quora_data.csv"))

# Hold out 5% of the rows as the test partition (seeded).
A_df, test_df = sklearn.model_selection.train_test_split(quora_df, test_size=0.05, random_state=123)
# Seed the validation split as well: without random_state the rows that end up
# in train_df / val_df change on every run, so the models saved below would not
# be reproducible.
train_df, val_df = sklearn.model_selection.train_test_split(A_df, test_size=0.05, random_state=123)
print('train_df.shape=',train_df.shape)

# cast to list taking care of nans:
q1_train =  cast_list_as_strings(list(train_df["question1"]))
q2_train =  cast_list_as_strings(list(train_df["question2"]))

# Corpus used to fit every vectorizer below: training questions only.
all_questions = q1_train + q2_train

print(f'lenght of all questions: {len(all_questions)}')

train_df.shape= (291897, 6)
lenght of all questions: 583794


## 1. Train and save model of Teacher for Baseline

In [54]:
# Teacher's baseline: CountVectorizer with ngram_range=(1,1) fitted on
# all_questions, followed by a liblinear logistic regression.
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,1))
count_vectorizer.fit(all_questions)

# Feature matrix and labels for the training partition.
X_tr_q1q2 = get_features_from_df(train_df, count_vectorizer)
y_train = train_df["is_duplicate"].values

logistic = sklearn.linear_model.LogisticRegression(
    solver="liblinear",
    random_state=123,
)
logistic.fit(X_tr_q1q2, y_train)

# Classifier first, then vectorizer; save_model is only called when
# check_model_saved returns a falsy value for that file name.
for _artifact, _filename in ((logistic, "teacher_baseline.pkl"),
                             (count_vectorizer, "count_vectorizer.pkl")):
    if not check_model_saved(models_path, _filename):
        save_model(_artifact, models_path, _filename)

Model saved to models/teacher_baseline.pkl
Model saved to models/count_vectorizer.pkl


## 2. Train and save model using custom TFIDF vectorizer:

In [36]:
print('fit custom vectorizer on ALL questions...')
custom_vectorizer = TFIDF_Vectorizer()
custom_vectorizer.fit(all_questions)

# Adjacent string literals are concatenated by Python; each piece ends with an
# explicit space so the words no longer run together as they did with the
# backslash line continuations ("vectorizerand").
print('Transforming the question1 and question2 columns from the train data '
      'by applying the fitted vectorizer '
      'and putting them in a sparse matrix...')
X_tr_q1q2_tfidf = get_features_from_df_tfidf(train_df, custom_vectorizer)
y_train = train_df["is_duplicate"].values

# Same classifier settings as the baseline, to see whether the custom TF-IDF
# features improve on the count features.
logistic2 = sklearn.linear_model.LogisticRegression(solver="liblinear",
                                                   random_state=123)
logistic2.fit(X_tr_q1q2_tfidf, y_train)

# save_model is only called when check_model_saved returns a falsy value.
if not check_model_saved(models_path, "custom_tfidf_logreg.pkl"):
    save_model(logistic2, models_path, "custom_tfidf_logreg.pkl")
if not check_model_saved(models_path, "custom_tfidf_vectorizer.pkl"):
    save_model(custom_vectorizer, models_path, "custom_tfidf_vectorizer.pkl")

fit custom vectorizer on ALL questions...
Transforming the question1 and question2 columns from the train data by applying the fitted vectorizerand putting them in a sparce matrix...
Model saved to models/custom_tfidf_logreg.pkl
Model saved to models/custom_tfidf_vectorizer.pkl


## 3. SVM with K-Fold validation, custom vectorizer and grid search

In [None]:
# from sklearn.svm import SVC
# from sklearn.model_selection import GridSearchCV, KFold
# from sklearn.metrics import accuracy_score

# param_grid = {
#     'C': [0.1],
#     'kernel': ['linear']  # 'linear' often works better with text features
# }

# cv = KFold(n_splits=5, shuffle=True, random_state=42)
# svc = SVC()

# grid_search = GridSearchCV(svc, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)

# grid_search.fit(X_tr_q1q2_tfidf, y_train)

# # Best model
# best_svm = grid_search.best_estimator_
# print("Best parameters:", grid_search.best_params_)


In [None]:
#save_model(best_svm, models_path, "custom_tfidf_svc.pkl")

## 4. Train and save a logistic regression on the custom TF-IDF features plus LDA and LSI similarity features

In [None]:
# import sys
# import scipy
# import scipy.linalg

# if not hasattr(scipy.linalg, 'triu'):
#     def triu(m, k=0):
#         m = np.asanyarray(m)
#         mask = np.tri(m.shape[0], m.shape[1], k=k, dtype=bool)
#         return np.where(mask, m, 0)

#     scipy.linalg.triu = triu



# from gensim import corpora
# from gensim.models import LdaModel, LsiModel


# #use gensim to preprocess all questions(not vectorized)
# all_questions_clean = gensim_preprocess_en(all_questions)

# dictionary = corpora.Dictionary(all_questions_clean)
# corpus = [dictionary.doc2bow(text) for text in all_questions_clean]

# #chosing 1000 topics, although not sure...
# lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=1000)
# lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=1000)

# # apply to the train dataset tp create features:
# q1_train_clean = gensim_preprocess_en(q1_train)
# q2_train_clean = gensim_preprocess_en(q2_train)

# lda_sim_train = compute_similarity_features(q1_train_clean, q2_train_clean, lda, dictionary)
# lsi_sim_train = compute_similarity_features(q1_train_clean, q2_train_clean, lsi, dictionary)

# # save them:
# lda.save(models_path + "/lda_model.gensim")
# lsi.save(models_path + "/lsi_model.gensim")
# dictionary.save(models_path + "/lda_lsi_dictionary.dict")

 
# #combine with custom tfidf vectorization feature:
# from scipy.sparse import hstack

# X_train_combined = hstack((X_tr_q1q2_tfidf, lda_sim_train, lsi_sim_train))


In [48]:
# logistic3 = sklearn.linear_model.LogisticRegression(solver="liblinear",
#                                                    random_state=123)
# logistic3.fit(X_train_combined, y_train)

# save_model(logistic3, models_path, 'logreg_ctfidf_lda_lsi.pkl')

## 4. Train and save logistic regression with character n-gram cosine similarity measure feature

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

# Character-level TF-IDF over 1- to 4-character n-grams, fitted a single time
# on all_questions.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 4),
)
vectorizer.fit(all_questions)


In [11]:
# One similarity score per training pair, computed with the fitted character
# n-gram vectorizer and stored as a new train_df column.
train_df['char_ngram_sim'] = [
    char_ngram_similarity(question_a, question_b, vectorizer)
    for question_a, question_b in zip(train_df['question1'], train_df['question2'])
]

In [13]:
from scipy.sparse import hstack

# TF-IDF features of the training pairs, built with the character n-gram vectorizer.
X_tr_q1q2_bi_tfidf = get_features_from_df_tfidf(train_df, vectorizer)

# Similarity scores as a sparse matrix, transposed to make it a column vector.
char_sim_sparse = sparse.csr_matrix(train_df['char_ngram_sim'].values).T

# Append the similarity column to the right of the TF-IDF block.
X_train_combined = hstack((X_tr_q1q2_bi_tfidf, char_sim_sparse))

In [15]:
# Logistic regression (same solver and seed as the baseline) trained on the
# character n-gram TF-IDF features plus the similarity column.
y_train = train_df["is_duplicate"].values
logistic4 = sklearn.linear_model.LogisticRegression(
    solver="liblinear",
    random_state=123,
)

logistic4.fit(X_train_combined, y_train)

# Classifier first, then vectorizer; save_model is only called when
# check_model_saved returns a falsy value for that file name.
for _artifact, _filename in ((logistic4, "tfidf_4_char_gram_logreg.pkl"),
                             (vectorizer, "tfidf_vectorizer.pkl")):
    if not check_model_saved(models_path, _filename):
        save_model(_artifact, models_path, _filename)

Model saved to models/tfidf_8_char_gram_logreg.pkl
Model saved to models/tfidf_vectorizer.pkl


## 5. Train and save logistic regression with 4-char similarity and features for start words present

In [24]:
start_words = ['how', 'can', 'what', 'why', 'are', 'do', 'does', 'is', 'should', 'could']

def starts_with_indicator(text):
    """Return a dict of 0/1 flags, one per entry of ``start_words``.

    The input is converted with ``str()``, stripped and lower-cased, then each
    flag ``starts_with_<word>`` is 1 when the text begins with that word.
    This is a plain prefix test, so several flags can be set at once
    (for example "does ..." sets both ``starts_with_do`` and ``starts_with_does``).
    """
    normalized = str(text).strip().lower()
    flags = {}
    for word in start_words:
        flags[f"starts_with_{word}"] = int(normalized.startswith(word))
    return flags

# Build one 0/1 indicator column per start word for each question column.
# A single DataFrame is created from the list of per-question dicts instead of
# constructing a pd.Series for every row inside .apply(), which is much slower.
start_feats_q1 = pd.DataFrame(
    [starts_with_indicator(question) for question in train_df['question1']],
    index=train_df.index,
).add_prefix('q1_')

start_feats_q2 = pd.DataFrame(
    [starts_with_indicator(question) for question in train_df['question2']],
    index=train_df.index,
).add_prefix('q2_')

# Drop indicator columns left over from a previous execution of this cell, so
# that re-running it does not append duplicate columns to train_df (duplicates
# would later be selected multiple times by the feature-column filter).
_new_start_cols = list(start_feats_q1.columns) + list(start_feats_q2.columns)
train_df = pd.concat(
    [train_df.drop(columns=_new_start_cols, errors='ignore'), start_feats_q1, start_feats_q2],
    axis=1,
)

In [25]:
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler

# Engineered start-word columns: names beginning with 'q1_' or 'q2_' that also
# contain 'starts_with_'.
feature_cols = [
    col
    for col in train_df.columns
    if col.startswith(('q1_', 'q2_')) and 'starts_with_' in col
]
numeric_features = train_df[feature_cols]

# Standardize the indicator columns, then convert to sparse so they can be
# stacked with the sparse TF-IDF block.
scaler = StandardScaler().fit(numeric_features)
numeric_scaled = scaler.transform(numeric_features)
numeric_sparse = sparse.csr_matrix(numeric_scaled)

# Final design matrix: TF-IDF block, similarity column, scaled indicator columns.
X_train_combined = hstack((X_tr_q1q2_bi_tfidf, char_sim_sparse, numeric_sparse))


In [26]:
# Logistic regression (same solver and seed as the baseline) trained on the
# TF-IDF block, the similarity column and the start-word indicators.
y_train = train_df["is_duplicate"].values
logistic5 = sklearn.linear_model.LogisticRegression(
    solver="liblinear",
    random_state=123,
)

logistic5.fit(X_train_combined, y_train)


Model saved to models/tfidf_5_char_gram_word_starts_logreg.pkl


In [27]:
# Save the start-words classifier; save_model is only called when
# check_model_saved returns a falsy value.
_logistic5_file = "tfidf_5_char_gram_word_starts_logreg.pkl"
if not check_model_saved(models_path, _logistic5_file):
    save_model(logistic5, models_path, _logistic5_file)

Model saved to models/tfidf_5_char_gram_word_starts_logreg.pkl


In [28]:
# Save the fitted scaler of the start-word indicators; save_model is only
# called when check_model_saved returns a falsy value.
_scaler_file = "scaler_tfidf_5_char_gram_word_starts_logreg.pkl"
if not check_model_saved(models_path, _scaler_file):
    save_model(scaler, models_path, _scaler_file)

Model saved to models/scaler_tfidf_5_char_gram_word_starts_logreg.pkl


In [31]:
import pickle
# Pickle the list of start-word feature column names into the models directory.
# Unlike the model cells, this write is unconditional and overwrites the file.
_feature_cols_path = models_path + '/feature_cols_train_start_words.pkl'
with open(_feature_cols_path, 'wb') as handle:
    pickle.dump(feature_cols, handle)

## 6. Train and save a model with the Jaccard similarity feature:

In [32]:
# One Jaccard similarity score per training pair, stored as a new train_df column.
train_df['jaccard_sim'] = [
    jaccard_similarity(question_a, question_b)
    for question_a, question_b in zip(train_df['question1'], train_df['question2'])
]
print(train_df[['question1', 'question2', 'jaccard_sim']].head())

                                                question1  \
261867  I want to change my branch from mechanical to ...   
2261         How much water should I drink during eating?   
3614    Why is the value of 1GB 1024MB, why is 1000MB ...   
261305  I'm 17 year old my height is 5 feet 4 inch and...   
4879                     Is Zee news a BJP owned channel?   

                                                question2  jaccard_sim  
261867  Should I opt branch change from mechanical to ...     0.304348  
2261                       How much water should I drink?     0.555556  
3614    Where do we use 1 kB = 1000 bytes, 1 MB = 1000...     0.000000  
261305  I'm 15 year old my height is 5 feet 7 inches a...     0.760000  
4879    Why do I get a feeling that Zee News is pro BJ...     0.222222  


In [33]:
# Character n-gram TF-IDF features of the training pairs.
X_tr_q1q2_bi_tfidf = get_features_from_df_tfidf(train_df, vectorizer)

# Jaccard scores reshaped to a single column and appended to the TF-IDF block.
_jaccard_column = train_df['jaccard_sim'].values.reshape(-1, 1)
X_train_combined = hstack((X_tr_q1q2_bi_tfidf, _jaccard_column))

In [34]:
# Logistic regression (same solver and seed as the baseline) trained on the
# character n-gram TF-IDF features plus the Jaccard similarity column.
y_train = train_df["is_duplicate"].values
logistic6 = sklearn.linear_model.LogisticRegression(
    solver="liblinear",
    random_state=123,
)

logistic6.fit(X_train_combined, y_train)

In [40]:
# Save the TF-IDF + Jaccard classifier; save_model is only called when
# check_model_saved returns a falsy value.
_logistic6_file = "tfidf_jackard_logreg.pkl"
if not check_model_saved(models_path, _logistic6_file):
    save_model(logistic6, models_path, _logistic6_file)

Model saved to models/tfidf_jackard_logreg.pkl


## 7. Custom TFIDF vectorizer and Jaccard similarity:


In [45]:
# Load the custom TF-IDF vectorizer from the models directory and rebuild the
# training features with it.
custom_vectorizer = load_model(models_path, "custom_tfidf_vectorizer.pkl")
X_tr_q1q2_tfidf = get_features_from_df_tfidf(train_df, custom_vectorizer)

# Jaccard scores reshaped to a single column and appended to the TF-IDF block.
_jaccard_column = train_df['jaccard_sim'].values.reshape(-1, 1)
X_train_combined = hstack((X_tr_q1q2_tfidf, _jaccard_column))

Model loaded from models/custom_tfidf_vectorizer.pkl


In [46]:
# Logistic regression (same solver and seed as the baseline) trained on the
# custom TF-IDF features plus the Jaccard similarity column.
y_train = train_df["is_duplicate"].values
logistic7 = sklearn.linear_model.LogisticRegression(
    solver="liblinear",
    random_state=123,
)

logistic7.fit(X_train_combined, y_train)

In [47]:
# Save the custom TF-IDF + Jaccard classifier; save_model is only called when
# check_model_saved returns a falsy value.
_logistic7_file = "custom_tfidf_jackard_logreg.pkl"
if not check_model_saved(models_path, _logistic7_file):
    save_model(logistic7, models_path, _logistic7_file)

Model saved to models/custom_tfidf_jackard_logreg.pkl


## 8. Train and save word2vec + Logistic Regression simple model

In [None]:
# # Extract targets
# y_train = train_df['is_duplicate'].values
# y_val   = val_df['is_duplicate'].values
# y_test  = test_df['is_duplicate'].values


# # Tokenize for Word2Vec
# tokenized = [text.lower().split() for text in all_questions]

# # 3. Train Word2Vec
# w2v_size = 256
# w2v_model = Word2Vec(sentences=tokenized, vector_size=w2v_size,
#                      window=5, min_count=1, workers=4, seed=42)


# def avg_embedding(text, model, size):
#     words = text.lower().split()
#     vecs = [model.wv[word] for word in words if word in model.wv]
#     if not vecs:
#         return np.zeros(size)
#     return np.mean(vecs, axis=0)

# # Create Word2Vec features for a dataframe
# def create_w2v_features(q1,q2, model, size):
#     emb1 = np.vstack(q1.apply(lambda x: avg_embedding(x, model, size)).values)
#     emb2 = np.vstack(q2.apply(lambda x: avg_embedding(x, model, size)).values)
#     return np.hstack([emb1, emb2])

# X_train_w2v = create_w2v_features(q1_train, q2_train, w2v_model, w2v_size)
# X_val_w2v   = create_w2v_features(q1_val, q2_val, w2v_model, w2v_size)
# X_test_w2v  = create_w2v_features(q1_test, q2_test, w2v_model, w2v_size)

# model_w2v_log = LogisticRegression(random_state=42)
# model_w2v_log.fit(X_train_w2v, train_df['is_duplicate'].values)

In [None]:
# Save the Word2Vec + logistic regression model unless it is already saved.
# The training cell above is commented out, so on a fresh kernel
# `model_w2v_log` does not exist; report that instead of raising NameError.
if not check_model_saved(models_path, "model_w2v_log.pkl"):
    if "model_w2v_log" in globals():
        save_model(model_w2v_log, models_path, "model_w2v_log.pkl")
    else:
        print("Skipping save: `model_w2v_log` is not defined. "
              "Uncomment and run the Word2Vec training cell above first.")

## 9. Train and save TF-IDF + XGBoost model

In [None]:
# tfidf = TfidfVectorizer(ngram_range=(1,1))
# tfidf.fit(all_questions)

# def create_tfidf_features(q1,q2, vectorizer):
#     v1 = vectorizer.transform(q1)
#     v2 = vectorizer.transform(q2)
#     return sp.hstack([v1, v2])

# X_train_tfidf = create_tfidf_features(q1_train, q2_train, tfidf)
# X_val_tfidf   = create_tfidf_features(q1_val, q2_val, tfidf)
# X_test_tfidf  = create_tfidf_features(q1_test, q2_test, tfidf)

# model_tf_xgb = XGBClassifier(random_state=42)
# model_tf_xgb.fit(X_train_tfidf, train_df['is_duplicate'].values)


In [None]:
# Save the TF-IDF + XGBoost model unless it is already saved.
# The training cell above is commented out, so on a fresh kernel
# `model_tf_xgb` does not exist; report that instead of raising NameError.
if not check_model_saved(models_path, "model_tf_xgb.pkl"):
    if "model_tf_xgb" in globals():
        save_model(model_tf_xgb, models_path, "model_tf_xgb.pkl")
    else:
        print("Skipping save: `model_tf_xgb` is not defined. "
              "Uncomment and run the XGBoost training cell above first.")

## 10. Train and save LSTM - siamese model

In [None]:
# q1_train = pd.Series(cast_list_as_strings(list(train_df["question1"])))
# q2_train = pd.Series(cast_list_as_strings(list(train_df["question2"])))
# q1_val = pd.Series(cast_list_as_strings(list(val_df["question1"])))
# q2_val = pd.Series(cast_list_as_strings(list(val_df["question2"])))
# q1_test = pd.Series(cast_list_as_strings(list(test_df["question1"])))
# q2_test = pd.Series(cast_list_as_strings(list(test_df["question2"])))

# # 2. Prepare text for LSTM

# max_words = 20000
# max_len = 50
# tokenizer = Tokenizer(num_words=max_words)
# tokenizer.fit_on_texts(all_questions)

# def texts_to_padded(q1,q2):
#     seq1 = tokenizer.texts_to_sequences(q1)
#     seq2 = tokenizer.texts_to_sequences(q2)
#     return pad_sequences(seq1, maxlen=max_len), pad_sequences(seq2, maxlen=max_len)

# X_train_q1, X_train_q2 = texts_to_padded(q1_train, q2_train)
# X_val_q1,   X_val_q2   = texts_to_padded(q1_val, q2_val)
# X_test_q1,  X_test_q2  = texts_to_padded(q1_test, q2_test)


# # 3. Build Siamese LSTM
# embed_dim = 256
# input_a = Input(shape=(max_len,))
# input_b = Input(shape=(max_len,))
# embedding_layer = Embedding(input_dim=max_words, output_dim=embed_dim, input_length=max_len, trainable=True)
# encoded_a = embedding_layer(input_a)
# encoded_b = embedding_layer(input_b)
# shared_lstm = LSTM(128)
# vector_a = shared_lstm(encoded_a)
# vector_b = shared_lstm(encoded_b)
# merged = concatenate([vector_a, vector_b])
# merged = Dropout(0.2)(merged)
# merged = Dense(64, activation='relu')(merged)
# output = Dense(1, activation='sigmoid')(merged)
# model_lstm = Model([input_a, input_b], output)
# model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# # 4. Train LSTM model
# history = model_lstm.fit(
#     [X_train_q1, X_train_q2], y_train,
#     validation_data=([X_val_q1, X_val_q2], y_val),
#     epochs=5, batch_size=512
# )

# # 5. Evaluate LSTM
# y_pred_lstm = (model_lstm.predict([X_test_q1, X_test_q2]) > 0.5).astype(int)
# from sklearn.metrics import accuracy_score, recall_score, precision_score
# lstm_acc = accuracy_score(y_test, y_pred_lstm)
# lstm_rec = recall_score(y_test, y_pred_lstm)
# lstm_prec = precision_score(y_test, y_pred_lstm)
# print(f"LSTM Accuracy: {lstm_acc}, Recall: {lstm_rec}, Precision: {lstm_prec}")


In [None]:
# Save the siamese LSTM model unless it is already saved.
# The training cell above is commented out, so on a fresh kernel
# `model_lstm` does not exist; report that instead of raising NameError.
# NOTE(review): model_lstm is built with Keras layers in the commented cell;
# confirm that save_model can serialize such a model to a .pkl file.
if not check_model_saved(models_path, "model_lstm.pkl"):
    if "model_lstm" in globals():
        save_model(model_lstm, models_path, "model_lstm.pkl")
    else:
        print("Skipping save: `model_lstm` is not defined. "
              "Uncomment and run the LSTM training cell above first.")