## Reproduce the results using the models trained and stored in the models folder:

In [58]:
from Utils import *

In [20]:
models_path='$HOME/Datasets/QuoraQuestionPairs/models'

In [53]:
# Create validation and test partitions
quora_df = pd.read_csv("$HOME/Datasets/QuoraQuestionPairs/quora_data.csv")
A_df, test_df = sklearn.model_selection.train_test_split(quora_df, test_size=0.05, random_state=123)
train_df, val_df = sklearn.model_selection.train_test_split(A_df, test_size=0.05)
print('val_df.shape=',val_df.shape)
print('test_df.shape=',test_df.shape)

y_val = val_df["is_duplicate"].values

# cast to list taking care of nans:
q1_val =  cast_list_as_strings(list(val_df["question1"]))
q2_val =  cast_list_as_strings(list(val_df["question2"]))
q1_test  =  cast_list_as_strings(list(test_df["question1"]))
q2_test  =  cast_list_as_strings(list(test_df["question2"]))

val_df.shape= (15363, 6)
test_df.shape= (16172, 6)


## 1. Load the teachers baseline model and print accuracy:

Load Teachers baseline ad the vectoirizer used:

In [10]:
logistic = load_model(path=models_path, filename="teacher_baseline.pkl")
count_vectorizer = load_model(path=models_path, filename="count_vectorizer.pkl")

Model loaded from models/teacher_baseline.pkl
Model loaded from models/count_vectorizer.pkl


And now evaluate on the VALIDATION set:

In [11]:
from sklearn.metrics import accuracy_score

#get the validation feature for the count vectorizer:
X_val_q1q2 = get_features_from_df(val_df, count_vectorizer)

#accuracy of the regr with count vectorizer
y_pred = logistic.predict(X_val_q1q2)
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy with count vectoriser: {accuracy:.4f}")

Accuracy with count vectoriser: 0.8082


## 2. Using custom TFIDF_Vectorizer:

In [13]:
#load model and vectorizer:
logistic2 = load_model(models_path, "custom_tfidf_logreg.pkl")
custom_vectorizer = load_model(models_path, "custom_tfidf_vectorizer.pkl")

Model loaded from models/custom_tfidf_logreg.pkl
Model loaded from models/custom_tfidf_vectorizer.pkl


In [14]:
from sklearn.metrics import accuracy_score

X_val_q1q2_tfidf = get_features_from_df_tfidf(val_df, custom_vectorizer)

y_pred = logistic2.predict(X_val_q1q2_tfidf)
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy new approach with custom TFIDF vectorizer: {accuracy:.4f}")

Accuracy new approach with custom TFIDF vectorizer: 0.8563


## 3. Using built-in TFIDF vectorizer on charecter level with 4-grams, and a cosinus similarity between the pairs metric feature 

In [22]:
logistic4 = load_model(models_path, "tfidf_4_char_gram_logreg.pkl")
tfidf_vectorizer = load_model(models_path, "tfidf_vectorizer.pkl")

Model loaded from models/tfidf_4_char_gram_logreg.pkl
Model loaded from models/tfidf_vectorizer.pkl


In [28]:
from sklearn.metrics import accuracy_score
from Utils import *
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from scipy.sparse import hstack

val_df['char_ngram_sim'] = val_df.apply(lambda row: char_ngram_similarity(row['question1'], row['question2'], tfidf_vectorizer), axis=1)
char_sim_sparse = sparse.csr_matrix(val_df['char_ngram_sim'].values).T  # make it column vector

X_val_q1q2_bi_tfidf = get_features_from_df_tfidf(val_df, tfidf_vectorizer)

X_train_combined = hstack([X_val_q1q2_bi_tfidf, char_sim_sparse])

y_pred = logistic4.predict(X_train_combined)
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy new approach with TFIDF vectorizer and 4-gram cosin-similarity: {accuracy:.4f}")

Accuracy new approach with custom TFIDF vectorizer: 0.8325


## 4. Using log regression with built in TFIDF, 4-gram character cosin similarity feature and feature whether it question starts with specific question word 

In [51]:
logistic5 = load_model(models_path, "tfidf_5_char_gram_word_starts_logreg.pkl")
scaler = load_model(models_path, "scaler_tfidf_5_char_gram_word_starts_logreg.pkl")
tfidf_vectorizer = load_model(models_path, "tfidf_vectorizer.pkl")
import pickle
# We need them in the same order as in training!!!
with open(models_path + '/feature_cols_train_start_words.pkl', 'rb') as f:
    feature_cols_train = pickle.load(f)

Model loaded from models/tfidf_5_char_gram_word_starts_logreg.pkl
Model loaded from models/scaler_tfidf_5_char_gram_word_starts_logreg.pkl
Model loaded from models/tfidf_vectorizer.pkl


In [54]:
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler

# start_words = ['how', 'can', 'what', 'why', 'are', 'do', 'does', 'is', 'should', 'could']

start_feats_q1 = val_df['question1'].apply(lambda x: pd.Series(starts_with_indicator(x)))
start_feats_q1.columns = [f'q1_{col}' for col in start_feats_q1.columns]

start_feats_q2 = val_df['question2'].apply(lambda x: pd.Series(starts_with_indicator(x)))
start_feats_q2.columns = [f'q2_{col}' for col in start_feats_q2.columns]

val_df = pd.concat([val_df, start_feats_q1, start_feats_q2], axis=1)

#print(feature_cols_train)
val_numeric_features = val_df[feature_cols_train]
val_numeric_features = val_numeric_features[feature_cols_train]
#print(val_numeric_features)

# Scale numeric features before combining with sparse ones

numeric_scaled = scaler.transform(val_numeric_features)
numeric_sparse = sparse.csr_matrix(numeric_scaled)

val_df['char_ngram_sim'] = val_df.apply(lambda row: char_ngram_similarity(row['question1'], row['question2'], tfidf_vectorizer), axis=1)
char_sim_sparse = sparse.csr_matrix(val_df['char_ngram_sim'].values).T  # make it column vector

X_val_q1q2_bi_tfidf = get_features_from_df_tfidf(val_df, tfidf_vectorizer)

# we're using the 4-gram 
X_train_combined = hstack([X_val_q1q2_bi_tfidf, char_sim_sparse, numeric_sparse])

['q1_starts_with_how', 'q1_starts_with_can', 'q1_starts_with_what', 'q1_starts_with_why', 'q1_starts_with_are', 'q1_starts_with_do', 'q1_starts_with_does', 'q1_starts_with_is', 'q1_starts_with_should', 'q1_starts_with_could', 'q2_starts_with_how', 'q2_starts_with_can', 'q2_starts_with_what', 'q2_starts_with_why', 'q2_starts_with_are', 'q2_starts_with_do', 'q2_starts_with_does', 'q2_starts_with_is', 'q2_starts_with_should', 'q2_starts_with_could']
        q1_starts_with_how  q1_starts_with_can  q1_starts_with_what  \
53593                    0                   0                    0   
100923                   0                   0                    1   
226707                   1                   0                    0   
25127                    0                   1                    0   
15382                    0                   0                    0   
...                    ...                 ...                  ...   
249858                   0                   0      

In [55]:
y_pred = logistic5.predict(X_train_combined)
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy new approach with TFIDF vectorizer and 4-gram cosin-similarity and feature for start word: {accuracy:.4f}")

Accuracy new approach with TFIDF vectorizer and 4-gram cosin-similarity and feature for start word: 0.6071


## 5. Use Jackard similarity and TFIDF vectorizer

In [57]:
logistic6 = load_model(models_path, "tfidf_jackard_logreg.pkl")
tfidf_vectorizer = load_model(models_path, "tfidf_vectorizer.pkl")

Model loaded from models/tfidf_jackard_logreg.pkl
Model loaded from models/tfidf_vectorizer.pkl


In [60]:
val_df['jaccard_sim'] = val_df.apply(lambda row: jaccard_similarity(row['question1'], row['question2']), axis=1)

X_val_q1q2_bi_tfidf = get_features_from_df_tfidf(val_df, tfidf_vectorizer)

# we're using the 4-gram 
X_train_combined = hstack([X_val_q1q2_bi_tfidf, val_df['jaccard_sim'].values.reshape(-1, 1)])

In [61]:
y_pred = logistic6.predict(X_train_combined)
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy new approach with TFIDF vectorizer and Jackard similarity: {accuracy:.4f}")

Accuracy new approach with TFIDF vectorizer and 4-gram cosin-similarity and feature for start word: 0.8200


## 6. Custom TFIDF and Jackard similarity:

In [68]:
logistic7 = load_model(models_path, "custom_tfidf_jackard_logreg.pkl")
custom_vectorizer = load_model(models_path, "custom_tfidf_vectorizer.pkl")

Model loaded from models/custom_tfidf_jackard_logreg.pkl
Model loaded from models/custom_tfidf_vectorizer.pkl


In [69]:
val_df['jaccard_sim'] = val_df.apply(lambda row: jaccard_similarity(row['question1'], row['question2']), axis=1)

X_val_q1q2_tfidf = get_features_from_df_tfidf(val_df, custom_vectorizer)

# we're using the 4-gram 
X_train_combined = hstack([X_val_q1q2_tfidf, val_df['jaccard_sim'].values.reshape(-1, 1)])

In [70]:
y_pred = logistic7.predict(X_train_combined)
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy new approach with custom TFIDF vectorizer and Jackard similarity: {accuracy:.4f}")

Accuracy new approach with custom TFIDF vectorizer and Jackard similarity: 0.8735
