# Baseline Model: Logistic Regression

## Import Libraries and Data Loading

In [36]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last
# one. Cells below that list several bare expressions (e.g. two `.shape`
# look-ups) rely on this setting to display all of their results.
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, precision_score

# Read the pre-processed training and validation splits from their feather
# files, in that order.
train_df_raw, val_df_raw = (
    pd.read_feather(feather_path)
    for feather_path in (
        '../data/processed/train_dataset.feather',
        '../data/processed/validation_dataset.feather',
    )
)

In [37]:
# Preview the first five rows (the default for `head`) of the raw training frame.
train_df_raw.head()

Unnamed: 0,index,qid1,qid2,question1,question2,q1_cleaned,q2_cleaned,q1_trimmed,q2_trimmed,q1_start,q2_start,q1_topic,q2_topic,length_diff,same_question,lc_substring,lc_subsequence,jaccard_dist,common_words,common_ratio,levenshtein,fuzz_qratio,fuzz_wratio,q2_question_mark_count,q1_question_mark_count,question_mark_count_diff,freq_q1+q2,freq_q1-q2,same_topic,same_starting,same_ending,is_duplicate
0,0,307789,307790,I am stressed?,What is your favorite thing to do in Romania?,i am stressed,what is your favorite thing to do in romania,i am stressed,what is your favorite thing to do in romania,i,what,Social Media/Gadget/Email,Self-help/Learn/Business,-31,0,1,6,0.0,0,0.0,0.210526,21,32,1,1,0,2,0,0,0,0,0
1,1,49857,49858,I've had a stutter since I was a child and not...,Since I'm a severe stutterer will it be harder...,ive had a stutter since i wa a child and not c...,since im a severe stutterer will it be harder ...,ive had a stutter since i wa a child and not c...,since im a severe stutterer will it be harder ...,ive,since,Language/Relationship,Food/Health,8,0,8,40,0.047945,3,0.018519,0.493827,48,54,1,1,0,2,0,0,0,0,0
2,2,118478,118479,What are the most trustworthy and fairly-price...,How do you unlock a Samsung cell phone?,what are the most trustworthy and fairlypriced...,how do you unlock a samsung cell phone,what are the most trustworthy and fairlypriced...,how do you unlock a samsung cell phone,what,how,India/Government/China,Social Media/Gadget/Email,53,0,11,22,0.091743,2,0.015748,0.346457,34,86,1,1,0,4,2,0,0,0,0
3,3,89503,89504,How does straight talk use CDMA?,Does straight talk use GSM?,how doe straight talk use cdma,doe straight talk use gsm,how doe straight talk use cdma,doe straight talk use gsm,how,doe,Interview/Difference/Drug,Interview/Difference/Drug,5,0,22,23,0.62069,4,0.072727,0.836364,84,87,1,1,0,3,1,1,0,0,0
4,4,411021,411022,What was population of India in 1980?,What is the recent population of India?,what wa population of india in 1980,what is the recent population of india,what wa population of india in 1980,what is the recent population of india,what,what,Language/Relationship,Job/College/University,-2,0,20,25,0.354167,4,0.054795,0.684932,70,76,1,1,0,2,0,0,1,0,0


## Drop unnecessary columns

In [38]:
# Remove the 'index', 'qid1' and 'qid2' columns from both splits; none of
# them is used as a model input later in the notebook.
train_df, val_df = (
    raw_frame.drop(columns=['index', 'qid1', 'qid2'])
    for raw_frame in (train_df_raw, val_df_raw)
)
train_df.head()

Unnamed: 0,question1,question2,q1_cleaned,q2_cleaned,q1_trimmed,q2_trimmed,q1_start,q2_start,q1_topic,q2_topic,length_diff,same_question,lc_substring,lc_subsequence,jaccard_dist,common_words,common_ratio,levenshtein,fuzz_qratio,fuzz_wratio,q2_question_mark_count,q1_question_mark_count,question_mark_count_diff,freq_q1+q2,freq_q1-q2,same_topic,same_starting,same_ending,is_duplicate
0,I am stressed?,What is your favorite thing to do in Romania?,i am stressed,what is your favorite thing to do in romania,i am stressed,what is your favorite thing to do in romania,i,what,Social Media/Gadget/Email,Self-help/Learn/Business,-31,0,1,6,0.0,0,0.0,0.210526,21,32,1,1,0,2,0,0,0,0,0
1,I've had a stutter since I was a child and not...,Since I'm a severe stutterer will it be harder...,ive had a stutter since i wa a child and not c...,since im a severe stutterer will it be harder ...,ive had a stutter since i wa a child and not c...,since im a severe stutterer will it be harder ...,ive,since,Language/Relationship,Food/Health,8,0,8,40,0.047945,3,0.018519,0.493827,48,54,1,1,0,2,0,0,0,0,0
2,What are the most trustworthy and fairly-price...,How do you unlock a Samsung cell phone?,what are the most trustworthy and fairlypriced...,how do you unlock a samsung cell phone,what are the most trustworthy and fairlypriced...,how do you unlock a samsung cell phone,what,how,India/Government/China,Social Media/Gadget/Email,53,0,11,22,0.091743,2,0.015748,0.346457,34,86,1,1,0,4,2,0,0,0,0
3,How does straight talk use CDMA?,Does straight talk use GSM?,how doe straight talk use cdma,doe straight talk use gsm,how doe straight talk use cdma,doe straight talk use gsm,how,doe,Interview/Difference/Drug,Interview/Difference/Drug,5,0,22,23,0.62069,4,0.072727,0.836364,84,87,1,1,0,3,1,1,0,0,0
4,What was population of India in 1980?,What is the recent population of India?,what wa population of india in 1980,what is the recent population of india,what wa population of india in 1980,what is the recent population of india,what,what,Language/Relationship,Job/College/University,-2,0,20,25,0.354167,4,0.054795,0.684932,70,76,1,1,0,2,0,0,1,0,0


### For this baseline model, we will first try using the questions themselves as the features.

In [39]:
# Baseline inputs: the two cleaned question-text columns of each split.
bl_X_train = train_df[['q1_cleaned', 'q2_cleaned']]
bl_X_val = val_df[['q1_cleaned', 'q2_cleaned']]

# Targets are kept as single-column DataFrames (note the double brackets).
y_train = train_df[['is_duplicate']]
y_val = val_df[['is_duplicate']]

In [40]:
# Sanity check: the feature frame and the label frame must have the same
# number of rows. The labels are bound to `y_train` (there is no
# `bl_y_train`), so that is the name inspected here.
bl_X_train.shape
y_train.shape

(323429, 2)

(323429, 1)

## Data Processing

In [41]:
# Fit a single TF-IDF vocabulary on the training questions from BOTH columns,
# so q1 and q2 are later projected into the same feature space.
# `TfidfVectorizer` is already imported in the notebook's first code cell.
# lowercase=False: presumably the *_cleaned text is already lower-cased
# (the previews above suggest so) -- TODO confirm.
tfidf = TfidfVectorizer(lowercase=False)

train_q1 = bl_X_train['q1_cleaned'].tolist()
train_q2 = bl_X_train['q2_cleaned'].tolist()
train_list = train_q1 + train_q2

# The transformed matrix itself is not stored; the call is left as a bare
# expression so the cell displays its shape (documents x vocabulary size).
tfidf.fit_transform(train_list)

<646858x88988 sparse matrix of type '<class 'numpy.float64'>'
	with 6452661 stored elements in Compressed Sparse Row format>

In [42]:
# Vectorise each question column separately with the already-fitted TF-IDF
# model. The validation text is only transformed here, never fitted on.
train_td_q1 = tfidf.transform(bl_X_train['q1_cleaned'].tolist())
train_td_q2 = tfidf.transform(bl_X_train['q2_cleaned'].tolist())

val_td_q1 = tfidf.transform(bl_X_val['q1_cleaned'].tolist())
val_td_q2 = tfidf.transform(bl_X_val['q2_cleaned'].tolist())

In [43]:
# Represent each question pair by the element-wise sum of its two TF-IDF
# vectors. The sum does not depend on which question is q1 and which is q2,
# but it also does not record which question a given term came from.
X_train, X_val = train_td_q1 + train_td_q2, val_td_q1 + val_td_q2

## Building the Baseline Model

In [44]:
# Baseline classifier trained on the summed TF-IDF features.
# The single label column is passed as a 1-D array, as the estimator expects.
log_clf = LogisticRegression(max_iter=5000)
log_clf.fit(X_train, y_train['is_duplicate'].to_numpy())

LogisticRegression(max_iter=5000)

In [45]:
# For each split: hard class predictions first, then class probabilities.
preds_train, preds_prob_train = log_clf.predict(X_train), log_clf.predict_proba(X_train)
preds_val, preds_prob_val = log_clf.predict(X_val), log_clf.predict_proba(X_val)

In [46]:
# Report log loss (computed from probabilities) and precision (computed from
# hard predictions) for the training split, then for the validation split.
for label, metric, truth, estimate in (
    ("The train log loss is:", log_loss, y_train, preds_prob_train),
    ("The train precision is:", precision_score, y_train, preds_train),
    ("The validation log loss is:", log_loss, y_val, preds_prob_val),
    ("The validation precision is:", precision_score, y_val, preds_val),
):
    print(label, metric(truth, estimate))

The train log loss is: 0.45686644439781177
The train precision is: 0.7593828104021647
The validation log loss is: 0.5034161442728341
The validation precision is: 0.7210036556995679


### We will now try using only the numeric (integer and float) features as the input

In [47]:
# Re-display the first five rows to see which columns are available.
train_df.head()

Unnamed: 0,question1,question2,q1_cleaned,q2_cleaned,q1_trimmed,q2_trimmed,q1_start,q2_start,q1_topic,q2_topic,length_diff,same_question,lc_substring,lc_subsequence,jaccard_dist,common_words,common_ratio,levenshtein,fuzz_qratio,fuzz_wratio,q2_question_mark_count,q1_question_mark_count,question_mark_count_diff,freq_q1+q2,freq_q1-q2,same_topic,same_starting,same_ending,is_duplicate
0,I am stressed?,What is your favorite thing to do in Romania?,i am stressed,what is your favorite thing to do in romania,i am stressed,what is your favorite thing to do in romania,i,what,Social Media/Gadget/Email,Self-help/Learn/Business,-31,0,1,6,0.0,0,0.0,0.210526,21,32,1,1,0,2,0,0,0,0,0
1,I've had a stutter since I was a child and not...,Since I'm a severe stutterer will it be harder...,ive had a stutter since i wa a child and not c...,since im a severe stutterer will it be harder ...,ive had a stutter since i wa a child and not c...,since im a severe stutterer will it be harder ...,ive,since,Language/Relationship,Food/Health,8,0,8,40,0.047945,3,0.018519,0.493827,48,54,1,1,0,2,0,0,0,0,0
2,What are the most trustworthy and fairly-price...,How do you unlock a Samsung cell phone?,what are the most trustworthy and fairlypriced...,how do you unlock a samsung cell phone,what are the most trustworthy and fairlypriced...,how do you unlock a samsung cell phone,what,how,India/Government/China,Social Media/Gadget/Email,53,0,11,22,0.091743,2,0.015748,0.346457,34,86,1,1,0,4,2,0,0,0,0
3,How does straight talk use CDMA?,Does straight talk use GSM?,how doe straight talk use cdma,doe straight talk use gsm,how doe straight talk use cdma,doe straight talk use gsm,how,doe,Interview/Difference/Drug,Interview/Difference/Drug,5,0,22,23,0.62069,4,0.072727,0.836364,84,87,1,1,0,3,1,1,0,0,0
4,What was population of India in 1980?,What is the recent population of India?,what wa population of india in 1980,what is the recent population of india,what wa population of india in 1980,what is the recent population of india,what,what,Language/Relationship,Job/College/University,-2,0,20,25,0.354167,4,0.054795,0.684932,70,76,1,1,0,2,0,0,1,0,0


In [48]:
# Keep only the engineered numeric columns: drop every text column
# (raw / cleaned / trimmed questions, start words, topics) and the target.
# Note that this rebinds X_train / X_val, which previously held the TF-IDF
# matrices.
X_train, X_val = (
    frame.drop(
        columns=[
            'question1', 'question2',
            'q1_cleaned', 'q2_cleaned',
            'q1_trimmed', 'q2_trimmed',
            'is_duplicate',
            'q1_topic', 'q2_topic',
            'q1_start', 'q2_start',
        ]
    )
    for frame in (train_df, val_df)
)

In [49]:
# Confirm that only numeric feature columns remain.
X_train.head()

Unnamed: 0,length_diff,same_question,lc_substring,lc_subsequence,jaccard_dist,common_words,common_ratio,levenshtein,fuzz_qratio,fuzz_wratio,q2_question_mark_count,q1_question_mark_count,question_mark_count_diff,freq_q1+q2,freq_q1-q2,same_topic,same_starting,same_ending
0,-31,0,1,6,0.0,0,0.0,0.210526,21,32,1,1,0,2,0,0,0,0
1,8,0,8,40,0.047945,3,0.018519,0.493827,48,54,1,1,0,2,0,0,0,0
2,53,0,11,22,0.091743,2,0.015748,0.346457,34,86,1,1,0,4,2,0,0,0
3,5,0,22,23,0.62069,4,0.072727,0.836364,84,87,1,1,0,3,1,1,0,0
4,-2,0,20,25,0.354167,4,0.054795,0.684932,70,76,1,1,0,2,0,0,1,0


In [None]:
# Refit the existing `log_clf` estimator on the numeric features. This is the
# same object that was fitted on the TF-IDF features earlier, so from here on
# `log_clf` refers to the numeric-feature model.
log_clf.fit(X_train, y_train['is_duplicate'].to_numpy())

In [None]:
# For each split: hard class predictions first, then class probabilities,
# now from the model fitted on the numeric features.
preds_train, preds_prob_train = log_clf.predict(X_train), log_clf.predict_proba(X_train)
preds_val, preds_prob_val = log_clf.predict(X_val), log_clf.predict_proba(X_val)

In [None]:
# Report log loss (computed from probabilities) and precision (computed from
# hard predictions) for the training split, then for the validation split.
for label, metric, truth, estimate in (
    ("The train log loss is:", log_loss, y_train, preds_prob_train),
    ("The train precision is:", precision_score, y_train, preds_train),
    ("The validation log loss is:", log_loss, y_val, preds_prob_val),
    ("The validation precision is:", precision_score, y_val, preds_val),
):
    print(label, metric(truth, estimate))