In [1]:
import os

# Directory that holds questions.csv. The original machine-specific location is
# kept as the default; set QUORA_DATA_DIR to run the notebook elsewhere.
DATA_DIR = os.environ.get("QUORA_DATA_DIR", r"C:\Users\anyaa\Documents\nlp\quora")
os.chdir(DATA_DIR)

import pandas as pd
import numpy as np
import seaborn as sns
# `import matplotlib as plt` bound the top-level package to `plt`; the plotting
# API (plt.plot, plt.subplots, ...) lives in matplotlib.pyplot.
import matplotlib.pyplot as plt
from scipy.sparse import hstack,vstack
import contractions

# Load the question pairs and drop every row that has a missing value.
# (The former mid-cell `df.head(3)` was discarded by the notebook, since only
# the last expression of a cell is displayed.)
df = pd.read_csv("questions.csv").dropna()

# Cell output: per-column null counts after dropna.
df.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [2]:

# Run the same four-step normalisation over both question columns.
for _col in ('question1', 'question2'):
    # 1. lowercase (vectorised string op)
    _text = df[_col].str.lower()
    # 2. apply contractions.fix to every value (element-wise map)
    _text = _text.map(contractions.fix)
    # 3. replace every character that is not a letter, digit, ?, ! or whitespace with a space
    _text = _text.str.replace(r"[^a-zA-Z0-9?!\s]", " ", regex=True)
    # 4. collapse whitespace runs to a single space and trim both ends
    _text = _text.str.replace(r"\s+", " ", regex=True).str.strip()
    df[_col] = _text

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the question pairs; the fixed random_state keeps the split repeatable.
_pair_columns = ['question1', 'question2']
X_train, X_test, y_train, y_test = train_test_split(
    df[_pair_columns], df['is_duplicate'], test_size=0.2, random_state=42,
)

In [4]:

import spacy

# Only token lemmas are read from this pipeline, so the parser and NER
# components are disabled at load time.
_DISABLED_PIPES = ["parser", "ner"]
nlp = spacy.load("en_core_web_sm", disable=_DISABLED_PIPES)
def lemmatize_series(series):
    """Return one lemmatised string per text in ``series``.

    The texts are streamed through the module-level ``nlp`` pipeline in
    batches of 500; each resulting document is rendered as its token lemmas
    joined by single spaces.

    Returns a plain list, in the same order as the input.
    """
    lemmatized = []
    for doc in nlp.pipe(series, batch_size=500):
        lemmas = (token.lemma_ for token in doc)
        lemmatized.append(" ".join(lemmas))
    return lemmatized

# Replace both training question columns with their lemmatised text.
for _col in ('question1', 'question2'):
    X_train[_col] = lemmatize_series(X_train[_col])


In [5]:
# Character counts per question, held in l1/l2 so the difference reuses them.
l1, l2 = (X_train[_c].str.len() for _c in ('question1', 'question2'))
X_train['lencq1'] = l1
X_train['lencq2'] = l2
X_train['lendiff'] = l1 - l2

# Whitespace-split token lists, computed once and shared by the word-level features.
q1_words = X_train['question1'].str.split()
q2_words = X_train['question2'].str.split()

# Word counts
X_train['lenq1'] = q1_words.str.len()
X_train['lenq2'] = q2_words.str.len()

# Number of distinct tokens the two questions share (case-sensitive)
X_train['common_word_count'] = [
    len(set(a).intersection(b)) for a, b in zip(q1_words, q2_words)
]
#jaccard similarity
def jaccard(q1, q2):
    """Jaccard similarity of the whitespace-token sets of ``q1`` and ``q2``.

    Both arguments are passed through ``str()`` before splitting; tokens are
    compared case-sensitively. Returns 0 when neither text has any token,
    otherwise ``|intersection| / |union|``.
    """
    tokens_a = set(str(q1).split())
    tokens_b = set(str(q2).split())
    combined = tokens_a | tokens_b
    if not combined:
        return 0
    return len(tokens_a & tokens_b) / len(combined)

# Jaccard similarity for every training pair.
# The former inline expression re-split each question and rebuilt the token
# sets up to six times per row while duplicating the logic of the jaccard()
# helper defined earlier in this cell; calling the helper yields the same
# values (0 for an empty union, else |intersection| / |union|) with a single
# split per question.
X_train['jaccard'] = [
    jaccard(a, b)
    for a, b in zip(X_train['question1'], X_train['question2'])
]


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One shared vocabulary (unigrams + bigrams, at most 20000 terms, document
# frequency >= 2), fitted on both training question columns together.
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1, 2), min_df=2)
_train_corpus = [*X_train['question1'], *X_train['question2']]
tfidf.fit(_train_corpus)

q1, q2 = (tfidf.transform(X_train[_c]) for _c in ('question1', 'question2'))

# Side-by-side layout: [tfidf(question1) | tfidf(question2)]
X_tfidf_train = hstack([q1, q2])

# Row-wise dot product of the two TF-IDF matrices, flattened to 1-D.
# NOTE(review): this equals cosine similarity only while the vectorizer keeps
# its default L2 row normalisation.
train_cosine = np.asarray(q1.multiply(q2).sum(axis=1)).ravel()
X_train['cosine_sim'] = train_cosine

In [7]:
#word to vec
from gensim.models import Word2Vec

# Training corpus: the whitespace-token list of every training question,
# question1 rows first, then question2 rows.
sentences = [
    *X_train['question1'].str.split(),
    *X_train['question2'].str.split(),
]

# 100-dimensional embeddings; words seen fewer than 2 times are dropped.
_w2v_params = dict(vector_size=100, window=5, min_count=2, workers=4)
w2v = Word2Vec(sentences, **_w2v_params)
import numpy as np
def sent_vec(text):
    """Average the Word2Vec vectors of the in-vocabulary tokens of ``text``.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of ``w2v.wv[token]`` over the tokens present in
        ``w2v.wv``; an all-zero vector of length ``w2v.vector_size`` when no
        token is in the vocabulary (including empty text).
    """
    vecs = [w2v.wv[w] for w in text.split() if w in w2v.wv]

    if not vecs:
        # Size the fallback from the model instead of a literal 100, so the
        # stacked matrix stays rectangular if vector_size is ever changed.
        return np.zeros(w2v.vector_size)

    return np.mean(vecs, axis=0)
from numpy.linalg import norm

# One averaged embedding per question, stacked into (rows x dims) matrices.
q1_vec_train = np.vstack(X_train['question1'].apply(sent_vec))
q2_vec_train = np.vstack(X_train['question2'].apply(sent_vec))

# Side-by-side layout: [w2v(question1) | w2v(question2)]
X_train_w2v = np.hstack([q1_vec_train, q2_vec_train])

# Cosine similarity per pair; the 1e-9 in the denominator avoids a division
# by zero for all-zero vectors.
_dot_train = np.sum(q1_vec_train * q2_vec_train, axis=1)
_norm_train = norm(q1_vec_train, axis=1) * norm(q2_vec_train, axis=1)
w2v_cos_train = _dot_train / (_norm_train + 1e-9)

X_train['w2v_cosine'] = w2v_cos_train

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


In [9]:
from scipy.sparse import csr_matrix

# Hand-crafted numeric features, in the column order used inside the model matrix.
numeric_cols = [
    'lenq1', 'lenq2',
    'lencq1', 'lencq2', 'lendiff',
    'common_word_count', 'jaccard',
    'cosine_sim', 'w2v_cosine',
]
num_train = csr_matrix(X_train[numeric_cols].to_numpy())

# Final column layout: [tfidf(q1) | tfidf(q2) | numeric_cols | w2v(q1) | w2v(q2)]
_train_blocks = [X_tfidf_train, num_train, X_train_w2v]
X_final_train = hstack(_train_blocks)


In [10]:

# Apply the train-fitted transformations to the held-out questions.
X_test['question1'] = lemmatize_series(X_test['question1'])
X_test['question2'] = lemmatize_series(X_test['question2'])

# Averaged Word2Vec embedding per test question.
q1_vec_test = np.vstack(X_test['question1'].apply(sent_vec))
q2_vec_test = np.vstack(X_test['question2'].apply(sent_vec))

# Cosine similarity per pair, same formula as the training feature.
# A stray `a` after the division operator made this parse as `... / a(...)`,
# a call to an undefined name; it is removed here.
w2v_cos_test = (
    np.sum(q1_vec_test * q2_vec_test, axis=1) /
    (norm(q1_vec_test, axis=1) * norm(q2_vec_test, axis=1) + 1e-9)
)
X_test['w2v_cosine'] = w2v_cos_test

X_test_w2v = np.hstack([q1_vec_test, q2_vec_test])

# TF-IDF with the vocabulary fitted on the training questions (transform only).
q1_test = tfidf.transform(X_test['question1'])
q2_test = tfidf.transform(X_test['question2'])


# Word counts
X_test['lenq1'] = X_test['question1'].str.split().str.len()
X_test['lenq2'] = X_test['question2'].str.split().str.len()

# Character counts and their difference
X_test['lencq1']=X_test['question1'].str.len()
X_test['lencq2']=X_test['question2'].str.len()
X_test['lendiff']=X_test['question1'].str.len()-X_test['question2'].str.len()
def common_words(q1, q2):
    """Count the distinct lowercased whitespace tokens shared by ``q1`` and ``q2``.

    Both arguments are converted with ``str()`` before lowercasing and splitting.
    """
    first, second = (set(str(q).lower().split()) for q in (q1, q2))
    return len(first & second)
# Shared-token count per test pair (case-sensitive whitespace tokens).
_q1_test_tokens = X_test['question1'].str.split()
_q2_test_tokens = X_test['question2'].str.split()
X_test['common_word_count'] = [
    len(set(a).intersection(b))
    for a, b in zip(_q1_test_tokens, _q2_test_tokens)
]

def jaccard(q1, q2):
    """Jaccard similarity of the whitespace-token sets of ``q1`` and ``q2``.

    Tokens are compared case-sensitively. The training-side ``jaccard``
    feature is computed without lowercasing, so lowercasing here would make
    the test feature differ from the training feature whenever the
    lemmatised text contains capitals.

    Both arguments are passed through ``str()`` before splitting. Returns 0
    when neither text has any token, otherwise ``|intersection| / |union|``.
    """
    w1 = set(str(q1).split())
    w2 = set(str(q2).split())
    union = w1 | w2  # built once, reused for the guard and the ratio
    if not union:
        return 0
    return len(w1 & w2) / len(union)
from scipy.sparse import csr_matrix, hstack

# Jaccard similarity for each test pair.
X_test['jaccard'] = [
    jaccard(a, b)
    for a, b in zip(X_test['question1'], X_test['question2'])
]

# Row-wise dot product of the two TF-IDF matrices, flattened to 1-D.
test_cosine = np.asarray(q1_test.multiply(q2_test).sum(axis=1)).ravel()
X_test['cosine_sim'] = test_cosine

# Same numeric features, in the same order, as the training matrix.
numeric_cols = [
    'lenq1', 'lenq2',
    'lencq1', 'lencq2', 'lendiff',
    'common_word_count', 'jaccard',
    'cosine_sim', 'w2v_cosine',
]
num_test_sparse = csr_matrix(X_test[numeric_cols].values)

# Final column layout: [tfidf(q1) | tfidf(q2) | numeric_cols | w2v(q1) | w2v(q2)]
X_tfidf_test = hstack([q1_test, q2_test])
_test_blocks = [X_tfidf_test, num_test_sparse, X_test_w2v]
X_final_test = hstack(_test_blocks)


In [11]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score

# Gradient-boosted trees on the combined sparse feature matrix.
_lgbm_params = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "num_leaves": 63,
    "colsample_bytree": 0.8,
    "max_depth": -1,
    "random_state": 42,
}
model = LGBMClassifier(**_lgbm_params)
model.fit(X_final_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_final_test)
for _label, _metric in (("Accuracy:", accuracy_score), ("F1:", f1_score)):
    print(_label, _metric(y_test, y_pred))


[LightGBM] [Info] Number of positive: 119676, number of negative: 203802
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 14.068276 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1086822
[LightGBM] [Info] Number of data points in the train set: 323478, number of used features: 39476
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.369966 -> initscore=-0.532361
[LightGBM] [Info] Start training from score -0.532361




Accuracy: 0.8203660195375294
F1: 0.7503394228951484


In [12]:
from sklearn.model_selection import cross_validate

# 5-fold F1 on the training matrix, folds evaluated in parallel.
# NOTE(review): the TF-IDF vocabulary and Word2Vec model were fitted on all of
# X_train before this point, so the per-fold validation scores may be
# optimistic — confirm whether that is acceptable.
_cv_options = dict(cv=5, scoring="f1", return_train_score=True, n_jobs=-1)
scores = cross_validate(model, X_final_train, y_train, **_cv_options)

_train_f1 = scores["train_score"].mean()
_val_f1 = scores["test_score"].mean()
print("Train F1:", _train_f1)
print("Validation F1:", _val_f1)


KeyboardInterrupt: 

In [None]:
from sklearn.linear_model import LogisticRegression

# Rebinds `model`: from here on it refers to the logistic regression.
_logreg_params = dict(C=1.2, solver="saga", max_iter=2200, class_weight="balanced")
model = LogisticRegression(**_logreg_params)
model.fit(X_final_train, y_train)

In [None]:
from sklearn.impute import SimpleImputer

# Replace missing entries with 0, fitting on the training matrix only.
# NOTE(review): in file order this cell comes after the LogisticRegression
# fit, so that fit sees the un-imputed training matrix — confirm the intended
# cell order.
imputer = SimpleImputer(strategy='constant', fill_value=0).fit(X_final_train)
X_final_train, X_final_test = (
    imputer.transform(_matrix) for _matrix in (X_final_train, X_final_test)
)


In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Score whichever estimator `model` currently refers to on the held-out split.
y_pred = model.predict(X_final_test)
for _label, _metric in (("Accuracy:", accuracy_score), ("F1:", f1_score)):
    print(_label, _metric(y_test, y_pred))


In [None]:
import numpy as np

# Absolute coefficient per model-matrix column.
feature_importance = np.abs(model.coef_[0])
feature_names = tfidf.get_feature_names_out()

# Column layout of the model matrix, as assembled by the hstack calls:
#   [tfidf(question1) | tfidf(question2) | numeric_cols | word2vec vectors]
# The numeric features therefore sit right after the two TF-IDF blocks, not at
# the very end; slicing the tail would label Word2Vec coefficients with the
# numeric feature names.
n_vocab = len(feature_names)
num_start = 2 * n_vocab
num_stop = num_start + len(numeric_cols)

for col, imp in zip(numeric_cols, feature_importance[num_start:num_stop]):
    print(col, imp)

# Per-term weights: question1 block (same slice as before) and question2 block.
tfidf_weights = feature_importance[:n_vocab]
tfidf_weights_q2 = feature_importance[n_vocab:num_start]


In [None]:
from sklearn.model_selection import cross_validate

# 5-fold F1 for the estimator currently bound to `model`, with train-fold scores.
_cv_options = dict(cv=5, scoring='f1', return_train_score=True)
scores = cross_validate(model, X_final_train, y_train, **_cv_options)

_train_f1 = scores['train_score'].mean()
_val_f1 = scores['test_score'].mean()
print("Train F1:", _train_f1)
print("Val F1:", _val_f1)
