In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load data
train_df = pd.read_csv('drive/MyDrive/Data/train.csv')
test_df = pd.read_csv('drive/MyDrive/Data/test.csv')

  test_df = pd.read_csv('drive/MyDrive/Data/test.csv')


In [3]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
# Fill missing values
train_df['question1'] = train_df['question1'].fillna('')
train_df['question2'] = train_df['question2'].fillna('')
test_df['question1'] = test_df['question1'].fillna('')
test_df['question2'] = test_df['question2'].fillna('')

In [5]:
# Preprocessing functions
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\b\w{1,2}\b', '', text)  # Remove short words
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text


In [6]:
# Clean text
train_df['question1'] = train_df['question1'].apply(clean_text)
train_df['question2'] = train_df['question2'].apply(clean_text)
test_df['question1'] = test_df['question1'].apply(clean_text)
test_df['question2'] = test_df['question2'].apply(clean_text)

In [7]:
# TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=5000)
tfidf_q1 = tfidf.fit_transform(train_df['question1'])
tfidf_q2 = tfidf.transform(train_df['question2'])

tfidf_q1_test = tfidf.transform(test_df['question1'])
tfidf_q2_test = tfidf.transform(test_df['question2'])

In [8]:
# Function to compute cosine similarity in batches
def batch_cosine_similarity(matrix1, matrix2, batch_size=1000):
    num_batches = int(np.ceil(matrix1.shape[0] / batch_size))
    similarities = np.zeros(matrix1.shape[0])
    for i in range(num_batches):
        start = i * batch_size
        end = min((i + 1) * batch_size, matrix1.shape[0])
        similarities[start:end] = cosine_similarity(matrix1[start:end], matrix2[start:end]).diagonal()
    return similarities

In [9]:
# Optimized cosine similarity
train_cosine_similarities = batch_cosine_similarity(tfidf_q1, tfidf_q2)
test_cosine_similarities = batch_cosine_similarity(tfidf_q1_test, tfidf_q2_test)

In [10]:
train_df['cosine_similarity'] = train_cosine_similarities
test_df['cosine_similarity'] = test_cosine_similarities

In [11]:
# Features and labels
X_train = train_df[['cosine_similarity']]
y_train = train_df['is_duplicate']
X_test = test_df[['cosine_similarity']]

In [12]:
# Split data
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [13]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier()
}

In [14]:
# Train and evaluate models
for name, model in models.items():
    model.fit(X_train_split, y_train_split)
    y_pred = model.predict(X_val_split)
    print(f'{name} Accuracy: {accuracy_score(y_val_split, y_pred)}')
    print(classification_report(y_val_split, y_pred))

Logistic Regression Accuracy: 0.6486433006010537
              precision    recall  f1-score   support

           0       0.69      0.80      0.74     50803
           1       0.54      0.40      0.46     30055

    accuracy                           0.65     80858
   macro avg       0.61      0.60      0.60     80858
weighted avg       0.63      0.65      0.64     80858

Random Forest Accuracy: 0.6505355066907418
              precision    recall  f1-score   support

           0       0.74      0.69      0.71     50803
           1       0.53      0.59      0.56     30055

    accuracy                           0.65     80858
   macro avg       0.63      0.64      0.63     80858
weighted avg       0.66      0.65      0.65     80858

XGBoost Accuracy: 0.6491874644438398
              precision    recall  f1-score   support

           0       0.69      0.81      0.74     50803
           1       0.54      0.38      0.44     30055

    accuracy                           0.65     80858