In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import string

# Make sure the NLTK stopword corpus is available locally so that
# stopwords.words('english') can be read later on. Run this once per
# environment; when the package is already present NLTK only reports that
# it is up to date.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load only the first 30,000 rows from CSV (nrows caps how many rows are read).
# The rest of the script reads the 'question1', 'question2' and
# 'is_duplicate' columns from this frame.
df = pd.read_csv('train.csv', nrows=30000)

# Preprocess function
# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-built cache of the English stopword set (see _english_stop_words).
_STOP_WORDS = None


def _english_stop_words():
    """Return the English stopword set, reading the NLTK corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def clean_text(text):
    """Normalise a question string for bag-of-words comparison.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the result is split on whitespace, and English stopwords are
    dropped. The surviving words are re-joined with single spaces.

    Args:
        text: The raw question text (must be a ``str``).

    Returns:
        The cleaned text; an empty string if nothing survives filtering.
    """
    # The stopword set is loop-invariant, so it is fetched from a cache
    # instead of being rebuilt from the corpus on every call.
    stop_words = _english_stop_words()
    # One C-level pass removes all punctuation characters.
    stripped = text.lower().translate(_PUNCT_TABLE)
    return ' '.join(w for w in stripped.split() if w not in stop_words)

# Missing questions are read as NaN. Replace them with '' before the string
# cast: astype(str) on its own would turn NaN into the literal token 'nan',
# which would then be treated as a real word by every downstream feature.
df['q1_clean'] = df['question1'].fillna('').astype(str).apply(clean_text)
df['q2_clean'] = df['question2'].fillna('').astype(str).apply(clean_text)

# Feature engineering
def jaccard_similarity(str1, str2):
    """Return the Jaccard similarity of the word sets of two strings.

    Each string is split on whitespace and reduced to its set of distinct
    words; the result is ``|A & B| / |A | B|``.

    Args:
        str1: First whitespace-separated string.
        str2: Second whitespace-separated string.

    Returns:
        A float in ``[0.0, 1.0]``. When neither string contains a word the
        union is empty and ``0.0`` is returned instead of dividing by zero.
    """
    words_a = set(str1.split())
    words_b = set(str2.split())
    union = words_a | words_b
    if not union:
        # Always return a float so callers get one consistent type.
        return 0.0
    return len(words_a & words_b) / len(union)

def word_share_ratio(str1, str2):
    """Return the fraction of ``str1``'s distinct words that also occur in ``str2``.

    Both strings are split on whitespace and reduced to sets of distinct
    words. The measure is asymmetric: the denominator is the number of
    distinct words in ``str1`` only, so swapping the arguments can change
    the result.

    Args:
        str1: The reference string whose words are being looked up.
        str2: The string searched for those words.

    Returns:
        A float in ``[0.0, 1.0]``. When ``str1`` contains no words ``0.0``
        is returned instead of dividing by zero.
    """
    words_a = set(str1.split())
    if not words_a:
        # Always return a float so callers get one consistent type.
        return 0.0
    words_b = set(str2.split())
    return len(words_a & words_b) / len(words_a)

# Per-pair feature matrix. It shares df's index so the label column lines up
# with it row for row.
features = pd.DataFrame(index=df.index)

# Iterate the two cleaned columns in lockstep; this avoids the per-row Series
# construction that df.apply(..., axis=1) performs.
features['jaccard'] = [
    jaccard_similarity(q1, q2)
    for q1, q2 in zip(df['q1_clean'], df['q2_clean'])
]
# Absolute difference in character length of the cleaned questions.
features['len_diff'] = (df['q1_clean'].str.len() - df['q2_clean'].str.len()).abs()
features['word_share'] = [
    word_share_ratio(q1, q2)
    for q1, q2 in zip(df['q1_clean'], df['q2_clean'])
]

# Bag-of-Words similarity (cosine similarity between vectors)
# The vocabulary is learned from both question columns together.
vectorizer = CountVectorizer().fit(df['q1_clean'].tolist() + df['q2_clean'].tolist())
q1_vec = vectorizer.transform(df['q1_clean'])
q2_vec = vectorizer.transform(df['q2_clean'])
# cosine_similarity is not used in this section any more, but
# predict_similarity_ml relies on it, so the import stays.
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# Row-wise cosine computed in one vectorised step instead of one
# cosine_similarity() call per row: L2-normalise each row, multiply the two
# matrices element-wise and sum across each row. An all-zero row (a question
# with no known words) stays all-zero and therefore scores 0.
q1_unit = normalize(q1_vec)
q2_unit = normalize(q2_vec)
features['bow_cosine'] = np.asarray(q1_unit.multiply(q2_unit).sum(axis=1)).ravel()

# Labels plus a seeded 80/20 train/test split of the engineered features
y = df['is_duplicate']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest (fit() returns the estimator itself), then
# predict labels for the held-out rows
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report F1 and accuracy on the held-out rows
f1, acc = f1_score(y_test, y_pred), accuracy_score(y_test, y_pred)
print(f'F1 Score: {f1:.4f}')
print(f'Accuracy: {acc:.4f}')

# Prediction function
def predict_similarity_ml(q1, q2):
    """Classify a pair of questions with the trained model.

    Both questions are cleaned with ``clean_text`` and turned into the same
    four features used for training (Jaccard similarity, length difference,
    word-share ratio, bag-of-words cosine). The module-level ``vectorizer``
    and ``clf`` must already be fitted.

    Args:
        q1: First question as raw text.
        q2: Second question as raw text.

    Returns:
        A dict with ``'similarity_score'`` (the bag-of-words cosine
        similarity of the cleaned questions) and ``'predicted_label'``
        (the class predicted by ``clf``).
    """
    c1 = clean_text(q1)
    c2 = clean_text(q2)
    jacc = jaccard_similarity(c1, c2)
    length_diff = abs(len(c1) - len(c2))
    ws = word_share_ratio(c1, c2)
    # Vectorise both questions in one call; row 0 is q1 and row 1 is q2.
    bows = vectorizer.transform([c1, c2])
    bow_cos = cosine_similarity(bows[0], bows[1])[0, 0]
    # The classifier was fitted on a DataFrame with these column names.
    # Passing a named frame (rather than a bare ndarray) avoids scikit-learn's
    # feature-name mismatch warning and lets it verify the column order.
    feature_row = pd.DataFrame(
        [[jacc, length_diff, ws, bow_cos]],
        columns=['jaccard', 'len_diff', 'word_share', 'bow_cosine'],
    )
    pred = clf.predict(feature_row)[0]
    return {'similarity_score': bow_cos, 'predicted_label': pred}

# Example: score one hand-written question pair and show the result dict
sample_result = predict_similarity_ml(
    'How can I be a good geologist?',
    'What should I do to be a great geologist?',
)
print(sample_result)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


F1 Score: 0.5686
Accuracy: 0.6785
{'similarity_score': 0.4999999999999999, 'predicted_label': 1}


