# Basic Sentence Similarity Approach
### Quora Question Pair Duplicacy Analysis

>Load the basic libraries 

In [1]:
import pandas as pd
import numpy as np

> Load the training dataset

In [5]:
# Read the training data from train.csv in the current working directory.
train  = pd.read_csv('train.csv')
# pop() removes the 'is_duplicate' column from `train` and returns it, so `dup`
# holds the labels and `train` no longer contains them.
dup = train.pop('is_duplicate')

In [6]:
# Show the first rows of the training frame (labels were already popped into `dup`).
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?


## Cleaning of text

> lowercasing the text

> removing non-ascii characters (if any)

> removing punctuation

> removing stopwords

> Lemmatizing the words 

In [9]:
#cleaning the text
import nltk
import re, string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Patterns are compiled once here instead of being re-parsed on every call.
_NON_ASCII_RE = re.compile(r'[^\x00-\x7f]')
# re.escape is required: string.punctuation contains a backslash, ']', '^' and '-',
# all of which are special inside a character class. Without escaping, the
# backslash is consumed as an escape for the following ']' and is never matched,
# so backslashes survived the cleaning step.
_PUNCTUATION_RE = re.compile("[" + re.escape(string.punctuation) + "]")
# Filled on first use so the NLTK corpora are only touched when text is cleaned.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return (stop_words, lemmatize), building and caching them on first call."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatize'] = WordNetLemmatizer().lemmatize
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['lemmatize']


def clean_text(content):
    """Normalise one question string for bag-of-words comparison.

    Steps, in order: lowercase, replace non-ASCII characters with spaces,
    replace punctuation with spaces, split on whitespace, drop English stop
    words, drop one-character tokens, lemmatize each remaining token with
    WordNet, and join the result with single spaces.

    Parameters
    ----------
    content : object
        The raw text. Anything that is not a ``str`` (for example a float NaN
        standing in for a missing value) is treated as empty.

    Returns
    -------
    str
        The cleaned, space-joined tokens, or ``""`` for non-string input.
    """
    if not isinstance(content, str):
        return ""

    stop_words, lemmatize = _clean_text_resources()

    text = _NON_ASCII_RE.sub(' ', content.lower())
    text = _PUNCTUATION_RE.sub(' ', text)

    # Stop words are filtered before the length check, matching the original order.
    words = [word for word in text.split()
             if word not in stop_words and len(word) > 1]
    return " ".join(lemmatize(word) for word in words)

In [10]:
# Replace both question columns of the training frame with their cleaned text.
for column in ('question1', 'question2'):
    train[column] = train[column].map(clean_text)

## Preparing the TF-IDF matrix and Cosine similarity 

> TF = Term Frequency (frequency of word in a document)

> IDF = Inverse Document Frequency (log(N/n))
>>where N = total documents in a corpus, n = number of documents in corpus which contain the specific word 

> TF-IDF = TF*IDF

> TF-IDF matrix for generating a vector space [shape of matrix = (documents,words)]

> Cosine similarity between the documents, based on the dot product of their vectors: $\cos(\theta) = \dfrac{\vec{v}_1 \cdot \vec{v}_2}{|\vec{v}_1|\,|\vec{v}_2|}$

In [50]:
def tokenize(content):
    """Split already-cleaned text into tokens on runs of whitespace."""
    tokens = content.split()
    return tokens

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# The vectorizer is re-fit on every question pair, so the vocabulary and IDF weights
# come from just the two questions being compared, not from the whole corpus.
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize, analyzer='word', max_df=1.0, min_df=1)
cosine_vals = []

# Walk the two question columns positionally. Looking rows up with train.loc[id]
# only works when the index labels happen to equal the ids, and a mismatch used to
# be hidden by a bare `except:` that scored the pair as 0.
for question1, question2 in zip(train.question1, train.question2):
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([question1, question2])
    except ValueError:
        # fit_transform raises ValueError when the pair yields an empty vocabulary
        # (both questions are empty after cleaning); score such pairs as 0.
        cosine_vals.append(0.0)
        continue
    # Row 0 is question1 and row 1 is question2; compare just those two rows.
    cosine_vals.append(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])

## Analysis of  metrics 

In [37]:
from sklearn.metrics import accuracy_score, log_loss

# NOTE(review): `y_pred` is not defined in any cell shown in this notebook, so this
# cell fails on a fresh kernel. The recorded log loss (~11.62) is what hard 0/1
# predictions at ~66% accuracy would produce, so it was presumably a thresholded
# version of `cosine_vals` -- TODO restore the cell that builds it and confirm the
# threshold.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(accuracy_score(dup, y_pred))

print(log_loss(dup, y_pred))

0.663469291845
11.6235187115


## Testing time 

> Load the testing file

> Clean the text file

> TF-IDF and Cosine Similarity score 

In [38]:
# Read the test data from test.csv in the current working directory.
test = pd.read_csv('test.csv')

In [40]:
# Pull the id column out of the test frame (pop removes it from `test`), then
# replace both question columns with their cleaned text.
ids = test.pop('test_id')
for column in ('question1', 'question2'):
    test[column] = test[column].map(clean_text)

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# The vectorizer is re-fit on every question pair, so the vocabulary and IDF weights
# come from just the two questions being compared, not from the whole corpus.
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize, analyzer='word', max_df=1.0, min_df=1)
cosine_vals_test = []
# Walk ids and question columns positionally. Looking rows up with test.loc[id] only
# works when the index labels happen to equal the ids, and a mismatch used to be
# hidden by a bare `except:` that scored the pair as 0.
for i, question1, question2 in zip(ids, test.question1, test.question2):
    # Progress marker every 100000 ids; print(...) works on Python 2 and Python 3.
    if i % 100000 == 0:
        print(i)
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([question1, question2])
    except ValueError:
        # fit_transform raises ValueError when the pair yields an empty vocabulary
        # (both questions are empty after cleaning); score such pairs as 0.
        cosine_vals_test.append(0.0)
        continue
    # Row 0 is question1 and row 1 is question2; compare just those two rows.
    cosine_vals_test.append(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])
        

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000


## Generating the test file for submission

In [45]:
# Assemble the submission frame in one step, with the column order fixed by `columns`,
# and write it without the index column.
columns = ['test_id', 'is_duplicate']
df = pd.DataFrame({'test_id': ids, 'is_duplicate': cosine_vals_test}, columns=columns)

df.to_csv('submission.csv', index=False)

### This is the most basic approach for the problem:

#### Other approaches can be :

> Doc2Vec

> Semantic Similarity based on Lesk Word Sense Disambiguation Algorithm