In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn import linear_model
import numpy as np
import time
import sys
from sklearn.model_selection import train_test_split
import scipy
from sklearn.metrics import log_loss
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate
from string import digits
import re

In [None]:
# Training question pairs; the last expression previews the first rows.
df = pd.read_csv(
    "/content/drive/MyDrive/University/datasets2020/datasets/q2b/train.csv"
)
df.head()

Unnamed: 0,Id,Question1,Question2,IsDuplicate
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [None]:
# Question pairs to predict on (this file has no label column); preview the first rows.
df_test = pd.read_csv(
    "/content/drive/MyDrive/University/datasets2020/datasets/q2b/test_without_labels.csv"
)
df_test.head()

Unnamed: 0,Id,Question1,Question2
0,283003,What can someone do if they've lost the wirele...,What is the best USB wireless mouse that can b...
1,283004,Why India need to elect Prime minister?,Is prime minister of India elected or appointed?
2,283005,How can I make money online with free of cost?,How can I make money online for free?
3,283006,Does MDMA affect the first and higher order mo...,Do antipsychotics affect the first and higher ...
4,283007,"I am a Saudi National and have ""SR 3 million"" ...",Where should I invest money to get high returns?


In [None]:
def getBOW(df, min_df = 100,ngrams = 1,analyzer = 'word'):
  """Fit a TF-IDF vectorizer on every distinct question found in `df`.

  The 'Question1' and 'Question2' columns are stacked, de-duplicated and cast
  to unicode strings before fitting. The vocabulary size is printed.

  Args:
    df: frame with 'Question1' and 'Question2' columns.
    min_df: forwarded to TfidfVectorizer(min_df=...).
    ngrams: upper bound of the n-gram range; the range used is (1, ngrams).
    analyzer: forwarded to TfidfVectorizer(analyzer=...).

  Returns:
    The fitted TfidfVectorizer.
  """
  vectorizer = TfidfVectorizer(
      analyzer=analyzer,
      ngram_range=(1, ngrams),
      min_df=min_df,
      lowercase=True,
  )

  stacked_questions = pd.concat((df['Question1'], df['Question2']))
  unique_questions = pd.DataFrame(stacked_questions.unique(), columns=['Concatenated'])
  corpus = unique_questions['Concatenated'].values.astype('U')

  vectorizer.fit(corpus)
  print('Vocabulary size is {}'.format(len(vectorizer.vocabulary_)))
  return vectorizer

In [None]:
def blank_space(x):
  """Replace every run of characters other than ASCII letters/digits with one space."""
  non_alnum_run = re.compile('[^A-Za-z0-9]+')
  return non_alnum_run.sub(' ', x)

def numbers(x):
  """Delete every run of the digits 0-9 from the string `x`."""
  digit_run = re.compile(r'[0-9]+')
  return digit_run.sub('', x)

def standarize_sentence(x):
  """Cap every run of one repeated character at two occurrences.

  Example: "coooool" -> "cool". Strings without a run of three or more
  identical characters are returned unchanged.

  The previous implementation iterated over the string character by
  character, so its `[:2]` slice was applied to single characters and the
  function returned its input untouched.

  Args:
    x: the string to normalise.

  Returns:
    The string with runs of 3+ identical characters shortened to 2.
  """
  # (.)\1{2,} matches a character followed by at least two more copies of
  # itself; DOTALL lets runs of newlines be collapsed as well.
  return re.sub(r'(.)\1{2,}', r'\1\1', x, flags=re.DOTALL)

def apostrophe_words(x):
  """Expand common English contractions in `x`.

  Each suffix in the table below is replaced by its spelled-out form, e.g.
  "'m" -> " am" and "n't" -> " not". Every contraction type present in the
  string is expanded; the previous version returned after the first type that
  matched and left the others in place.

  Args:
    x: the string to expand.

  Returns:
    The string with all listed contractions replaced.
  """
  Apos_dict={"'s":" is","n't":" not","'m":" am","'ll":" will",
           "'d":" would","'ve":" have","'re":" are"}
  # str.replace is a no-op when the key is absent, so no membership test is needed.
  for key,value in Apos_dict.items():
      x = x.replace(key,value)
  return x

def split_words(x):
  """Split `x` around capitalised chunks and re-join the pieces with spaces.

  A chunk starts with an uppercase letter followed by at least one lowercase
  letter and extends up to the next uppercase letter. Empty pieces produced
  by the split are dropped before joining.
  """
  pieces = re.split("([A-Z][a-z]+[^A-Z]*)", x)
  non_empty = filter(None, pieces)
  return " ".join(non_empty)

def shallow_cleaning(df):
  """Normalise the 'Question1' and 'Question2' text columns of `df`.

  Each value is converted with str() and passed through, in order:
  apostrophe_words, blank_space, numbers, split_words, standarize_sentence;
  the result is then stripped and lower-cased.

  Contractions are expanded first: blank_space removes every apostrophe, so
  running apostrophe_words after it (as before) could never match anything.

  Args:
    df: frame with 'Question1' and 'Question2' columns. The two columns are
      overwritten on this same object.

  Returns:
    The same `df` object, with both columns cleaned.
  """
  steps = (apostrophe_words, blank_space, numbers, split_words, standarize_sentence)

  def _clean(value):
    # str() keeps the original handling of non-string cells.
    text = str(value)
    for step in steps:
      text = step(text)
    return text.strip().lower()

  for column in ('Question1', 'Question2'):
    df[column] = df[column].apply(_clean)
  return df

# Optional text cleaning, currently disabled: uncomment to clean both frames before training.
#df = shallow_cleaning(df)
#df_test = shallow_cleaning(df_test)
#df.head()
#df_test.head()

In [None]:
def create_file(data, prediction, filename):
  """Write an 'Id'/'Predicted' CSV and return the frame it was built from.

  Args:
    data: anything accepted by pd.DataFrame(); must provide an 'Id' column.
    prediction: values for the new 'Predicted' column, one per row.
    filename: output path without extension; ".csv" is appended.

  Returns:
    A new DataFrame holding `data` plus the 'Predicted' column.
  """
  # pd.DataFrame(data) does not copy a DataFrame input by default, so adding
  # the column directly can leak into the caller's frame. assign() works on a
  # copy and leaves `data` untouched.
  res_df = pd.DataFrame(data).assign(Predicted=prediction)
  res_df.to_csv(filename+".csv", columns=['Id', 'Predicted'], index=False)
  return res_df

# Shared, initially empty results table that the model cells pass to
# calculate_statistic_metrics; its column axis is labelled 'Statistic Measure'.
columnNum = 0
res_mean = pd.DataFrame([]).rename_axis('Statistic Measure', axis=1)
def calculate_statistic_metrics(res, res_mean, columnName, columnNum):
  """Add the mean cross-validation scores of one model as a column of `res_mean`.

  Args:
    res: mapping/frame with 'test_accuracy', 'test_precision_macro',
      'test_recall_macro' and 'test_f1_macro' entries that support .mean().
    res_mean: results table; a column named `columnName` is written into it
      in place, indexed by 'Accuracy', 'Precision', 'Recall', 'F-measure'.
    columnName: label of the column that receives the means.
    columnNum: a column carrying this label, if present, is renamed to
      `columnName` in the returned copy.

  Returns:
    A copy of the updated `res_mean`.
  """
  # Build the column in one step; DataFrame.append, used previously, no longer
  # exists in pandas 2.x.
  metric_means = pd.Series({
      'Accuracy': res['test_accuracy'].mean(),
      'Precision': res['test_precision_macro'].mean(),
      'Recall': res['test_recall_macro'].mean(),
      'F-measure': res['test_f1_macro'].mean(),
  })
  res_mean[columnName] = metric_means
  res_mean = res_mean.rename(columns={columnNum:columnName})
  return res_mean

**Logistic Regression**

In [None]:
def train_params(min_df = 100,ngrams = 1,analyzer = 'word',train = df, test=df_test):
  """Cross-validate, fit and apply a logistic-regression duplicate-question model.

  Both questions of a pair are TF-IDF encoded with one vectorizer fitted on
  `train` (via getBOW) and the two encodings are stacked side by side.

  Args:
    min_df, ngrams, analyzer: forwarded to getBOW.
    train: frame with 'Question1', 'Question2' and 'IsDuplicate' columns.
    test: frame with 'Question1' and 'Question2' columns; it is handed to
      create_file together with the predictions.

  Note:
    The `train`/`test` defaults are evaluated when this cell is run, so they
    refer to the `df`/`df_test` objects that exist at that moment.

  Side effects: prints progress, displays the metric table and the prediction
  frame, passes the global `res_mean` to calculate_statistic_metrics, and
  writes 'LGR_pred.csv' through create_file. Returns None.
  """
  start_time = time.time()
  print('BOW and logistic regression')
  # The vectorizer is fitted once on the training questions and reused for the
  # test set; refitting it on the same data would only repeat the work.
  BOW = getBOW(train, min_df=min_df,ngrams = ngrams,analyzer = analyzer)

  def _pair_features(frame):
    # Encode each question separately, then place the two blocks side by side.
    q1_trans = BOW.transform(frame['Question1'].values.astype('U'))
    q2_trans = BOW.transform(frame['Question2'].values.astype('U'))
    return scipy.sparse.hstack((q1_trans, q2_trans), format='csr')

  X = _pair_features(train)
  y = train['IsDuplicate'].values
  model = linear_model.LogisticRegression(n_jobs=4,solver = 'sag',max_iter = 1000)

  print('5-Fold Cross Validation')
  scoring=['accuracy','precision_macro', 'recall_macro', 'f1_macro']
  scores = cross_validate(model, X, y, cv=5, n_jobs=4, scoring=scoring)
  lgr_res = pd.DataFrame.from_dict(scores)
  res = calculate_statistic_metrics(lgr_res, res_mean, 'Logistic Regression', columnNum)
  display(res)

  print('fitting ...')
  model.fit(X,y)

  X_test = _pair_features(test)
  print('predicting ...')
  y_pred = model.predict(X_test)

  lgr_pred_df = create_file(test, y_pred, 'LGR_pred')

  end_time =time.time()
  print("total time elapsed is {}".format(end_time-start_time))

  display(lgr_pred_df)
  return

# Run the logistic-regression pipeline defined above with its default arguments.
train_params()

BOW and logistic regression
Vocabulary size is 3600
5-Fold Cross Validation


Statistic Measure,Logistic Regression
Accuracy,0.734816
Precision,0.719888
Recall,0.692977
F-measure,0.699943


fitting ...
Vocabulary size is 3600
predicting ...
total time elapsed is 49.208916664123535


Unnamed: 0,Id,Question1,Question2,Predicted
0,283003,What can someone do if they've lost the wirele...,What is the best USB wireless mouse that can b...,0
1,283004,Why India need to elect Prime minister?,Is prime minister of India elected or appointed?,1
2,283005,How can I make money online with free of cost?,How can I make money online for free?,1
3,283006,Does MDMA affect the first and higher order mo...,Do antipsychotics affect the first and higher ...,0
4,283007,"I am a Saudi National and have ""SR 3 million"" ...",Where should I invest money to get high returns?,0
...,...,...,...,...
121282,404285,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
121283,404286,Do you believe there is life after death?,Is it true that there is life after death?,1
121284,404287,What is one coin?,What's this coin?,0
121285,404288,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


**Linear SVM**

In [None]:
def train_params(min_df = 100,ngrams = 1,analyzer = 'word',train = df, test = df_test):
  """Cross-validate, fit and apply a linear SVM duplicate-question model.

  The model is an SGDClassifier with hinge loss and L2 penalty. Both questions
  of a pair are TF-IDF encoded with one vectorizer fitted on `train` (via
  getBOW) and the two encodings are stacked side by side.

  Args:
    min_df, ngrams, analyzer: forwarded to getBOW.
    train: frame with 'Question1', 'Question2' and 'IsDuplicate' columns.
    test: frame with 'Question1' and 'Question2' columns; it is handed to
      create_file together with the predictions.

  Note:
    The `train`/`test` defaults are evaluated when this cell is run, so they
    refer to the `df`/`df_test` objects that exist at that moment.

  Side effects: prints progress, displays the metric table and the prediction
  frame, passes the global `res_mean` to calculate_statistic_metrics, and
  writes 'SVM_pred.csv' through create_file. Returns None.
  """
  start_time = time.time()
  print('BOW and Linear SVM')
  # The vectorizer is fitted once on the training questions and reused for the
  # test set; refitting it on the same data would only repeat the work.
  BOW = getBOW(train, min_df=min_df,ngrams = ngrams,analyzer = analyzer)

  def _pair_features(frame):
    # Encode each question separately, then place the two blocks side by side.
    q1_trans = BOW.transform(frame['Question1'].values.astype('U'))
    q2_trans = BOW.transform(frame['Question2'].values.astype('U'))
    return scipy.sparse.hstack((q1_trans, q2_trans), format='csr')

  X = _pair_features(train)
  y = train['IsDuplicate'].values

  model = linear_model.SGDClassifier(n_jobs=4, penalty='l2', loss='hinge', random_state=42, max_iter = 1000)

  print('5-Fold Cross Validation')
  scoring=['accuracy','precision_macro', 'recall_macro', 'f1_macro']
  scores = cross_validate(model, X, y, cv=5, n_jobs=4, scoring=scoring)
  svm_res = pd.DataFrame.from_dict(scores)
  res = calculate_statistic_metrics(svm_res, res_mean, 'SVM', columnNum)
  display(res)

  print('fitting ...')
  model.fit(X,y)

  X_test = _pair_features(test)
  print('predicting ...')
  y_pred = model.predict(X_test)

  svm_pred_df = create_file(test, y_pred, 'SVM_pred')

  end_time = time.time()
  print("total time elapsed is {}".format(end_time-start_time))

  display(svm_pred_df)
  return

# Run the linear-SVM pipeline defined above with its default arguments.
train_params()

**XGBoost**

In [None]:
import xgboost as xgb

In [None]:
def train_params(min_df = 100,ngrams = 1,analyzer = 'word',train = df, test = df_test):
  """Cross-validate, fit and apply an XGBoost duplicate-question model.

  Both questions of a pair are TF-IDF encoded with one vectorizer fitted on
  `train` (via getBOW) and the two encodings are stacked side by side.

  Args:
    min_df, ngrams, analyzer: forwarded to getBOW.
    train: frame with 'Question1', 'Question2' and 'IsDuplicate' columns.
    test: frame with 'Question1' and 'Question2' columns; it is handed to
      create_file together with the predictions.

  Note:
    The `train`/`test` defaults are evaluated when this cell is run, so they
    refer to the `df`/`df_test` objects that exist at that moment.

  Side effects: prints progress, displays the metric table and the prediction
  frame, passes the global `res_mean` to calculate_statistic_metrics, and
  writes 'XGB_pred.csv' through create_file. Returns None.
  """
  start_time = time.time()
  print('BOW and XGBoost')
  # The vectorizer is fitted once on the training questions and reused for the
  # test set; refitting it on the same data would only repeat the work.
  BOW = getBOW(train, min_df=min_df,ngrams = ngrams,analyzer = analyzer)

  def _pair_features(frame):
    # Encode each question separately, then place the two blocks side by side.
    q1_trans = BOW.transform(frame['Question1'].values.astype('U'))
    q2_trans = BOW.transform(frame['Question2'].values.astype('U'))
    return scipy.sparse.hstack((q1_trans, q2_trans), format='csr')

  X = _pair_features(train)
  y = train['IsDuplicate'].values

  model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss',n_jobs=4)

  print('5-Fold Cross Validation')
  scoring=['accuracy','precision_macro', 'recall_macro', 'f1_macro']
  scores = cross_validate(model, X, y, cv=5, n_jobs=4, scoring=scoring)
  xgb_res = pd.DataFrame.from_dict(scores)
  res = calculate_statistic_metrics(xgb_res, res_mean, 'XGB', columnNum)
  display(res)

  print('fitting ...')
  model.fit(X,y)

  X_test = _pair_features(test)
  print('predicting ...')
  y_pred = model.predict(X_test)

  xgb_pred_df = create_file(test, y_pred, 'XGB_pred')

  end_time = time.time()
  print("total time elapsed is {}".format(end_time-start_time))

  display(xgb_pred_df)
  return

# Run the XGBoost pipeline defined above with its default arguments.
train_params()