In [1]:
import os
from sklearn.metrics import accuracy_score
from typing import final
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import re, os , sys
import warnings
warnings.filterwarnings('ignore')
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
import distance
import tqdm 
from sklearn.model_selection import train_test_split, GridSearchCV
import mlflow
import xgboost  as xgb
from nltk.corpus import stopwords
import spacy
from prefect import task, flow

stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your',
 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's",
 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
 "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do',
 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 
'each', 'few', 'more', 'most', 'other', 'some', 'such', 'nor', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn',
"needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]



def load_data(path:str)->pd.DataFrame:
    try:
        if os.path.isfile(path):
            df = pd.read_csv(path)
            df = df.head(50000)
            return df
        else:
            print('check tha data path')
    except Exception as e:
        print(e)
   
def null_check(data=pd.DataFrame, flags=str)->pd.DataFrame:
    try:
        df = data
        if df.isnull().sum().sum():
            nan = df[df.isnull().any(1)]
            print(nan)
            print('-'*50)
            if flags == 'del':
                clean_data = df.dropna()
            else:
                clean_data = df.fillna(flags)
            return clean_data
        else:
            
            print('data_set has non null values')
        return data
    except Exception as e:
        print(e)

# funation for text preprocessing (Expanding contractions )
def preprocess_text(x):
    x = str(x).lower()
    x = x.replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'")\
                           .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not")\
                           .replace("n't", " not").replace("what's", "what is").replace("it's", "it is")\
                           .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are")\
                           .replace("he's", "he is").replace("she's", "she is").replace("'s", " own")\
                           .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")\
                           .replace("€", " euro ").replace("'ll", " will")
    x = re.sub(r"([0-9]+)000000", r"\1m", x)
    x = re.sub(r"([0-9]+)000", r"\1k", x)
    x = re.sub(r"http\S+", "", x)
    x = re.sub('\W', ' ', x) 
    bfs = BeautifulSoup(x) # removing html tage form the text
    x = bfs.get_text()
    x = x.strip()
    return x

def data_cleaning(data):
    data['Cl_question1'] = data['question1'].apply(preprocess_text)
    data['Cl_question2'] = data['question2'].apply(preprocess_text)
    return data


# function removing stopword and stemming or lemmatizer

def removing_stopword(data):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    data = data.split()
    x = [lemmatizer.lemmatize(word) for word in data if word not in stopwords]
    x = ' '.join(x)
    x = x.strip()
    return x

# function for lemmitization

def lemm_data(data):
    try:

        lemm = pd.DataFrame()
        lemm['id'] = data['id']
        lemm['lemm_data_q1'] = data.question1.apply(removing_stopword)
        lemm['lemm_data_q2'] = data.question2.apply(removing_stopword)
        return lemm.merge(data[['id','qid1','qid2','is_duplicate']],how='left',on='id')
    except Exception as e:
        print(e)
    
def doesMatch (q, match):
    q1, q2 = q['question1'], q['question2']
    q1 = str(q1).split()
    q2 = str(q2).split()
    if len(q1)>0 and len(q2)>0 and q1[match]==q2[match]:
        return 1
    else:
        return 0
    
def get_longest_substr_ratio(a, b):
    strs = list(distance.lcsubstrings(a, b))
    if len(strs) == 0:
        return 0
    else:
        return len(strs[0]) / (min(len(a), len(b)) + 1)
    

def common_stop_words_ratio(q,value):
    q1_tokens =str( q.question1).split()
    q2_tokens = str(q.question2).split()
    
    #Get the stopwords in Questions
    q1_stops = set([word for word in q1_tokens if word in stopwords])
    q2_stops = set([word for word in q2_tokens if word in stopwords])
    
    common_stop_count = len(q1_stops.intersection(q2_stops))
    if value == 'min':
        token_features = common_stop_count / (min(len(q1_stops), len(q2_stops)) + 0.0001)
    elif value == 'max':
        token_features = common_stop_count / (max(len(q1_stops), len(q2_stops)) + 0.0001)
    return token_features

def feature_extract(data):
    try:
    

        print('feature_extraction_start.....')
        data['len_q1'] = data.question1.str.len()
        data['len_q2'] = data.question2.str.len()
        data['q1_word'] = data.question1.apply(lambda x: len(str(x).split(' ')))
        data['q2_word'] = data.question2.apply(lambda x: len(str(x).split(' ')))
        
        data['total_word'] = data['q1_word'] + data['q2_word']
        data['differ_word_num'] = abs(data['q1_word'] - data['q2_word'])
        data['same_first_word'] = data.apply(lambda x: doesMatch(x, 0) ,axis=1)
        data['same_last_word'] = data.apply(lambda x: doesMatch(x, -1) ,axis=1)
        data['total_unique_word'] = data.apply(lambda x: len(set(str(x.question1).split()).union(set(str(x.question2).split()))) ,axis=1)
        data['total_unique_word_withoutstopword_num'] = data.apply(lambda x: len(set(str(x.question1).split()).union(set(str(x.question2).split())) - set(stopwords)) ,axis=1)
        data['total_unique_word_num_ratio'] = data['total_unique_word'] / data['total_word']
        print('......')
        data['common_word'] = data.apply(lambda x: len(set(str(x.question1).split()).intersection(set(str(x.question2).split()))) ,axis=1)
        data['common_word_ratio'] = data['common_word'] / data['total_unique_word'] # word share
        data['word_share'] = data['common_word']/data['total_word']
        data['common_word_ratio_min'] = data['common_word'] / data.apply(lambda x: min(len(set(str(x.question1).split())), len(set(str(x.question2).split()))) ,axis=1) 
        data['common_word_ratio_max'] = data['common_word'] / data.apply(lambda x: max(len(set(str(x.question1).split())), len(set(str(x.question2).split()))) ,axis=1) 
        
        data['common_stop_word_ratio_min'] = common_stop_words_ratio(data,'min')
        data['common_stop_word_ratio_max'] = common_stop_words_ratio(data, 'max')
        
        data['common_word_withoutstopword'] = data.apply(lambda x: len(set(str(x.question1).split()).intersection(set(str(x.question2).split())) - set(stopwords)) ,axis=1)
        data['common_word_withoutstopword_ratio'] = data['common_word_withoutstopword'] / data['total_unique_word_withoutstopword_num']
        
        data['common_word_withoutstopword_ratio_min'] = data['common_word_withoutstopword'] / data.apply(lambda x: min(len(set(str(x.question1).split()) - set(stopwords)), len(set(str(x.question2).split()) - set(stopwords))) ,axis=1) 
        data['common_word_withoutstopword_ratio_max'] = data['common_word_withoutstopword'] / data.apply(lambda x: max(len(set(str(x.question1).split()) - set(stopwords)), len(set(str(x.question2).split()) - set(stopwords))) ,axis=1) 
        
        print('fuzzy features...')
        print('fuzz_ratio.....')
        data["fuzz_ratio"] = data.apply(lambda x: fuzz.ratio(str(x.question1), str(x.question2)), axis=1)
        
        print('fuzz_partial_ratio.....')
        data["fuzz_partial_ratio"] = data.apply(lambda x: fuzz.partial_ratio(str(x.question1), str(x.question2)), axis=1)
        
        print('fuzz_token_set_ratio.....')
        data["fuzz_token_set_ratio"] = data.apply(lambda x: fuzz.token_set_ratio(str(x.question1), str(x.question2)), axis=1)
        
        print('fuzz_token_sort_ratio.....')
        data["fuzz_token_sort_ratio"] = data.apply(lambda x: fuzz.token_sort_ratio(str(x.question1), str(x.question2)), axis=1)
        
        print('longest_substr_ratio.....')
        data["longest_substr_ratio"]  = data.apply(lambda x: get_longest_substr_ratio(str(x.question1), str(x.question2)), axis=1)
        data.fillna(0, inplace=True)
        return data
    except Exception as e:
        print(e)

#TF-ITF with word2vec

#TF-ITF with word2vec

def TF_word2vec_q1(data):
    try:
        df = pd.DataFrame()
        questions = list(data['question1']) + list(data['question2'])

        tfidf = TfidfVectorizer(lowercase=False,)
        tfidf.fit_transform(questions)

        # dict key:word and value:tf-idf score
        word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
        nlp = spacy.load('en_core_web_sm')


        vecs1 =[]
        # tqdm is used to print the progress bar
        print('vectorization start.....')
        for qu1 in tqdm.tqdm(list(data['question1'])):
            doc1 = nlp(qu1)
            #384 is the number of dimensions of vectors
            mean_vec1 = np.zeros([len(doc1), len(doc1[0].vector)]) 
            for word1 in doc1:
                #word2vec
                vec1 = word1.vector
                #fetch df score
                try:
                    idf = word2tfidf[str(word1)]
                except:
                    idf = 0

                # compute final vec
                mean_vec1 += vec1 * idf
            mean_vec1 = mean_vec1.mean(axis= 0)
            vecs1.append(mean_vec1)

        df['q1_feats_m'] = list(vecs1)
        df2_q1 = pd.DataFrame(df.q1_feats_m.values.tolist(), index= data.index)
        return df2_q1
    except Exception as e:
        print(e)

def TF_word2vec_q2(data):
    try:
        df = pd.DataFrame()
        questions = list(data['question1']) + list(data['question2'])

        tfidf = TfidfVectorizer(lowercase=False,)
        tfidf.fit_transform(questions)

        # dict key:word and value:tf-idf score
        word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
        nlp = spacy.load('en_core_web_sm')


        vecs2 =[]
        # tqdm is used to print the progress bar
        print('vectorization start.....')
        for qu1 in tqdm.tqdm(list(data['question2'])):
            doc1 = nlp(qu1)
            #384 is the number of dimensions of vectors
            mean_vec1 = np.zeros([len(doc1), len(doc1[0].vector)]) 
            for word1 in doc1:
                #word2vec
                vec1 = word1.vector
                #fetch df score
                try:
                    idf = word2tfidf[str(word1)]
                except:
                    idf = 0

                # compute final vec
                mean_vec1 += vec1 * idf
            mean_vec1 = mean_vec1.mean(axis= 0)
            vecs2.append(mean_vec1)

        df['q1_feats_m'] = list(vecs2)
        df2_q2 = pd.DataFrame(df.q1_feats_m.values.tolist(), index= data.index)
        return df2_q2
        # vecs2 = []
        # for qu2 in tqdm.tqdm(list(data['question2'])):
        #     doc2 = nlp(qu2) 
        #     mean_vec2 = np.zeros([len(doc2), len(doc2[0].vector)])
        #     for word2 in doc2:
        #         # word2vec
        #         vec2 = word2.vector
        #         # fetch df score
        #         try:
        #             idf = word2tfidf[str(word2)]
        #         except:
        #             #print word
        #             idf = 0
        #         # compute final vec
        #         mean_vec2 += vec2 * idf
        #     mean_vec2 = mean_vec2.mean(axis=0)
        #     vecs2.append(mean_vec2)
        # data['q2_feats_m'] = list(vecs2)

        df2_q2 = pd.DataFrame(data.q2_feats_m.values.tolist(), index= data.index)
    
        return df2_q1,df2_q2
    except Exception as e:
        print(e)


def marge_data(question1_vec,question2_vec,feature_data_pr):
    try:    
    
        df = pd.read_csv('./train.csv')
        question1_vec['id']=df['id']
        question2_vec['id']=df['id']

        df1  = question1_vec.merge(question2_vec, on='id',how='left')
        final  = feature_data_pr.merge(df1, on='id',how='left')
        return final
    except Exception as e:
        print(e)

def split_data(x,y,test_ratio):
    try:
        x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=test_ratio)
        return x_train,x_test,y_train,y_test
    except Exception as e:
        print(e)
        

def model_train(x_train,y_train,model:object, param:dict):

    mlflow.set_tracking_uri('sqlite:///mlflow5.db')
    mlflow.set_experiment('Quora_pair_question_problem5')
    mlflow.sklearn.autolog(max_tuning_runs=None)

    with mlflow.start_run():
        param_grid = param
        
        xg = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            cv = 5,
            scoring='neg_mean_squared_error',
            return_train_score= True,
            n_jobs = 1
        )
        xg.fit(x_train,y_train)
        
        #disabling autologging
        mlflow.sklearn.autolog(disable=True)
        print(xg.best_params_)
        t = xg.best_params_
    return model(learning_rate=t['learning_rate'],max_depth=t['max_depth'],min_child_weight=['min_child_weight'],n_estimators=t['n_estimators'])


def main(path:str,data_number:int):
    df_num = data_number
    # Define parameters
    Target_column = 'is_duplicate'
    data_path = path

    # load the data
    dataframe = load_data(data_path)
    dataframe = dataframe.head(df_num)

    # data cleaning
    clean_data = data_cleaning(dataframe)

    # feature extraction 
    extract_data = feature_extract(clean_data)

    # word2vec
    question_1 = TF_word2vec_q1(clean_data)
    question_2 = TF_word2vec_q2(clean_data)

    # merg dataset into one file
    final_data = marge_data(question_1,question_2, extract_data)
    final_data.drop(['qid1','qid2','question1','question2','Cl_question1','Cl_question2'],axis =1 , inplace=True)

    # Identify target varible
    input_data = final_data['is_duplicate']
    target_data =final_data


    # split the data into train and test
    x_train,x_test,y_train,y_test = split_data(target_data, input_data,test_ratio=0.25) 

    # model training
    param_ = {  
              'n_estimators': [40, 60, 80], 
              'max_depth': range(1, 4), 
              'learning_rate': [1e-3], 
              'min_child_weight': range(1, 4), 
             }
    model = model_train(x_train,y_train,xgb.XGBClassifier(),param=param_)
    y_pre = model.predict(x_test)
    print(accuracy_score(y_pre,y_test))


In [2]:
data = load_data('./train.csv')
data = data.head(500)

In [3]:
pre_data = data_cleaning(data)

In [4]:
extra_data = feature_extract(pre_data)

feature_extraction_start.....
......
fuzzy features...
fuzz_ratio.....
fuzz_partial_ratio.....
fuzz_token_set_ratio.....
fuzz_token_sort_ratio.....
longest_substr_ratio.....


In [5]:
extra_data.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,Cl_question1,Cl_question2,len_q1,len_q2,...,common_stop_word_ratio_max,common_word_withoutstopword,common_word_withoutstopword_ratio,common_word_withoutstopword_ratio_min,common_word_withoutstopword_ratio_max,fuzz_ratio,fuzz_partial_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,longest_substr_ratio
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,66,57,...,0.555552,5,0.625,0.833333,0.714286,93,98,100,93,0.965517
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...,51,88,...,0.555552,3,0.25,0.6,0.3,65,73,86,63,0.442308
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,73,59,...,0.555552,2,0.166667,0.333333,0.25,45,41,63,63,0.15
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,why am i mentally very lonely how can i solve it,find the remainder when math 23 24 math i...,50,65,...,0.555552,0,0.0,0.0,0.0,7,20,28,24,0.039216
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,which one dissolve in water quikly sugar salt...,which fish would survive in salt water,76,39,...,0.555552,1,0.0625,0.166667,0.090909,37,54,67,47,0.15


In [12]:
question_1 ['id']=extra_data['id']

In [13]:
question_1['id']

0        0
1        1
2        2
3        3
4        4
      ... 
495    495
496    496
497    497
498    498
499    499
Name: id, Length: 500, dtype: int64

In [6]:
clean_data = pre_data

In [7]:
question_1 = TF_word2vec_q1(clean_data)
question_2 = TF_word2vec_q2(clean_data)

vectorization start.....


100%|██████████| 500/500 [00:05<00:00, 89.71it/s] 


vectorization start.....


100%|██████████| 500/500 [00:05<00:00, 91.73it/s] 


In [8]:
 final_data = marge_data(question_1,question_2, extra_data)

In [10]:
final_data.merge(extra_data)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,Cl_question1,Cl_question2,len_q1,len_q2,...,86_y,87_y,88_y,89_y,90_y,91_y,92_y,93_y,94_y,95_y
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,66,57,...,23.454691,0.674020,-7.689485,8.017969,-40.982749,-44.132399,36.545455,26.094779,8.286586,-10.927646
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...,51,88,...,10.007542,2.172399,-21.943437,7.306729,-7.361206,-29.931989,-2.977271,11.206990,37.379291,-1.079838
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,73,59,...,14.705095,-13.560906,10.911867,26.596330,-48.137169,-50.337705,19.173741,-9.433019,15.220369,-0.265939
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,why am i mentally very lonely how can i solve it,find the remainder when math 23 24 math i...,50,65,...,0.103875,3.048119,9.530259,9.200311,-31.301418,-22.883158,-11.503366,-13.039456,9.274324,-33.010861
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,which one dissolve in water quikly sugar salt...,which fish would survive in salt water,76,39,...,6.619158,6.906724,-20.575831,14.353661,-7.385053,-20.996447,20.379570,-9.969032,19.916848,-0.429476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,495,988,989,What is the painting on this image?,What is this painting?,0,what is the painting on this image,what is this painting,35,22,...,4.943079,2.973088,-0.541268,9.095978,-6.569571,-9.005506,6.117695,-10.918294,6.973176,-4.480150
496,496,990,991,Which are the major highways in California and...,Which are the major highways in California and...,0,which are the major highways in california and...,which are the major highways in california and...,104,100,...,21.169208,10.416384,-29.275103,13.112648,-27.348557,-16.100905,1.341413,-12.765768,8.850809,1.817075
497,497,992,993,What's beyond our Universe?,"If space is expanding, where does the new spac...",0,what is beyond our universe,if space is expanding where does the new spac...,27,58,...,18.016417,30.344229,-5.382860,26.147936,-11.395241,-21.436577,2.073962,-22.975147,31.533559,-22.074005
498,498,994,995,Is growing of hair a physical or a chemical ch...,Can a bald person ever grow their hair back?,0,is growing of hair a physical or a chemical ch...,can a bald person ever grow their hair back,51,44,...,-8.986114,13.469428,15.255748,2.418254,-17.205735,-37.589467,0.772018,-11.253944,26.276028,-13.192669


In [None]:
final_data.head()
final_data.drop(['qid1','qid2','question1','question2','Cl_question1','Cl_question2'],axis =1 , inplace=True)

In [None]:
pd.options.display.max_columns = None

In [None]:
final_data.head(2)

In [None]:
final_data.info()

In [None]:
Target_column = 'is_duplicate'
input_data = final_data[Target_column]
target_data =final_data.drop(['is_duplicate'],axis= 1)


In [None]:
target_data.head()

In [None]:
x_train,x_test,y_train,y_test = split_data(target_data, input_data,test_ratio=0.25) 


In [None]:
param_ = {  
              'n_estimators': [40, 60, 80], 
              'max_depth': range(1, 4), 
              'learning_rate': [1e-3], 
              'min_child_weight': range(1, 4), 
             }
model = model_train(x_train,y_train,xgb.XGBClassifier(),param=param_)
y_pre = model.predict(x_test)
print(accuracy_score(y_pre,y_test))


In [None]:
import xgboost  as xgb

In [None]:
xg = xgb.XGBClassifier(learning_rate = 0.001)