## Part 2: Kaggle Competition

#### Import Packages

In [1]:
import pandas as pd
import gensim
import numpy as np

In [2]:
from collections import defaultdict
from gensim import corpora
from gensim import models
from gensim import similarities

In [3]:
import nltk
import re
from nltk.tokenize import word_tokenize #this package tokenize word by whitespace and punctuation
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet

#### Read and Merge Datasets

In [4]:
# Load the full message/response table from chatbot.tsv.
# The separator is a regular expression (newline or tab), used together with the python parsing engine.
tsv_read = pd.read_csv('chatbot.tsv', engine='python',sep='\n|\t',encoding="utf8") #read large tsv file
# tsv_read.shape

In [113]:
# Get the distinct messages and responses from the tsv file:
# each frame keeps one row per unique (id, text) pair found in tsv_read.
message=tsv_read[['message_id','message']].drop_duplicates()
responses=tsv_read[['response_id','response']].drop_duplicates()

In [114]:
# Training ratings. header=None makes pandas label the columns 0, 1, 2.
rating=pd.read_csv('aggregated-hw3-ratings.train.csv',encoding="utf8",header=None) # read from training dataset
rating.shape #(17373, 3)
# rating.head(5)

(17373, 3)

In [115]:
# Give the positional rating columns real names, then attach the message and
# response text to every rating row by id (left joins keep all rating rows).
rating = rating.rename(columns={0: "message_id", 1: "response_id", 2:'rating'})
rating_df = (
    rating
    .merge(message, how='left', on='message_id')
    .merge(responses, how='left', on='response_id')
)

In [None]:
# Get corpus for training set and full set

In [None]:
# One row per distinct response text (with a response_id for it).
# reset_index() renumbers the rows 0..n-1 and keeps the previous index as an 'index' column.
documents=responses[['response','response_id']].drop_duplicates('response').reset_index()
# documents
# len(documents)

In [None]:
# NOTE(review): this rebinds `responses` from the (response_id, response) frame to a
# Series of response texts. Cells that select columns from `responses` or merge on it
# will fail if they are re-run after this one; a separate name would avoid that.
responses=documents['response']

#### Pre-Processing functions to get bag of words

In [5]:
def lower_url(file):
    """Lower-case a text and strip URLs and emoji from it.

    Parameters
    ----------
    file : str
        Raw text of one message or response.

    Returns
    -------
    str
        The lower-cased text with every ``http://`` / ``https://`` URL and
        every character matched by the emoji pattern removed. Surrounding
        whitespace is left untouched.
    """
    file_lowered = file.lower()
    # Match a URL anywhere in the text and stop at the first whitespace, so a
    # link in the middle of a sentence is removed and the words after it are kept.
    file_url = re.sub(r'https?://\S+', '', file_lowered)
    # NOTE(review): the last range (U+24C2..U+1F251) is very wide and also covers
    # non-emoji characters such as CJK ideographs; narrow it if such text matters.
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', file_url)
# lower vocab and remove url and emoji

In [6]:
def remove_pn(tokens):
    """Return the tokens that consist only of letters, in their original order.

    Tokens containing digits or punctuation, and empty tokens, are dropped.
    """
    alphabetic = []
    for token in tokens:
        if token.isalpha():
            alphabetic.append(token)
    return alphabetic
# remove punctuations 

In [7]:
_STOPLIST_CACHE = {}


def _load_stoplist(path):
    """Read the stop words stored in *path* into a frozenset, once per path."""
    if path not in _STOPLIST_CACHE:
        # `with` closes the file handle even if reading fails.
        with open(path, "r") as f:
            _STOPLIST_CACHE[path] = frozenset(f.read().split())
    return _STOPLIST_CACHE[path]


def remove_stop(tokens, stoplist_path="stoplist.txt"):
    """Drop stop words from a list of tokens.

    Parameters
    ----------
    tokens : list of str
        Tokens of one document.
    stoplist_path : str, optional
        File holding the stop words. Assumed to be whitespace-separated
        (e.g. one word per line) -- TODO confirm against the actual file.

    Returns
    -------
    list of str
        The tokens that are not stop words, in their original order.

    Notes
    -----
    Tokens are compared against a set of whole words. Testing membership in the
    raw file text would be a substring test and would also drop any token that
    merely occurs inside a stop word (e.g. "he" inside "the").
    """
    stoplist = _load_stoplist(stoplist_path)
    return [w for w in tokens if w not in stoplist]
# remove stopwords

In [8]:
wnl = WordNetLemmatizer()
def get_pos(tag):
    """Map a part-of-speech tag to a WordNet POS constant.

    Only the leading letter of ``tag`` is inspected (presumably a Penn Treebank
    tag as produced by ``nltk.pos_tag`` -- verify against the caller):
    J -> adjective, V -> verb, N -> noun, R -> adverb. Any other tag falls
    back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return wordnet.NOUN
def lemma_token(tokens):
    """Lemmatize a list of tokens using their part-of-speech tags.

    The tokens are tagged with ``nltk.pos_tag``; each tag is converted with
    ``get_pos`` and passed to the shared WordNet lemmatizer ``wnl``.
    Returns the lemmas in the same order as the input tokens.
    """
    return [
        wnl.lemmatize(word, pos=get_pos(tag))
        for word, tag in nltk.pos_tag(tokens)
    ]
# lemmatize the bag of words

#### Get Index for train documents

In [None]:
# Build a bag-of-words corpus from `test_documents`.
# NOTE(review): `stoplist` and `test_documents` are not assigned at top level in any
# visible cell (`stoplist` is only a local inside remove_stop), so this cell depends on
# state from elsewhere -- confirm where they come from before re-running.
# Each document is lower-cased, split on whitespace and filtered against `stoplist`.
texts_doc = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in test_documents
]

# remove words that appear only once
# (counts are taken over the whole collection, not per document)
frequency_test = defaultdict(int)
for text in texts_doc:
    for token in text:
        frequency_test[token] += 1

texts_doc = [
    [token for token in text if frequency_test[token] > 1]
    for text in texts_doc
]

# Map tokens to integer ids and convert every document to (token_id, count) pairs.
dictionary = corpora.Dictionary(texts_doc)
corpus = [dictionary.doc2bow(text) for text in texts_doc]

In [None]:
# Fit a 2-topic LSI model on the bag-of-words corpus built from `texts_doc`.
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

#### Get Index for all documents
This section uses the whole document collection to get ranked responses. The results turned out to be poor.

In [None]:
# Build a bag-of-words corpus from the token lists in `cleandb`.
# NOTE(review): `cleandb` is only assigned in cells further down the notebook, so this
# cell does not run on a fresh kernel in top-to-bottom order.
# Count how often each token occurs across all documents.
frequency = defaultdict(int)
for text in cleandb:
    for token in text:
        frequency[token] += 1

# Keep only tokens that occur more than once in the collection.
texts = [
    [token for token in text if frequency[token] > 1]
    for text in cleandb
]

# This rebinds `dictionary`, replacing any dictionary built by an earlier cell.
dictionary = corpora.Dictionary(texts)
corpus_all = [dictionary.doc2bow(text) for text in texts]

In [None]:
# `models` is already imported in the import cells at the top; this import is redundant.
from gensim import models
# Fit a 3-topic LSI model on `corpus_all`; this rebinds `lsi`.
lsi = models.LsiModel(corpus_all, id2word=dictionary, num_topics=3)

In [None]:
# For every test message, rank all documents by LSI similarity and keep the top 10.
# NOTE(review): `index` is never assigned at top level in the visible cells (only inside
# get_corpus), and `testing` is loaded in a later cell -- this cell relies on hidden state.
# NOTE(review): the query is only lower-cased and whitespace-split, while `dictionary`
# here is built from `cleandb` tokens -- confirm the two tokenizations agree.
response_id=[]
for i in testing['messagd_id'].unique():
        query=message[message['message_id']==i].message.values[0]
        vec_bow = dictionary.doc2bow(query.lower().split())
        vec_lsi = lsi[vec_bow]  # convert the query to LSI space
        sims = index[vec_lsi] 
        # sort document positions by descending similarity
        newsim = sorted(enumerate(sims), key=lambda item: -item[1])
        for doc in newsim[0:10]:
            # record the (message_id, response_id) pair; doc[0] is a row position in `documents`
            response_id.append((i,documents.iloc[doc[0]].response_id))
        print("finished get documents for query")

In [None]:
# Collect the (message_id, response_id) pairs into a two-column frame.
df = pd.DataFrame(response_id, columns=["message_id", "response_id"])

In [None]:
# df.to_csv('kaggle_2.csv',index=False)

#### Get testing data

In [9]:
# Load the test file and collect the distinct message ids to predict for.
# NOTE(review): the column is spelled 'messagd_id' here -- presumably matching the CSV header; confirm.
testing=pd.read_csv('aggregated-hw3-rating.test.csv',encoding="utf8")
message_list=testing['messagd_id'].unique() #get all messages in test file

#### Testing BM25

In [10]:
from rank_bm25 import BM25Okapi

In [11]:
# Dry run on the first test message: clean its candidate responses step by step
# (lower-case / strip URLs and emoji -> tokenize -> keep alphabetic tokens ->
# drop stop words -> lemmatize) and split the raw message text on single spaces.
i=message_list[0]
message_db=tsv_read[tsv_read['message_id']==i]
test_response=message_db['response']
lowered_doc=list(map(lower_url, test_response))
tokened_doc=list(map(word_tokenize, lowered_doc))
removed=list(map(remove_pn, tokened_doc))
cleandb=list(map(remove_stop, removed))
pure_db=list(map(lemma_token, cleandb))
query=message_db['message'].unique()[0]
tokenized_query = query.split(" ")

In [30]:
# Dry run of BM25 on the single message prepared above.
# The bare `pure_db` below is not the cell's last expression, so it displays nothing.
pure_db
bm25=BM25Okapi(pure_db)
# len(test_response)
# Ask for the 10 best-scoring response texts for the query.
result=bm25.get_top_n(tokenized_query, test_response.to_list(), n=10)
for doc in result:
#         print(doc)
    # Look the response id up by text equality; the value is overwritten on each pass.
    # NOTE(review): `id` shadows the Python builtin of the same name.
    id=message_db[message_db['response']==doc].response_id.values[0]

In [None]:
# Display the rows of the message currently held in `message_db`.
message_db

In [None]:
# Rank every test message's candidate responses with BM25 and keep the best ones.
TOP_N = 10          # predictions kept per message
FALLBACK_SIZE = 50  # random candidates drawn when a message has too few responses
SEED = 42           # seeds the fallback sampling so a re-run gives the same submission


def bm25_tokens(text):
    """Run one text through the notebook's cleaning pipeline and return its tokens."""
    tokens = word_tokenize(lower_url(text))
    return lemma_token(remove_stop(remove_pn(tokens)))


bm_25result=[]
# Pool of distinct responses used for the fallback; built once instead of per message.
unique_responses=tsv_read[['response','response_id']].drop_duplicates('response_id')
fallback_rng=np.random.RandomState(SEED)

for i in message_list:
    message_db=tsv_read[tsv_read['message_id']==i]
    query=message_db['message'].unique()[0]
    # Clean the query exactly like the responses: the corpus tokens are lower-cased
    # and lemmatized, so a raw split of the query would miss most matches.
    tokenized_query = bm25_tokens(query)
    if len(message_db)<TOP_N:
        # Too few candidates for this message: rank a random sample of all responses.
        print("not enough")
        message_db=unique_responses.sample(
            n=min(FALLBACK_SIZE, len(unique_responses)),
            random_state=fallback_rng,
        )
    test_response=message_db['response']
    pure_db=[bm25_tokens(text) for text in test_response]
    bm25=BM25Okapi(pure_db)
    # Rank the response ids directly (they are aligned row-for-row with pure_db).
    # Looking ids up by response text would map identical texts to the same id.
    top_ids=bm25.get_top_n(tokenized_query, message_db['response_id'].to_list(), n=TOP_N)
    bm_25result.extend((i, rid) for rid in top_ids)
    print('finishing getting query')

In [None]:
# Display the collected (message_id, response_id) pairs.
# NOTE(review): this prints the whole list; a slice such as bm_25result[:10] keeps the output short.
bm_25result

In [14]:
# Turn the BM25 (message_id, response_id) pairs into a two-column frame.
df_25 = pd.DataFrame(bm_25result, columns=["message_id", "response_id"])

In [16]:
# df_25.to_csv('kaggle_1116.csv',index=False)

#### Use Gensim-similarity query to predict

In [84]:
def get_corpus(test_response,query,num_topics=3):
    """Rank tokenized documents by LSI similarity to a query.

    Despite its name, this function does not return a corpus: it builds a
    dictionary, a bag-of-words corpus and an LSI model from ``test_response``
    and returns the similarity ranking of those documents for ``query``.

    Parameters
    ----------
    test_response : list of list of str
        One token list per document. It is iterated more than once, so it must
        not be a one-shot iterator.
    query : str
        Raw query text. It is only lower-cased and split on whitespace here.
        NOTE(review): callers pass cleaned/lemmatized documents, so query words
        match only when they already equal the cleaned tokens.
    num_topics : int, optional
        Number of LSI topics (default 3).

    Returns
    -------
    list of (int, float)
        ``(position, similarity)`` pairs sorted by descending similarity, where
        ``position`` is the document's index in ``test_response``. When no token
        survives the frequency filter, every document is returned in its
        original order with similarity 0.0.
    """
    # Count token occurrences over the whole collection.
    frequency_test = defaultdict(int)
    for text in test_response:
        for token in text:
            frequency_test[token] += 1

    # remove words that appear only once
    texts_doc = [
        [token for token in text if frequency_test[token] > 1]
        for text in test_response
    ]

    # With very small inputs (e.g. a single short response) the filter can leave
    # no vocabulary at all; there is nothing to fit a model on, so fall back to
    # the original order instead of failing inside gensim.
    if not any(texts_doc):
        return [(position, 0.0) for position in range(len(texts_doc))]

    dictionary = corpora.Dictionary(texts_doc)
    corpus = [dictionary.doc2bow(text) for text in texts_doc]
    lsi_new = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)
    index = similarities.MatrixSimilarity(lsi_new[corpus])
    vec_bow = dictionary.doc2bow(query.lower().split())
    vec_lsi = lsi_new[vec_bow]  # convert the query to LSI space
    sims_1 = index[vec_lsi]
    # Sort document positions from most to least similar.
    newsim = sorted(enumerate(sims_1), key=lambda item: -item[1])

    return newsim

In [58]:
# Pull the rows for one specific test message (position 268 in message_list) for inspection.
message_db=tsv_read[tsv_read['message_id']==message_list[268]].reset_index(drop=True)

In [59]:
# Display the rows selected for that message.
message_db

Unnamed: 0,message_id,message,response_id,response
0,dtncx79,Is the second test goint to be option 2? I thi...,dtndoho,Where is this second test? I’m confused


In [None]:
# Rank every test message's responses with the LSI similarity from get_corpus
# and keep the 10 best.

def lsi_tokens(text):
    """Clean one text with the notebook's pipeline and return its tokens."""
    return lemma_token(remove_stop(remove_pn(word_tokenize(lower_url(text)))))


new_result=[]
# Fallback candidate pool (every distinct response); built and tokenized at most once.
fallback_db=None
fallback_tokens=None

for i in message_list:
    message_db=tsv_read[tsv_read['message_id']==i].reset_index(drop=True)
    query=message_db['message'].unique()[0]
    if len(message_db)<10:
        # Too few candidates: rank the whole response pool instead.
        # The frame and its token lists are swapped together, because the positions
        # returned by get_corpus are used below as row positions in message_db.
        # NOTE(review): this fits an LSI model on the full pool for each such
        # message, which can be slow on a large file.
        if fallback_db is None:
            fallback_db=tsv_read[['response','response_id']].drop_duplicates('response_id').reset_index(drop=True)
            fallback_tokens=[lsi_tokens(text) for text in fallback_db['response']]
        message_db=fallback_db
        pure_db=fallback_tokens
    else:
        pure_db=[lsi_tokens(text) for text in message_db['response']]
    test_response=message_db['response']
    sim=get_corpus(pure_db,query)
    for doc in sim[0:10]:
        # doc is (row position, similarity)
        new_result.append((i,message_db.iloc[doc[0]].response_id))
    print('finish getting query')

In [95]:
# Turn the LSI (message_id, response_id) pairs into a two-column frame; this rebinds `df`.
df = pd.DataFrame(new_result, columns=["message_id", "response_id"])

#### Get prediction of result

In [103]:
# Write the predictions to a CSV file without the index column.
df.to_csv('kaggle_1114.csv',index=False)

In [None]:
# test result