In [1]:
#Steps
#1: import all the required libraries
#2: import the scikit-learn tools used below (text vectorizers and pairwise similarity functions)
#3: load the data from JSON
#4: process and clean the data using a few regex rules
#5: define the TF-IDF model and feed all the data into it
#6: use the TF-IDF vectorizer to turn the text into feature weights
#7: classify sentences using the feature weights
#8: use a cosine similarity matrix to calculate and predict the next words using the feature weights
#9: create a user interface to get user inputs
#Based on all of the above steps, the autocompleter recognizes the closest sentences and ranks 3 final proposals.

In [2]:
#Importing all the required libraries
import json
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import re

In [3]:
#Import the scikit-learn text vectorizers and pairwise similarity functions used below
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances


In [4]:
#loading json data
DATA_DIR = './'

def load_df(json_path='name.json'):
    """Load the conversations JSON file and flatten it to one row per message.

    The file is read from ``DATA_DIR + json_path``. Its ``Issues`` column must
    hold one record per conversation, each carrying a ``Messages`` list of
    message dicts.

    Parameters
    ----------
    json_path : str
        File name, appended to ``DATA_DIR``, of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        One row per message: the keys of every message dict plus an ``index``
        column holding the position of the conversation the message belongs
        to. The shape of the frame is printed before it is returned.
    """
    df = pd.read_json(DATA_DIR+json_path)

    # Read the messages straight from each issue record. This avoids the
    # `pandas.io.json.json_normalize` import (deprecated since pandas 1.0) and
    # the normalize/rename/merge round trip that only served to reach this key.
    records = [
        dict(message, index=issue_position)
        for issue_position, issue in enumerate(df['Issues'].tolist())
        for message in issue['Messages']
    ]
    df = pd.DataFrame(records)

    print(df.shape)
    return df


In [5]:
#process the data
def splitDataFrameList(df,target_column,separator):
    """Split one text column on a separator, producing one row per fragment.

    Every row of ``df`` is repeated once for each piece obtained from
    ``row[target_column].split(separator)``; all other columns are copied
    unchanged. The separator itself is dropped and empty fragments are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; ``target_column`` must contain strings.
    target_column : str
        Name of the column to split.
    separator : str
        Literal separator passed to ``str.split``.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh default index and the same columns as ``df``
        (also when ``df`` has no rows).
    """
    new_rows = []
    # Iterate over plain records instead of collecting rows through a
    # side-effecting df.apply: apply gives no guarantee about how often the
    # callback runs per row, which can duplicate rows in the accumulator.
    for record in df.to_dict('records'):
        for fragment in record[target_column].split(separator):
            new_rows.append({**record, target_column: fragment})

    # Passing the columns explicitly keeps the schema even for an empty result.
    return pd.DataFrame(new_rows, columns=df.columns)


In [6]:
#cleaning your data using a few regex rules
class Autocompleter:
    """Loads the conversation corpus and turns it into a table of candidate sentences."""

    def __init__(self):
        pass

    def import_json(self, json_filename):
        """Load the corpus through ``load_df`` and return the resulting DataFrame."""
        print("load json file...")
        df = load_df(json_filename)
        return df

    @staticmethod
    def _clean_sentence(text):
        """Normalise whitespace, punctuation spacing and capitalisation of one sentence."""
        text = " ".join(text.split())
        text = text.strip(".")
        # Stripping dots can expose new leading/trailing blanks; collapse again.
        text = " ".join(text.split())
        text = text.replace(' i ', ' I ')
        text = text.replace(' ?', '?')
        text = text.replace(' !', '!')
        text = text.replace(' .', '.')
        text = text.replace('OK', 'Ok')
        # Slicing instead of text[0] so an empty fragment does not raise IndexError.
        text = text[:1].upper() + text[1:]
        # The '? ' separator is dropped when splitting, so put the question mark
        # back on sentences that start like a question.
        if re.search(r'^(Wh|How).+([^?])$', text):
            text = text + "?"
        return text

    def process_data(self, new_df):
        """Build the table of candidate sentences from the raw message table.

        Keeps the rows whose ``IsFromCustomer`` is False, splits ``Text`` on
        sentence punctuation, cleans every fragment, drops fragments of two
        words or fewer, counts how often each sentence occurs and keeps one
        row per distinct sentence.

        Parameters
        ----------
        new_df : pandas.DataFrame
            Message table with at least ``IsFromCustomer`` and ``Text`` columns.

        Returns
        -------
        pandas.DataFrame
            The input columns plus ``nb_words`` and ``Counts``, with a fresh
            default index. Its shape is printed before returning.
        """
        print("select representative threads...")
        new_df = new_df[new_df.IsFromCustomer==False]

        print("split sentenses on punctuation...")
        for sep in ['. ',', ','? ', '! ', '; ']:
            new_df = splitDataFrameList(new_df, 'Text', sep)

        print("Text Cleaning using simple regex...")
        new_df['Text'] = new_df['Text'].apply(self._clean_sentence)

        print("calculate nb words of sentenses...")
        new_df['nb_words'] = new_df['Text'].apply(lambda x: len(str(x).split(' ')))
        # Explicit copy: the next step adds a column, which must not be written
        # onto a filtered view of the previous frame.
        new_df = new_df[new_df['nb_words']>2].copy()

        print("count occurence of sentenses...")
        new_df['Counts'] = new_df.groupby(['Text'])['Text'].transform('count')

        print("remove duplicates (keep last)...")
        new_df = new_df.drop_duplicates(subset=['Text'], keep='last')

        new_df = new_df.reset_index(drop=True)
        print(new_df.shape)

        return new_df
    

In [7]:
# Fit the TF-IDF model on the sentences and return it with the resulting matrix.
def calc_matrice(self, df):
    """Fit a word-level TF-IDF model (1- to 5-grams) on ``df['Text']``.

    Parameters
    ----------
    self : object
        Unused; kept so the function can be attached to the Autocompleter class.
    df : pandas.DataFrame
        Frame with a ``Text`` column of sentences.

    Returns
    -------
    tuple
        ``(model_tf, tfidf_matrice)``: the fitted vectorizer and the matrix
        returned by ``fit_transform``. The matrix shape is printed.
    """
    # min_df=1 keeps every term, exactly like the former integer min_df=0,
    # but is accepted by scikit-learn's parameter validation, which requires
    # an integer min_df to be at least 1.
    model_tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), min_df=1)
    tfidf_matrice = model_tf.fit_transform(df['Text'])
    print("tfidf_matrice ", tfidf_matrice.shape)
    return model_tf, tfidf_matrice

In [8]:
#generate the similarity scores and use them to rank the proposed completions
def generate_completions(self, prefix_string, data, model_tf, tfidf_matrice,
                         n_candidates=10, n_results=3):
    """Return the sentences of ``data`` that best match ``prefix_string``.

    The prefix is vectorized with ``model_tf`` and compared with every row of
    ``tfidf_matrice`` through ``linear_kernel``. The ``n_candidates`` most
    similar rows are re-scored by multiplying their similarity with
    ``1 + log1p(Counts)`` so frequent sentences are favoured, and the
    ``n_results`` best are returned.

    Parameters
    ----------
    self : object
        Unused; kept so the function can be attached to the Autocompleter class.
    prefix_string : object
        Text typed so far; converted with ``str``.
    data : pandas.DataFrame
        Sentence table with ``Text`` and ``Counts`` columns, row-aligned with
        ``tfidf_matrice``.
    model_tf : object
        Fitted vectorizer exposing ``transform``.
    tfidf_matrice : matrix
        Matrix produced by fitting ``model_tf`` on ``data['Text']``.
    n_candidates : int, optional
        Number of most similar rows kept before frequency weighting (default 10).
    n_results : int, optional
        Maximum number of completions returned (default 3).

    Returns
    -------
    list of str
        At most ``n_results`` sentences, best first. Rows with zero similarity
        are never proposed, so the list is empty when nothing matches.
    """
    prefix_string = str(prefix_string)
    new_df = data.reset_index(drop=True)
    weights = 1 + np.log1p(new_df['Counts'].to_numpy(dtype=float))

    # transform the string using the tfidf model
    tfidf_matrice_spelling = model_tf.transform([prefix_string])
    # one similarity score per corpus row, flattened to a 1-D array
    similarity = linear_kernel(tfidf_matrice, tfidf_matrice_spelling).ravel()

    # Most similar rows first; the stable sort keeps corpus order between ties.
    candidates = np.argsort(-similarity, kind='stable')[:n_candidates]
    # Rows sharing nothing with the prefix are noise, not completions.
    candidates = candidates[similarity[candidates] > 0]

    # add weight to the potential results that had high frequency in orig data
    weighted = similarity[candidates] * weights[candidates]
    best = candidates[np.argsort(-weighted, kind='stable')[:n_results]]

    return new_df.loc[best, 'Text'].tolist()

In [9]:
# Instantiate the Autocompleter provided by the `autocompleter` module.
# NOTE(review): the object used from here on comes from that module, not from
# the class and functions defined in the cells above; confirm both are in sync.
import autocompleter 
autocompl = autocompleter.Autocompleter()

In [10]:
# Load the conversation corpus, then display its shape and column names
df = autocompl.import_json("sample_conversations.json")
df.shape, df.columns

load json file...
(22264, 3)


((22264, 3), Index(['IsFromCustomer', 'Text', 'index'], dtype='object'))

In [11]:
# First 10 rows of the corpus as loaded (process_data has not been applied to df yet)
df.head(10) 

Unnamed: 0,IsFromCustomer,Text,index
0,True,Hi! I placed an order on your website and I ca...,0
1,True,I think I used my email address to log in.,0
2,True,My battery exploded!,1
3,True,"It's on fire, it's melting the carpet!",1
4,True,What should I do!,1
5,True,I'm interested in upgrading my plan.,2
6,True,Can you tell me a bit about Prime?,2
7,True,"My friend has it, and it seems like a great deal",2
8,True,Hello,3
9,False,Hello Werner how may I help you today?,3


In [12]:
# Run the processing step on the loaded corpus, then display the shape and
# column names of the result
new_df = autocompl.process_data(df)
new_df.shape, new_df.columns

select representative threads...
split sentenses on punctuation...
Text Cleaning using simple regex...
calculate nb words of sentenses...
count occurence of sentenses...
remove duplicates (keep last)...
(8601, 5)


((8601, 5),
 Index(['IsFromCustomer', 'Text', 'index', 'nb_words', 'Counts'], dtype='object'))

In [13]:
# Build the model and matrix from the processed sentences; both are passed to
# generate_completions in the cells below
model_tf, tfidf_matrice = autocompl.calc_matrice(new_df)

tfidf_matrice  (8601, 99656)


In [16]:
# Read a prefix typed by the user and display the completions proposed for it.
# NOTE(review): input() makes this cell interactive, so its output depends on
# what is typed when the cell runs.
prefix = input()
autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)

let me


['Let me investigate', 'Let me assist you', 'Let me look']

In [19]:
# Ask for another prefix and display its completions (interactive cell)
prefix = input()
autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)

what can 


['What can I help you with today?',
 'Let me see what I can do',
 'I will see what I can do']

In [20]:
# Ask for a third prefix and display its completions (interactive cell)
prefix = input()
autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)

do you need


['Do you need any more help?',
 'Do you need assistance in doing this?',
 'Which do you prefer?']