# Let's build a "next word predictor" using text messages

2/18/17 - Z. W. Miller

We'll build the predictor from the UCI SMS Spam Collection dataset: https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
import math
%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8"
# and later releases dropped the old names. Use the original name when it is
# still listed, otherwise fall back to the renamed one, so this cell runs on
# both old and new Matplotlib versions.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

In [2]:
import sklearn
import matplotlib
import sys
import scipy

# Record the interpreter and library versions this notebook was run with.
# Each entry pairs a display name with the already-imported module object.
libraries = (
    ('Matplotlib', matplotlib),
    ('Numpy', np),
    ('Pandas', pd),
    ('Scipy', scipy),
    ('Sklearn', sklearn),
)

print("Python Version:", sys.version, '\n')
for label, module in libraries:
    # Every module listed above exposes its version as `__version__`.
    print('{0} Version: {1}'.format(label, module.__version__))

Python Version: 3.6.2 |Anaconda custom (64-bit)| (default, Sep 21 2017, 18:29:43) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] 

Matplotlib Version: 2.0.2
Numpy Version: 1.12.1
Pandas Version: 0.20.3
Scipy Version: 0.19.1
Sklearn Version: 0.19.0


In [44]:
import numpy as np
from collections import defaultdict
from collections import Counter

class next_word_predictor:
    """
    N-gram based "next word" recommender.

    The text is cleaned and split into words, then every run of `ngram`
    consecutive words is used as a key that maps to a count of each word
    seen immediately after it. Recommendations for a new piece of text are
    the most frequent followers of its last `ngram` words, padded with the
    most common words of the whole text when fewer than three are known.
    """

    # Characters deleted from the text before it is split into words.
    _STRIP_TABLE = str.maketrans("", "", "\"'/.!,?")
    # Number of recommendations returned / fallback words kept.
    _NUM_RECS = 3

    def __init__(self, text, from_file=True, ngram=2):
        """
        Stores the configuration and immediately builds the model.
        ---
        Inputs
            text: path to a text file if from_file is True, otherwise the
                  raw text itself (str)
            from_file: whether `text` is a path to read from (bool)
            ngram: number of preceding words used as a key (int >= 1)
        Raises
            TypeError: if `text` is not a string
            ValueError: if `ngram` is smaller than 1, or the text has fewer
                        than ngram+1 words
        """
        self.ngram = int(ngram)
        if self.ngram < 1:
            raise ValueError("'ngram' must be a positive integer")
        self.keys = dict()
        self._from_file = from_file
        if not isinstance(text, str):
            raise TypeError("'text' must be a PATH or string object")
        if from_file:
            self.path = text
        else:
            self.raw = text
        self.text_as_list = None
        self.create_probability_object()

    def preprocess(self):
        """
        Opens and cleans the text to be learned from. If self._from_file, it
        reads from the path provided. The cleaning is very minor: lowercasing
        and removing quotes, slashes and basic punctuation (see `clean`).
        Creates a list of words from the text in self.text_as_list.
        """
        if self._from_file:
            with open(self.path, 'r') as f:
                self.raw = f.read()
        self.text_as_list = self.clean(self.raw)

    def group_generator(self, text_as_list):
        """
        Generator that creates the word groupings used to build the model.
        Each grouping is ngram words followed by the word that came next,
        so it has length self.ngram + 1.
        ---
        Inputs
            text_as_list: the text after preprocessing (list)
        Outputs
            groups: word groupings of length self.ngram + 1 (tuple)
        Raises
            ValueError: if the text has fewer than ngram+1 words. Because
                        this is a generator, the error surfaces when
                        iteration starts.
        """
        if len(text_as_list) < self.ngram + 1:
            raise ValueError("NOT A LONG ENOUGH TEXT!")

        for i in range(self.ngram, len(text_as_list)):
            yield tuple(text_as_list[i - self.ngram:i + 1])

    def create_probability_object(self):
        """
        Steps through the text, pulling keys out and counting how often
        each word follows each key. The result is stored in self.keys as
        {key tuple: {following word: count}}. Also stores the three most
        common words of the text in self.common_words as a fallback.

        The counts are rebuilt from scratch on every call, so calling this
        more than once does not inflate them.
        """
        if not self.text_as_list:
            self.preprocess()

        # Build into a local dict and publish it at the end so a failure
        # part-way through never leaves a half-filled model behind.
        counts = {}
        for group in self.group_generator(self.text_as_list):
            *context, follower = group
            followers = counts.setdefault(tuple(context), {})
            followers[follower] = followers.get(follower, 0) + 1
        self.keys = counts

        most_common = Counter(self.text_as_list).most_common(self._NUM_RECS)
        self.common_words = [word for word, _ in most_common]

    def clean(self, txt):
        """
        Lowercases the text, deletes the characters " ' / . ! , ? and
        splits on whitespace.
        ---
        Inputs
            txt: text to clean (str)
        Outputs
            words: the cleaned words in order (list)
        """
        return txt.lower().translate(self._STRIP_TABLE).split()

    def get_next_word_recs(self, current_text):
        """
        Recommends up to three words to follow `current_text`.

        The last ngram cleaned words are looked up in the model and their
        followers are ranked by count. If fewer than three are known, the
        list is padded with the most common words of the training text that
        are not already recommended. Unknown keys fall back to the most
        common words alone.
        ---
        Inputs
            current_text: text typed so far (str)
        Outputs
            rec_words: recommended next words, best first (list)
        Raises
            AssertionError: if current_text has fewer than ngram words
        """
        # Raised explicitly (rather than with `assert`) so the check still
        # runs under `python -O`; the exception type is kept for callers.
        if len(current_text.split()) < self.ngram:
            raise AssertionError("Not enough words in input text!")

        key_search = tuple(self.clean(current_text)[-self.ngram:])
        followers = self.keys.get(key_search)
        if not followers:
            # Return a copy so callers cannot mutate the stored fallback.
            return list(self.common_words)

        ranked = sorted(followers.items(), key=lambda x: x[1], reverse=True)
        rec_words = [word for word, _ in ranked[:self._NUM_RECS]]
        for word in self.common_words:
            if len(rec_words) >= self._NUM_RECS:
                break
            # Skip fallback words that are already recommended so the same
            # suggestion is never shown twice.
            if word not in rec_words:
                rec_words.append(word)
        return rec_words

    def print_key_value_pairs(self, num_keys=20):
        """
        Iterates through the probability object, printing key-value
        pairs, each followed by a blank line.
        ---
        Input
        num_keys: how many pairs to show; 0 shows nothing (int)
        """
        limit = int(num_keys)
        for shown, (key, value) in enumerate(self.keys.items()):
            if shown >= limit:
                break
            print(key, value)
            print()

## Read from the text messages in the dataset

In [45]:
# Load the tab-separated SMS file (no header row); column 1 is used as the
# message text.
# NOTE(review): read_csv's default quoting treats '"' as a quote character,
# so messages containing unbalanced double quotes may get merged together.
# Consider passing quoting=csv.QUOTE_NONE - confirm against the expected
# number of rows.
df = pd.read_csv('./smsspamcollection/SMSSpamCollection', sep='\t', header=None)

# Join all messages into one space-separated string in a single pass
# (repeated `+=` in a loop re-copies the growing string each time).
text = " ".join(df[1])

nwp = next_word_predictor(text, from_file=False, ngram=2)

In [26]:
# Print up to 100 entries of the model (word-pair key -> follower counts)
# as a quick sanity check of what was learned.
nwp.print_key_value_pairs(num_keys=100)

('go', 'until') {'jurong': 1}

('until', 'jurong') {'point,': 1}

('jurong', 'point,') {'crazy..': 1}

('point,', 'crazy..') {'available': 1}

('crazy..', 'available') {'only': 1}

('available', 'only') {'in': 1}

('only', 'in') {'bugis': 1, 'their': 1}

('in', 'bugis') {'n': 1}

('bugis', 'n') {'great': 1}

('n', 'great') {'world': 1}

('great', 'world') {'la': 1}

('world', 'la') {'e': 1}

('la', 'e') {'buffet...': 1}

('e', 'buffet...') {'cine': 1}

('buffet...', 'cine') {'there': 1}

('cine', 'there') {'got': 1}

('there', 'got') {'amore': 1}

('got', 'amore') {'wat...': 1}

('amore', 'wat...') {'ok': 1}

('wat...', 'ok') {'lar...': 1}

('ok', 'lar...') {'joking': 1}

('lar...', 'joking') {'wif': 1}

('joking', 'wif') {'u': 1}

('wif', 'u') {'oni...': 1}

('u', 'oni...') {'free': 1}

('oni...', 'free') {'entry': 1}

('free', 'entry') {'in': 4, '2': 7, 'into': 3, 'to': 1}

('entry', 'in') {'2': 5}

('in', '2') {'a': 5, 'or': 1}

('2', 'a') {'wkly': 2, 'flyng': 1, 'weekly': 3, 'marri

In [62]:
# Ask the model which words it recommends to follow the phrase "now and".
nwp.get_next_word_recs("now and")

['tell', 'lets', 'no']