### Keyword Maker
The goal of this project is to expand upon the product information supplied in regular invoice/PO data in order to understand what the product actually is.  This is done by mimicking the human process of googling the description and then scraping all of the results.  

In [None]:
import mechanize
import pandas as pd
from bs4 import BeautifulSoup
import collections
import re
from nltk.stem import WordNetLemmatizer
import numpy as np
from bs4.element import Comment


### Create custom functions to help streamline the process

In [None]:
def GetWebResults(SearchString):
    """Run SearchString through the Google search form and return result snippets.

    Opens http://www.google.com/ with a mechanize browser, submits the query
    through the form named 'f', and returns a list holding the text of every
    <span class="st"> element found in the response.
    """
    #Set up a browser that skips robots/http-equiv handling and sends a custom User-agent
    browser = mechanize.Browser()
    browser.set_handle_robots(False)
    browser.set_handle_equiv(False)
    browser.addheaders = [('User-agent', 'Mozilla/5.0')]
    browser.open('http://www.google.com/')

    #Fill in the search box and submit the form
    browser.select_form(name='f')
    browser.form['q'] = SearchString
    response = browser.submit()
    results_page = BeautifulSoup(response.read(), "html5lib")

    #Each "st" span carries one description; keep only its text
    return [snippet.text for snippet in results_page.findAll("span", {"class": "st"})]

def tag_visible(element):
    """Return True when a parsed text node should count as visible page text.

    A node is rejected when its parent's name is one of the non-rendered
    containers listed below, or when the node itself is a Comment.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    #Parent check runs first; the Comment check is only reached when it passes
    is_hidden = element.parent.name in hidden_parents or isinstance(element, Comment)
    return not is_hidden

def text_from_html(body):
    """Return the visible text of an HTML document as one space-joined string.

    body: either an already-parsed BeautifulSoup object (anything exposing
        ``findAll``) or raw HTML markup, which is parsed with 'html.parser'.

    Text nodes are filtered through tag_visible, stripped, and joined with
    single spaces.
    """
    #Use the argument that was passed in; the earlier version ignored it and
    #silently read whatever global ``soup`` happened to exist.
    if hasattr(body, 'findAll'):
        parsed = body
    else:
        parsed = BeautifulSoup(body, 'html.parser')
    texts = parsed.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)

def CleanWords(listofWords):
    """Turn a list of scraped text snippets into a list of cleaned keywords.

    listofWords: iterable of strings (e.g. search-result descriptions or page text).

    Returns a list of lower-cased, alphanumeric-only words with the stopwords
    below removed and each remaining word passed through WordNetLemmatizer.
    Also prints how many words were kept.
    """
    #Join with a space so the last word of one snippet is not glued onto the
    #first word of the next one
    text = ' '.join(listofWords).lower()

    #Replace every run of non-alphanumeric characters (commas, dashes, dots, tabs, ...)
    #with a single space, then split, so "foo-bar" yields "foo" and "bar"
    #instead of the single token "foo bar"
    tokens = re.sub('[^a-zA-Z0-9]+', ' ', text).split()

    #remove specific stopwords from the list (punctuation-only entries are no
    #longer needed because tokens are purely alphanumeric at this point)
    stopwords = {'the', 'in', 'to', 'of', 'and', 'at', 'for', 'you',
                 'with', 'is', 'on', 'case', 'a', 'website', 'registered', 'com',
                 'back', 'all', 'product', 'available', 'no', 'price'}
    wordlist = [i for i in tokens if i not in stopwords]

    #Lemmatize the words to remove pluralization
    wordnet_lemmatizer = WordNetLemmatizer()
    wordlist = [wordnet_lemmatizer.lemmatize(i) for i in wordlist]

    print('\nWordlist is cleaned and contains %i records\n' %len(wordlist))

    return wordlist

### Execute the code to search for keywords

In [None]:
#Placeholder list; nothing in this cell appends to it
keyWords = []

#Read in spreadsheet of part information
itemList = pd.read_excel('CapitalEquipmentExamples.xlsx')
#NOTE(review): this preview is not the last expression of the cell, so its result is presumably never displayed
itemList.head(2)

#Format the spreadsheet for ease of use
#(rows and columns are swapped, so the first row after transposing holds the field names)
itemList = itemList.transpose()
#Rename the columns
itemList.columns = itemList.iloc[0]
#Drop the old header in favor of the indexed header
itemList = itemList.drop(itemList.index[0])

#Create the search term column: "<part number> <manufacturer name>"
itemList['SearchTerm'] = itemList['Manufacturer Part Number'].astype(str) + ' ' + itemList['Manufacturer Name']

#Restrict the run to the single row labelled 'B001170213'
itemList = itemList.loc[['B001170213'], :]

for i in itemList['SearchTerm']:
    #Run the code to get webresults
    WebResults = GetWebResults(i)
    wordlist = CleanWords(WebResults)
    
    #Count frequency on the wordlist
    counts = collections.Counter(wordlist)
    new_list = pd.DataFrame(counts.most_common(), columns=['Word', 'Frequency'])
    
    #Append to the new_list the information from the source data
    #TODO: not implemented yet - nothing is appended here
    
    
    #Show the search term followed by its word-frequency table
    print(i)
    print(new_list)

### Conduct frequency counts on words
Count each and every word and then sort by word frequency

In [None]:
#Calculate the counts of keywords in the descriptions
#(uses the global wordlist left behind by the last iteration of the search loop)
counts = collections.Counter(wordlist)
#most_common() orders the (word, count) pairs from most to least frequent
new_list = pd.DataFrame(counts.most_common(), columns=['Word', 'Frequency'])

#Display only the words that occur at least 5 times
new_list[new_list['Frequency'] >= 5]

### Format the data for AI
Create a dataframe that contains the string of words and the SKU/Manufacturer.

In [None]:
#Spoof the Browser
#(robots and http-equiv handling are switched off and a Firefox User-agent string is sent)
br = mechanize.Browser()
br.set_handle_robots(False)
br.set_handle_equiv(False)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
br.open('http://www.google.com/')   

# do the query and return the text
# (the form named 'f' is selected and its 'q' field holds the query string)
br.select_form(name='f')   
br.form['q'] = 'Leica Microsystems 10450294 Leica Microsystems 10450294'
data = br.submit()
# Parse the results page; later cells read this global `soup` and the browser `br`
soup = BeautifulSoup(data.read(), "html5lib")

In [None]:
def tag_visible(element):
    """Return True when a parsed text node should count as visible page text.

    A node is rejected when its parent's name is one of the non-rendered
    containers listed below, or when the node itself is a Comment.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    #Parent check runs first; the Comment check is only reached when it passes
    is_hidden = element.parent.name in hidden_parents or isinstance(element, Comment)
    return not is_hidden

def text_from_html(body):
    """Return the visible text of an HTML document as one space-joined string.

    body: either an already-parsed BeautifulSoup object (anything exposing
        ``findAll``) or raw HTML markup, which is parsed with 'html.parser'.

    Text nodes are filtered through tag_visible, stripped, and joined with
    single spaces.
    """
    #Use the argument that was passed in; the earlier version ignored it and
    #silently read whatever global ``soup`` happened to exist.
    if hasattr(body, 'findAll'):
        parsed = body
    else:
        parsed = BeautifulSoup(body, 'html.parser')
    texts = parsed.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)

def CleanWords(listofWords):
    """Turn a list of scraped text snippets into a list of cleaned keywords.

    listofWords: iterable of strings (e.g. search-result descriptions or page text).

    Returns a list of lower-cased, alphanumeric-only words with the stopwords
    below removed and each remaining word passed through WordNetLemmatizer.
    Also prints how many words were kept.
    """
    #Join with a space so the last word of one snippet is not glued onto the
    #first word of the next one
    text = ' '.join(listofWords).lower()

    #Replace every run of non-alphanumeric characters (commas, dashes, dots, tabs, ...)
    #with a single space, then split, so "foo-bar" yields "foo" and "bar"
    #instead of the single token "foo bar"
    tokens = re.sub('[^a-zA-Z0-9]+', ' ', text).split()

    #remove specific stopwords from the list (punctuation-only entries are no
    #longer needed because tokens are purely alphanumeric at this point)
    stopwords = {'the', 'in', 'to', 'of', 'and', 'at', 'for', 'you',
                 'with', 'is', 'on', 'case', 'a', 'website', 'registered', 'com',
                 'back', 'all', 'product', 'available', 'no', 'price'}
    wordlist = [i for i in tokens if i not in stopwords]

    #Lemmatize the words to remove pluralization
    wordnet_lemmatizer = WordNetLemmatizer()
    wordlist = [wordnet_lemmatizer.lemmatize(i) for i in wordlist]

    print('\nWordlist is cleaned and contains %i records\n' %len(wordlist))

    return wordlist

myWords = []
myLink = []
keywords = []
#Collect the heading text of every search result (<h3 class="r">) on the results page
for i in soup.find_all("h3", {"class" : "r"}):
    myLink.append(i.get_text())

#Follow the first five results and harvest their meta keywords and visible text
for i in myLink[0:5]:
    #print() with parentheses matches the rest of the file and also runs on Python 3
    print(i)
    #Skip links whose string form mentions '.pdf'
    if '.pdf' not in str(br.find_link(text=i)):
        req = br.click_link(text=i)
        #NOTE: this rebinds the global soup to the page that was just opened
        soup = BeautifulSoup(br.open(req), "html5lib")
        try:
            #A missing <meta name="keywords"> makes find() return None, so the
            #subscript raises and the page text is skipped as well
            keywords.append(soup.find("meta", {"name" : "keywords"})['content'])
            myWords.append(text_from_html(soup))
            print('%s found keywords' % br.geturl())
        except Exception:
            #Still best-effort per page, but no longer swallows
            #KeyboardInterrupt/SystemExit the way a bare except did
            print('%s could not find keywords' % br.geturl())

        #Return to the search results page before following the next link
        br.back()

#for div in soup.findAll("h3", {"class" : "r"}):
#    for a in div.findAll('a'):
#        print a.get('href')[7:]

In [None]:
#Clean the page text gathered in myWords and tabulate word frequencies
A = CleanWords(myWords)
counts = collections.Counter(A)
#most_common() lists (word, count) pairs from most to least frequent; the frame is the cell's displayed value
pd.DataFrame(counts.most_common(), columns=['Word', 'Frequency'])

In [None]:
#Find <meta> tags whose name attribute starts with "keywords".
#A plain string filter is compared literally, so "^keywords" only works as a
#pattern when it is passed as a compiled regular expression.
soup.find_all("meta", {"name" : re.compile("^keywords")})

In [None]:
#Scratch cell: iterating over a str yields single characters, so this builds a list of lower-cased characters
#NOTE(review): ''.join(soup) iterates over soup itself - presumably this fails unless every item is a string; confirm the intent (possibly str(soup).lower())
[i.lower() for i in str(''.join(soup))]