In [2]:
import pandas as pd
import yaml
from sqlalchemy import create_engine
import psycopg2
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from unidecode import unidecode

from lib.database_module import connect_to_postgres, create_or_update_category_in_database,\
        create_or_update_page_in_database, select_all_page_vectors,select_pages,\
        select_page_vectors, select_all_category_vectors, select_category_vectors,\
        select_statement, execute_sql_statement
from lib.wiki_module import param_query, cat_param_id, cat_param_pages,\
        page_params, text_cleaner, get_summary, query_category, query_page,\
        wikipedia_get_id
from lib.encoding_module import build_vectorizer, get_page_vector,\
        get_searchterm_vector, get_cat_vector

## First Pipeline

In [3]:
## TODO: Create bash execcutive file and write a function to receive command line passed args

In [4]:
## TODO: Query categories from Wikipedia API
category = 'Automotive technologies'
que_categ = query_category(category)
que_categ

{'categoryid': '859604',
 'pages': [{'pageid': 47155585, 'title': '6x4 (drivetrain)'},
  {'pageid': 7446779, 'title': 'AC Propulsion eBox'},
  {'pageid': 282973, 'title': 'AC Propulsion tzero'},
  {'pageid': 8036463, 'title': 'Airless tire'},
  {'pageid': 6153488, 'title': 'Alcantara (material)'},
  {'pageid': 46172623, 'title': 'All-wheel drive'},
  {'pageid': 38464210, 'title': 'Appleton spotlight'},
  {'pageid': 4723131, 'title': 'Articulated vehicle'},
  {'pageid': 905356, 'title': 'Automobile accessory power'},
  {'pageid': 33916504, 'title': 'Automobile air conditioning'},
  {'pageid': 851704, 'title': 'Car platform'},
  {'pageid': 2378039, 'title': 'Automotive Electronics Council'},
  {'pageid': 44739087, 'title': 'Automotive head-up display'},
  {'pageid': 11814346, 'title': 'Automotive paint'},
  {'pageid': 32551802, 'title': 'Automotive Technician Accreditation'},
  {'pageid': 245926, 'title': 'Autonomous car'},
  {'pageid': 52835852, 'title': 'Autotech'},
  {'pageid': 252243

In [7]:
## TODO: Fromat retrievd data and prepare them to be written to database and extract page_ids

In [8]:
categid = que_categ['categoryid']

In [9]:
connect_to_postgres (location = 'remote')

Connected to server joshuacook.me.


(<connection object at 0x000000000C2C7300; dsn: 'user=ali password=xxxxx obtain half camp' host=joshuacook.me port=5432 dbname=project_6', closed: 0>,
 <cursor object at 0x000000000BE49558; closed: 0>)

In [7]:
create_or_update_category_in_database(categid, category)

Connected to server joshuacook.me.


'OK'

In [10]:
## TODO: Write data to the database category table
## TODO: Use the page_ids to get page data from Wikipedia API
## TODO: Clean and format the data
## TODO: Write data to database page table and ecncoded data to page_vec table
pages = []

for line in que_categ['pages']:
    pid = str(line['pageid'])
    ptitle = line['title']
    ptext = query_page(pid)['text']
    row = [pid, ptitle, ptext]
    pages.append(row)
    create_or_update_page_in_database(pid, categid, ptitle, ptext, location = 'remote')

In [13]:
pages_df = pd.DataFrame(pages, columns=['pid','ptitle','ptext'])
pages_df.head()

Unnamed: 0,pid,ptitle,ptext
0,47155585,6x4 (drivetrain),A 6×4 drivetrain (six-by-four) is a vehicle wi...
1,7446779,AC Propulsion eBox,The eBox is a conversion of a Scion xB hatchba...
2,282973,AC Propulsion tzero,The tzero is a handmade electric sports car de...
3,8036463,Airless tire,"Non-pneumatic tires (NPT), or Airless tires, a..."
4,6153488,Alcantara (material),Alcantara is a covering material manufactured ...


In [15]:
## TODO: Encode page text
vectorizer = build_vectorizer(pages_df['ptext'])
vectorizer

Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...TruncatedSVD(algorithm='randomized', n_components=500, n_iter=10,
       random_state=42, tol=0.0))])

In [21]:
encoded_pages = get_page_vector(vectorizer,pages_df['ptext'],pages_df['pid'])

## Second Pipeline

In [22]:
from sklearn.externals import joblib
transformer = joblib.load('vectorizer.pkl')

In [23]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [24]:
## TODO: Create bash execcutive file and write a function to receive command line passed args
search_arg = 'A hydrogen vehicle is a vehicle that uses hydrogen as\
            its onboard fuel for motive power. Hydrogen vehicles include hydrogen fueled space rockets,\
            as well as automobiles and other transportation vehicles.\
            The power plants of such vehicles convert the chemical energy of \
            hydrogen to mechanical energy either by burning hydrogen in an internal combustion engine,\
            or by reacting hydrogen with oxygen in a fuel cell to run electric motors. Widespread use\
            of hydrogen for fueling transportation is a key element of a proposed hydrogen economy.'

In [25]:
## TODO: Encode passed args
search_arg_encode = get_searchterm_vector(transformer, search_arg)
search_arg_encode

{'A hydrogen vehicle is a vehicle that uses hydrogen as            its onboard fuel for motive power. Hydrogen vehicles include hydrogen fueled space rockets,            as well as automobiles and other transportation vehicles.            The power plants of such vehicles convert the chemical energy of             hydrogen to mechanical energy either by burning hydrogen in an internal combustion engine,            or by reacting hydrogen with oxygen in a fuel cell to run electric motors. Widespread use            of hydrogen for fueling transportation is a key element of a proposed hydrogen economy.': array([[  7.71210052e-02,   2.33732730e-03,  -3.38348852e-02,
           1.60860418e-02,   9.80933282e-03,  -1.12210112e-02,
           5.05896957e-03,  -9.86284252e-04,   1.28230660e-02,
          -7.21929046e-03,   6.72179897e-02,  -7.43030473e-02,
           1.63723908e-01,   3.67728670e-02,  -8.23670136e-02,
           1.47453697e-02,  -1.17181486e-02,  -4.95129241e-03,
          -4.5

In [26]:
search_arg_vector = search_arg_encode.values()[0][0]
search_arg_vector[0]

0.077121005198877379

In [27]:
## TODO: Retrieve encoded page data for all the pages in the database
page_vectors = select_all_page_vectors()

Connected to server joshuacook.me.


In [28]:
len(page_vectors)

2270

In [29]:
page_vectors_array = []
for page in page_vectors:
    page_vectors_array.append(page[1])

In [33]:
cosine_similarity_list = []
for page in page_vectors:
    temp = []
    pageid = page[0]
    page_vector = page[1]
    cosine_ = cosine_similarity(search_arg_vector,page_vector)
    temp = [pageid, cosine_[0][0]]
    cosine_similarity_list.append(temp)
cosine_similarity_list[0:1]



[[4341789, 0.00013467130925290394]]

In [34]:
cosine_similarity_df = pd.DataFrame(cosine_similarity_list)
cosine_similarity_df.sort_values(1, axis=0,ascending=False).head()

Unnamed: 0,0,1
1772,188545,0.961456
1551,20242197,0.758816
1811,2237309,0.567178
2078,342520,0.470298
2099,23166019,0.450056


In [59]:
## TODO: Query the database for text of the 5 selected articles
Top_5_pages = cosine_similarity_df.sort_values(1, axis=0,ascending=False)[0].head()
Top_5_pages

1772      188545
1551    20242197
1811     2237309
2078      342520
2099    23166019
Name: 0, dtype: int64

In [60]:
Top_5_pages_text = select_pages(Top_5_pages,location = 'remote')
pd.DataFrame(Top_5_pages_text)

Connected to server joshuacook.me.


Unnamed: 0,0,1,2
0,188545,Hydrogen vehicle,A hydrogen vehicle is a vehicle that uses hydr...
1,342520,Fuel efficiency,Fuel efficiency is a form of thermal efficienc...
2,2237309,Cohesion (chemistry),"Cohesion (from Latin cohaerere ""stick or stay ..."
3,20242197,Mazda RX-8 Hydrogen RE,The Mazda RX-8 Hydrogen RE is a 2003 bi-fuel v...
4,23166019,Triple-hybrid,Triple-hybrid is a registered trademark (Germa...


## Third Pipeline

In [None]:
## TODO: Query the database for all page vectors

In [35]:
from lib.database_module import select_statement, execute_sql_statement

In [36]:
page_vect_categ = execute_sql_statement(sql_select="""select p.page_id, p.page_vec, c.category_name, c.category_id
                                                        from page_vec as p, page_cate as pc,
                                                        category as c where p.page_id = pc.page_id and 
                                                        pc.category_id = c.category_id""")

OK
Connected to server joshuacook.me.


In [42]:
page_vect_categ_df = pd.DataFrame(page_vect_categ, columns=["page id", "page vec", "category id", "category name"])

In [43]:
page_vect_categ_df.head()

Unnamed: 0,page id,page vec,category id,category name
0,19572217,"[0.118021158893, 0.161339466163, 0.00053315610...",influenza,2932559
1,82425,"[0.101142240117, 0.161844143872, 0.05078797791...",Sandwiches,757471
2,33686134,"[0.052467255444, 0.11182426655, 0.042842444269...",Sandwiches,757471
3,49033306,"[0.0201404575061, 0.0363450596613, 0.011798183...",Sandwiches,757471
4,2546911,"[0.0580031020169, 0.0635550969954, 0.020313225...",Sandwiches,757471


In [44]:
len(page_vect_categ_df)

2386

In [45]:
page_vect_categ_df.drop_duplicates("page id", inplace=True)
len(page_vect_categ_df)

2270

In [46]:
## TODO: Query the database for all category vectors
categ_vectors = select_all_category_vectors()

Connected to server joshuacook.me.


In [47]:
import numpy as np

In [48]:
np.unique(pd.factorize(page_vect_categ_df["category id"])[0])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17])

In [49]:
## TODO: Build and train a model using page vectors as X and category vectors as Y
page_vect_categ_df["factorized categid"] = pd.factorize(page_vect_categ_df["category id"])[0]

In [50]:
page_vect_categ_df.head()

Unnamed: 0,page id,page vec,category id,category name,factorized categid
0,19572217,"[0.118021158893, 0.161339466163, 0.00053315610...",influenza,2932559,0
1,82425,"[0.101142240117, 0.161844143872, 0.05078797791...",Sandwiches,757471,1
2,33686134,"[0.052467255444, 0.11182426655, 0.042842444269...",Sandwiches,757471,1
3,49033306,"[0.0201404575061, 0.0363450596613, 0.011798183...",Sandwiches,757471,1
4,2546911,"[0.0580031020169, 0.0635550969954, 0.020313225...",Sandwiches,757471,1


In [51]:
X = list(page_vect_categ_df['page vec'])
y = list(page_vect_categ_df['factorized categid'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [52]:
classifier.score(X_test, y_test)

0.85739436619718312

In [53]:
## TODO: Pickle the built model
categ_ID = page_vect_categ_df[['category name', 'factorized categid']].drop_duplicates()

In [54]:
joblib.dump((classifier, categ_ID), 'data/model.pkl')

['data/model.pkl']

## Forth Pipeline

In [None]:
## TODO: Create bash execcutive file and write a function to receive command line passed args

In [9]:
## TODO: Query the passed article from Wikipedia API
article = 'Apple sauce'
article_id = wikipedia_get_id (article)
article_id = article_id[0]
article_id

u'46484'

In [10]:
article_dict = query_page(article_id)
article_dict

OrderedDict([('pageid', 46484),
             ('summary',
              u'Apple sauce or applesauce is a sauce made of apples. It can be made with peeled or unpeeled apples and a variety of spices (commonly cinnamon and allspice)'),
             ('text',
              u'Apple sauce or applesauce is a sauce made of apples. It can be made with peeled or unpeeled apples and a variety of spices (commonly cinnamon and allspice). Flavorings or sweeteners such as sugar or honey are also commonly added. Apple sauce is inexpensive and is widely used in the United Kingdom, North America and some European countries. It can be substituted for fat (e.g. butter oil) in baking. Commercial versions of apple sauce are readily available in supermarkets. It may be packaged in several ways, including: glass jars, tins, or plastic tubs. It is also sold in serving-size small plastic cups. Preparation Apple sauce is made by cooking down apples with water or apple cider (fresh apple juice) to the required leve

In [15]:
article_text = article_dict['text']

In [21]:
## TODO: Encode the page data
vectorizer = joblib.load('./vectorizer.pkl')

article_vect = vectorizer.transform([unidecode(article_text)])[0]

In [24]:
## TODO: Load the pickled model and use it to predict the category using current page vector as X

transformer, categ_ID = joblib.load('data/model.pkl')

In [25]:
prediction = transformer.predict(article_vect.reshape(1,-1))[0]
prediction

14

In [55]:
categ_ID

Unnamed: 0,category name,factorized categid
0,2932559,0
1,757471,1
160,24580905,2
162,696445,3
167,691185,4
181,27573876,5
232,1056532,6
250,710421,7
251,716903,8
252,695196,9
