In [199]:
import pandas as pd
import requests
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from IPython.core.display import display, HTML
import os
import numpy as np
import itertools
from dotenv import load_dotenv, find_dotenv

### Open the locally stored api key

In [2]:
_ = load_dotenv(find_dotenv())

In [3]:
key = os.environ.get("API_KEY")
base_url = os.environ.get("API_URL")

### Load in the users and post history tables

In [51]:
base_url = 'http://bitnami-dreamfactory-b106.cloudapp.net/api/v2/venndor_test/_table/'

users = base_url + 'test_users' + '?api_key=%s' % key
matches = base_url + 'test_matches' + '?api_key=%s' % key
items = base_url + 'test_items' + '?api_key=%s' % key

### Do the url request, converting to JSON

In [52]:
# Generating users JSON
users_response = requests.get(users)
users_json = users_response.json()

# Generating matches JSON
matches_response = requests.get(matches)
matches_json = matches_response.json()

# Generating items JSOn
items_response = requests.get(items)
items_json = items_response.json()

### Creating the users DF and displaying top 5 results

In [42]:
df_users = pd.DataFrame(
            users_json[list(users_json.keys())[0]]
                        )

display(HTML(df_users.head(1).to_html()))

Unnamed: 0,_id,ageRange,blockedBy,blockedUsers,bookmarks,email,facebookID,first_name,gender,hasSeenWalkthrough,joinedDate,last_name,matches,phoneNumber,postedItems,profilePictureURL,pushID
0,587fd7290a832f0b2c46a3c1,Optional(21)-nil,[],[],[],jonpeplonsko@gmail.com,104369026725093,Jon,male,True,1484773000.0,Peplonsko,[],,"{'587ffead0a832f0b2c46a3c3': 1484783278.1936, ...",https://scontent.xx.fbcdn.net/v/t1.0-1/s200x20...,f91334db-d7a4-4c8e-8d7d-1942d1faeeb0


In [43]:
df_users = df_users.loc[df_users['matches'].apply(lambda x: len(x)) > 0]

In [78]:
id_vect = []
item_vect = []
match_vect = []

for user_id, item in zip(df_users['_id'], df_users['matches'].values):
    for match_id, item_id in item.items():
        id_vect.append(user_id)
        item_vect.append(item_id)
        match_vect.append(match_id)
        
df_user_matched_items = pd.DataFrame({'id': id_vect,
                                      'item': item_vect,
                                      'match': match_vect})

In [79]:
df_matches = pd.DataFrame(
            matches_json[list(matches_json.keys())[0]]
                        )

df_items = pd.DataFrame(
            items_json[list(items_json.keys())[0]]
                        )

In [91]:
df_user_zero = df_user_matched_items.set_index([[0]*len(df_user_matched_items)])
df_items_zero = df_items.set_index([[0]*len(df_items)])

combined_items = df_user_zero.join(df_items_zero['_id'], how='outer')
combined_items = combined_items.reset_index(drop=True)
combined_items = combined_items.drop('item', axis=1)

In [93]:
combined_items.columns = ['id', 'match', 'item']

In [100]:
combined_items['match'] = np.nan

In [104]:
combined_items = combined_items.loc[~combined_items['item'].isin(df_user_matched_items['item'])].reset_index(drop=True)

In [105]:
df_user_matched_items = df_user_matched_items.append(combined_items)

In [111]:
df_user_matched_items = df_user_matched_items.sort_values(by=['id', 'match'])

In [213]:
df_user_items = df_user_matched_items.merge(df_items,
                                            how='outer',
                                            left_on='item',
                                            right_on='_id')

In [214]:
df_user_items = df_user_items.drop(['_id',
                                    'avgOffer',
                                    'bookmarkees',
                                    'bought',
                                    'boughtMatchId',
                                    'buyerID',
                                    'locationDescription',
                                    'locationName',
                                    'matches',
                                    'ownerID',
                                    'searchName',
                                    'thumbnailDataStrings',
                                    'xCoordinate',
                                    'yCoordinate'], axis=1)

In [215]:
df_user_items['offersMade'] = df_user_items['offersMade'].apply(lambda x: np.shape(x))

In [216]:
offer_made_vect = []

for val in df_user_items['offersMade']:
    try:
        offer_made_vect.append(val[0])
    except:
        offer_made_vect.append(np.nan)
df_user_items['offersMade'] = offer_made_vect

In [217]:
df_user_items = df_user_items.set_index('id')

In [218]:
df_user_items.loc[df_user_items['match'].isnull(), 'match_bool'] = -1
df_user_items.loc[~df_user_items['match'].isnull(), 'match_bool'] = 1

In [219]:
df_user_items['product_descript'] = df_user_items['name'] + ' ' + df_user_items['details']

In [220]:
df_user_items['product_descript'] = df_user_items['product_descript'].str.split()
df_user_items['product_descript'] = df_user_items['product_descript'].str.join(' ')

In [223]:
df_user_items

Unnamed: 0_level_0,item,match,boughtPrice,category,details,minPrice,name,nuMatches,nuSwipesLeft,nuSwipesRight,offersMade,photoCount,postedDate,match_bool,product_descript
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
587fd8710a832f0b2e4c5141,58815c690a832f0b2b3cf8a8,588237e90a832f3b361910c2,,Clothing,•\tBrand New in Original Packaging\n\n•\tVario...,8.0,Wool Winter Gloves,2.0,100.0,1.0,3.0,2.0,1.484873e+09,1.0,Wool Winter Gloves • Brand New in Original Pac...
587fee820a832f0b2e4c5143,58815c690a832f0b2b3cf8a8,58816ecf0a832f27b35d94d7,,Clothing,•\tBrand New in Original Packaging\n\n•\tVario...,8.0,Wool Winter Gloves,2.0,100.0,1.0,3.0,2.0,1.484873e+09,1.0,Wool Winter Gloves • Brand New in Original Pac...
587fd8710a832f0b2e4c5141,588eb2960a832f033e48a4f3,5890a7b80a832f6e3621e475,,Household,Ex library copy,6.0,Leah Remini- Troublemaker,1.0,99.0,2.0,2.0,1.0,1.485747e+09,1.0,Leah Remini- Troublemaker Ex library copy
587fd8710a832f0b2e4c5141,588b7bc40a832fea5a5037b1,5890ac9d0a832f072b3f7475,,Kitchen,Brand spanking new!\n5-cup capacity\nUnused fi...,10.0,Brits Filter,2.0,99.0,4.0,4.0,1.0,1.485536e+09,1.0,Brits Filter Brand spanking new! 5-cup capacit...
5889305d0a832f53fd2649e9,588b7bc40a832fea5a5037b1,588e0cae0a832f09f8385681,,Kitchen,Brand spanking new!\n5-cup capacity\nUnused fi...,10.0,Brits Filter,2.0,99.0,4.0,4.0,1.0,1.485536e+09,1.0,Brits Filter Brand spanking new! 5-cup capacit...
587fd8710a832f0b2e4c5141,58876d7d0a832f1aed0cd3d1,5890acb10a832f072b3f7476,,Textbooks,Good read \nAuthor: Jeff Jarvis\nBook in excel...,5.0,What would Google do,4.0,99.0,7.0,6.0,2.0,1.485270e+09,1.0,What would Google do Good read Author: Jeff Ja...
587fe4c60a832f0b2e4c5142,58876d7d0a832f1aed0cd3d1,5887e5bc0a832f520a6ad603,,Textbooks,Good read \nAuthor: Jeff Jarvis\nBook in excel...,5.0,What would Google do,4.0,99.0,7.0,6.0,2.0,1.485270e+09,1.0,What would Google do Good read Author: Jeff Ja...
588392320a832f726a294342,58876d7d0a832f1aed0cd3d1,5888dfc90a832f2415731f76,,Textbooks,Good read \nAuthor: Jeff Jarvis\nBook in excel...,5.0,What would Google do,4.0,99.0,7.0,6.0,2.0,1.485270e+09,1.0,What would Google do Good read Author: Jeff Ja...
5890015b0a832fdfdd616021,58876d7d0a832f1aed0cd3d1,589001c00a832f09b21b68b3,,Textbooks,Good read \nAuthor: Jeff Jarvis\nBook in excel...,5.0,What would Google do,4.0,99.0,7.0,6.0,2.0,1.485270e+09,1.0,What would Google do Good read Author: Jeff Ja...
587fd8710a832f0b2e4c5141,5880c8990a832f0b2b3cf8a2,,45.0,Furniture,Excellent condition\nAbout 3.5'x2',40.0,Glass Coffee Table,1.0,7.0,3.0,2.0,3.0,1.484835e+09,-1.0,Glass Coffee Table Excellent condition About 3...


In [234]:
count_vect = CountVectorizer(stop_words='english',
                             )
text_matrix = count_vect.fit_transform(df_user_items.loc[~df_user_items['product_descript'].isnull(), 'product_descript'])

In [235]:
df_text_matrix = pd.DataFrame(text_matrix.toarray(),
                              index=df_user_items.index[~df_user_items['product_descript'].isnull()],
                              columns=count_vect.get_feature_names())

In [236]:
df_text_matrix

Unnamed: 0_level_0,10,10th,150ml,16oz,18,182cm,20,26w,28,293,...,watch,wet,white,windproof,winter,woo,worth,wrist,x12,x2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
587fd8710a832f0b2e4c5141,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
587fee820a832f0b2e4c5143,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
587fd8710a832f0b2e4c5141,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587fd8710a832f0b2e4c5141,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5889305d0a832f53fd2649e9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587fd8710a832f0b2e4c5141,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587fe4c60a832f0b2e4c5142,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
588392320a832f726a294342,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5890015b0a832fdfdd616021,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587fd8710a832f0b2e4c5141,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
