In [None]:
# Notebook display setup.
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Ask the inline backend for high-DPI ("retina") figures.
%config InlineBackend.figure_format = 'retina'

In [None]:
# Standard library
import os
import time

# Plotting (figures are rendered inline in the notebook)
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pylab as pl

# NumPy: print arrays with 2 decimals, 120-character lines and fixed-point notation
import numpy as np
np.set_printoptions(precision=2, linewidth=120, suppress=True, edgeitems=4)

# pandas: show at most 150 columns and 50 rows when a frame is displayed
import pandas as pd
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 50)
#pd.set_option('precision', 5)

# Date handling
from datetime import datetime
from dateutil.parser import parse

In [None]:
# Text processing (NLTK) and topic / embedding models (gensim).
import nltk
from nltk.util import ngrams
from collections import Counter
from nltk.corpus import stopwords
import string
# Tokens to filter out: NLTK's English stop words plus every ASCII punctuation character.
# NOTE(review): needs the NLTK 'stopwords' corpus to be available locally — confirm it has been downloaded.
stopWords = stopwords.words('english') + list(string.punctuation)

from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
from gensim.models import word2vec

In [None]:
# Make the project folder the working directory so the relative file names used below resolve.
# NOTE(review): absolute, user-specific path — this cell only works where that folder exists.
os.chdir('/Users/zacklarsen/Dropbox/Inference Analytics Team Folder/Zack Work')
# List the CSV files in that folder.
!ls *csv

In [None]:
## Load in dataset and clean up

# CustomerID is read as text instead of letting pandas infer a numeric dtype.
OR = (
    pd.read_csv('OR.csv', dtype={'CustomerID': str})
    .drop(columns='Unnamed: 0')                          # Remove the leftover index column from the export
    .assign(Date=lambda df: pd.to_datetime(df['Date']))  # Convert date to datetime
)

# Obtain week number (ISO week of the year).
# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0;
# `dt.isocalendar().week` is the documented replacement and yields the same week numbers.
iso_week = OR['Date'].dt.isocalendar().week
# isocalendar() returns a nullable UInt32 column. Cast back to what `dt.week` produced:
# plain int64, or float64 (NaN) when some dates are missing.
OR['Week'] = iso_week.astype('int64' if iso_week.notna().all() else 'float64')

# Keep only the rows that have a CustomerID.
OR = OR.dropna(subset=['CustomerID'])

## Exploratory data analysis on OR

In [None]:
## Week with highest number of StockCodes

# Count the non-null StockCode rows of each week and report the week with the largest count.
OR.groupby('Week')['StockCode'].count().idxmax()

In [None]:
## Most popular item per week

# Row count for every (Week, StockCode) pair, kept as a one-column frame named 'StockCode'.
SC = OR.groupby(['Week', 'StockCode'])['StockCode'].count().to_frame()

In [None]:
# Pivot the first index level of SC (Week) into columns: one row per StockCode,
# one column per week, with 0 where a stock code has no rows in a week.
SCDF = SC.unstack(level=0,fill_value=0)

In [None]:
# Display the pivoted StockCode-by-week count table.
SCDF

In [None]:
# Sum down the rows: total row count for each week column.
SCDF.sum(axis=0)

In [None]:
# Largest count observed for any single (Week, StockCode) pair.
SC.max()

In [None]:
# Show every row of OR whose StockCode equals '22086'.
# NOTE(review): the comparison is against a string, so it only matches if StockCode
# was parsed as text when the CSV was read — confirm the column dtype.
OR[OR['StockCode'] == '22086']

In [None]:
## 1 Number of items per order
# Total quantity on each (customer, invoice) pair.
IPO = OR.groupby(['CustomerID', 'InvoiceNo'])['Quantity'].sum().reset_index()

## 2 Number of DISTINCT items per order
# Count unique stock codes per invoice. A plain row count would count a product more than
# once when it appears on several lines of the same invoice. The result column keeps the
# name 'Quantity' because a later cell plots `DIPC.Quantity`.
DIPC = (
    OR.groupby(['CustomerID', 'InvoiceNo'])['StockCode']
    .nunique()
    .rename('Quantity')
    .reset_index()
)

## 3 Number of orders per customer
# NOTE(review): this counts invoice *rows* (one per line item), not unique invoices, so it
# overstates the number of orders. It is left unchanged because the outlier threshold of
# 1000 used further down was chosen from this row-based count; switching to .nunique()
# requires re-tuning that threshold.
OPC = OR.groupby(['CustomerID'])['InvoiceNo'].count().reset_index()

## 4 Total amount per invoice
TOT = (
    OR.groupby(['InvoiceNo'])['Total Amount']
    .sum()
    .to_frame()
    .sort_values('Total Amount', ascending=False)
)
## After inspection, it seems like any orders over $1,500 are outliers, so let's remove them
# `<= 1500` rather than `< 1501`, so totals such as 1500.50 are dropped as well.
TOT = TOT[TOT['Total Amount'] <= 1500]

In [None]:
# Box plot of the per-invoice totals held in TOT (drawn along the x-axis).
# `ax` keeps the returned object; it is not used again in the cells shown here.
ax = sns.boxplot(x=TOT["Total Amount"])

In [None]:
# Distribution of the per-invoice totals held in TOT.
# NOTE(review): `distplot` has been deprecated since seaborn 0.11; consider `histplot`/`displot`.
sns.distplot(TOT["Total Amount"])

In [None]:
# Violin plot of the per-invoice totals held in TOT.
# The series is passed explicitly as `x=` (like the box plot cell): newer seaborn releases
# bind the first positional argument to `data`, not `x`, which changes how it is drawn.
sns.violinplot(x=TOT["Total Amount"]);

In [None]:
# Distribution of the summed Quantity per (customer, invoice) from IPO.
# NOTE(review): `distplot` has been deprecated since seaborn 0.11; consider `histplot`/`displot`.
sns.distplot(IPO.Quantity);

In [None]:
# One point per row of OPC: CustomerID on the x-axis, that customer's InvoiceNo count on the y-axis.
sns.stripplot(x="CustomerID", y="InvoiceNo", data=OPC);

## From the above plot, it looks like most customers have fewer than 1,000 invoice rows (`OPC` counts invoice line items, not distinct orders). Let's identify the outliers so they can be removed.

In [None]:
# Customer IDs whose InvoiceNo count in OPC is above 1000, as a NumPy array.
Outliers = OPC.loc[OPC['InvoiceNo'].gt(1000), 'CustomerID'].values

In [None]:
# Display the customer IDs collected as outliers.
Outliers

In [None]:
# Display the per-customer InvoiceNo counts.
OPC

In [None]:
# Display the per-(customer, invoice) quantity totals.
IPO

In [None]:
# Distribution of the per-(customer, invoice) `Quantity` column of DIPC.
# NOTE(review): `distplot` has been deprecated since seaborn 0.11; consider `histplot`/`displot`.
sns.distplot(DIPC.Quantity)

In [None]:
# Distribution of the per-customer InvoiceNo counts using 1000 bins.
# NOTE(review): `distplot` has been deprecated since seaborn 0.11; consider `histplot`/`displot`.
g = sns.distplot(OPC.InvoiceNo,bins=1000)
# Zoom the x-axis to counts between 0 and 300.
plt.xlim(0,300)

## Wordclouds

In [None]:
from wordcloud import WordCloud

In [None]:
## Plotting total amount per order per week

# Take the two columns of interest, order the rows by week, and index them by week
# so the week number becomes the x-axis of the line plot.
data = (
    OR[['Week', 'Total Amount']]
    .sort_values('Week')
    .set_index('Week')
)

data['Total Amount'].plot(figsize=(16, 12))

In [None]:
## Plotting number of orders per weekday

# Distinct invoice numbers for each D_O_W value, ordered and indexed by D_O_W for the line plot.
data = (
    OR.groupby(['D_O_W'])['InvoiceNo']
    .nunique()
    .reset_index()
    .sort_values('D_O_W')
    .set_index('D_O_W')
)

data['InvoiceNo'].plot(figsize=(16, 12))

In [None]:
## Plotting number of orders per week

# Distinct invoice numbers for each week, ordered and indexed by week for the line plot.
data = (
    OR.groupby(['Week'])['InvoiceNo']
    .nunique()
    .reset_index()
    .sort_values('Week')
    .set_index('Week')
)

data['InvoiceNo'].plot(figsize=(16, 12))

## Word2vec

### https://machinelearningmastery.com/develop-word-embeddings-python-gensim/

### https://blog.manash.me/how-to-use-pre-trained-word-vectors-from-facebooks-fasttext-a71e6d55f27

### http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/

In [None]:
from gensim.models import KeyedVectors

# Load the pre-trained vectors stored in binary word2vec format (file is looked up
# relative to the current working directory).
filename = 'GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

# calculate: (king - man) + woman = ?
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)

# Finding out similar words [default= top 10] for two probe words, printed as "word score".
for find_similar_to in ('Hogwarts', 'pain'):
    for similar_word in model.similar_by_word(find_similar_to):
        print(similar_word[0], similar_word[1])

# Same lookup for a third probe word, with a labelled printout and the score rounded to 2 decimals.
find_similar_to = 'obama'
for similar_word in model.similar_by_word(find_similar_to):
    print("Related word: {0}  ,Similarity: {1:.2f}".format(
        similar_word[0], similar_word[1]
    ))

# Getting the tokens.
# gensim >= 4.0 removed `KeyedVectors.vocab` (accessing it raises AttributeError) in favour
# of `index_to_key`; fall back to `vocab` so the cell still runs on gensim 3.x.
if hasattr(model, 'index_to_key'):
    words = list(model.index_to_key)
else:
    words = list(model.vocab)

# Printing out number of tokens available
print("Number of Tokens: {}".format(len(words)))

# Printing out the dimension of a word vector
print("Dimension of a word vector: {}".format(
    len(model[words[0]])
))

## GloVe

In [None]:
## Load in GloVe files by first changing current directory
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

glove_input_file = 'glove.6B.300d.txt'
word2vec_output_file = 'glove.6B.300d.txt.word2vec'
# load the Stanford GloVe model
filename = 'glove.6B.300d.txt.word2vec'

os.chdir('/Users/zacklarsen/Dropbox/Inference Analytics Team Folder/Zack Work/Glove.6B/')
try:
    # Converting the GloVe text file to word2vec format is slow, so only do it when the
    # converted file is not there yet (delete that file to force a fresh conversion).
    if not os.path.exists(word2vec_output_file):
        glove2word2vec(glove_input_file, word2vec_output_file)
    Glove_model = KeyedVectors.load_word2vec_format(filename, binary=False)
finally:
    ## Reset current directory
    # Done in `finally` so a failed conversion/load does not leave the notebook in Glove.6B/.
    os.chdir('/Users/zacklarsen/Dropbox/Inference Analytics Team Folder/Zack Work')

# Finding out similar words [default= top 10] for a few probe words, printed as "word score".
for find_similar_to in ('obama', 'trump', 'wiki'):
    for similar_word in Glove_model.similar_by_word(find_similar_to):
        print(similar_word[0], similar_word[1])


## Load the larger GloVe vectors trained on web-crawled text (`glove.840B.300d`)

In [None]:
## Load in GloVe files by first changing current directory
os.chdir('/Users/zacklarsen/Dropbox/Inference Analytics Team Folder/Zack Work/')
#!ls

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

glove_input_file = 'glove.840B.300d.txt'
word2vec_output_file = 'glove.840B.300d.txt.word2vec'
# Converting this large text file is slow, so skip it when the converted file already
# exists (delete that file to force a fresh conversion).
if not os.path.exists(word2vec_output_file):
    glove2word2vec(glove_input_file, word2vec_output_file)

# load the Stanford GloVe model
# Note: this rebinds `Glove_model`, replacing whatever model that name held before.
filename = 'glove.840B.300d.txt.word2vec'
Glove_model = KeyedVectors.load_word2vec_format(filename, binary=False)