In [4]:
#Imports
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv("../data/cleaned/cleaned_data.csv", parse_dates=['event_time'])
df = df.sample(n=50000, random_state=5)



In [4]:
from sklearn.preprocessing import OrdinalEncoder

#Initialize the encoder
oe = OrdinalEncoder()
 
#Get just the hour of the event occurrence
df['event_hour'] = df['event_time'].dt.hour

#Extract & Encode
colsToEncode = ['event_type']
encodedCols = oe.fit_transform(df[colsToEncode])

#Add the encoded values to new columns
df['event_type_encoded'] = encodedCols[:, 0]



print(df.head())



                        event_time event_type  product_id  \
18380884 2019-10-22 05:20:20+00:00       view     3601485   
16999937 2019-10-20 13:54:08+00:00       view    12100456   
16260397 2019-10-19 17:34:11+00:00       view     1005160   
21484014 2019-10-25 15:05:38+00:00       view     1005101   
20900754 2019-10-25 02:34:55+00:00       view     1003312   

                  category_id              category_code   brand   price  \
18380884  2053013563810775923  appliances.kitchen.washer      lg  308.63   
16999937  2053013555816432043                kids.skates   kugoo  260.47   
16260397  2053013555631882655     electronics.smartphone  xiaomi  224.68   
21484014  2053013555631882655     electronics.smartphone  xiaomi  431.93   
20900754  2053013555631882655     electronics.smartphone   apple  690.73   

            user_id                          user_session  event_hour  \
18380884  534946088  b62bf744-b5e0-4c32-9f0d-e292feb295f6           5   
16999937  562176256  f939ecff-

In [5]:
#Purpose of this is to predict the price of an item given the time of day it is purchased
#So drop the price column
X = df.drop(columns=['price', 'user_id', 'user_session', 'category_code', 'brand','category_id', 'product_id', 'event_time', 'event_type'])
y = df[["price"]]

import scipy
from pyod.models.lof import LOF


#LOF model with 10 neighbors
lof = LOF(n_neighbors=10)

#Fit model
lof.fit(X)

#Extract labels and scores
labels = lof.labels_
scores = lof.decision_scores_



df['Outliers'] = labels

print("Outliers:", labels)
print("LOF scores:", scores)

numOutliers = (df['Outliers'] == 1).sum()
print(f"Total Number of Outliers: {numOutliers}")



Outliers: [0 0 0 ... 0 0 0]
LOF scores: [1. 1. 1. ... 1. 1. 1.]
Total Number of Outliers: 24


In [6]:
#Now lets try to predict anomalous prices per category, perhaps to represent luxury items.
#Initialize the encoder
oeTwo = OrdinalEncoder()
 

#Extract and Encode
encoded_category_code = oe.fit_transform(df[['category_code']])
encoded_brand = oe.fit_transform(df[['brand']])

#Put into new columns
df['encoded_category_code'] = encoded_category_code
df['encoded_brand'] = encoded_brand



In [26]:
#Same as before
X = df[['encoded_category_code', 'encoded_brand']]
y = df[["price"]] 

lof = LOF(n_neighbors=10)
lof.fit(X)


df['Outliers'] = labels

print("Outliers:", labels)
print("LOF scores:", scores)

#Get outliers
numOutliers = (df['Outliers'] == 1).sum()
print(f"Total Number of Outliers: {numOutliers}")


Outliers: [0 0 0 ... 0 0 0]
LOF scores: [1. 1. 1. ... 1. 1. 1.]
Total Number of Outliers: 24


In [None]:
#User behavior anomalies

oeThree = OrdinalEncoder()

encoded_user_id = oe.fit_transform(df[['user_id']])
encoded_user_session = oe.fit_transform(df[['user_session']])
df['encoded_user_id'] = encoded_user_id
df['encoded_user_session'] = encoded_user_session


X = df[['encoded_user_id', 'encoded_user_session']]

lof = LOF(n_neighbors = 10)
lof.fit(X)


labels = lof.labels_
scores = lof.decision_scores_

df['Outliers'] = labels

print("Outliers:", labels)#
print("LOF scores:", scores)

#Get outliers
numOutliers = (df['Outliers'] == 1).sum()
print(f"Total Number of Outliers: {numOutliers}")


Outliers: [0 0 0 ... 1 0 0]
LOF scores: [0.98873923 1.00135211 1.01834153 ... 1.07506541 1.04051256 1.01393246]
Total Number of Outliers: 5000


Local Outlier Factor (LOF) is the algorithm that was implemented on the dataset. Local Outlier Factor has 7 steps that should be focused on. These steps include calculating the distance between points, determining k and k-nearest neighbors, computing reachability distances, calculating the Local Reachability Density for each point, computing the LOF score for each point, and identifying outliers based on the score. 

	1.Calculate Distances
	    a. Measure the distance between each pair of points in the dataset
	    b. If you have an n-dimensional dataset, calculate the Euclidean distance between every pair of points 
	
    2.Determine k
	    a. Decide on a number k that will indicate the number of neighbors to consider for each point. 
	    b. Larger k-values work better with larger datasets and vice versa. 
	
    3.Find k-distance for each point
	    a. Sort the distances to all other points
	    b. Identify the distance to the kth nearest neighbor
	    c. The points within that distance are k-nearest neighbors 
	
    4.Computing Reachability Distance
	    a. Use this formula to compute the reachability distance 
	        i. P = point, N = k-nearest neighbor
	        ii. reachability distance(P,N) = max(k distance(N),distance(P,N))
	
    5.Calculate the Local Reachability Density
	    a. Use this formula to compute the Local Reachability Density
	        i. For each point P
	            1. Sum the reachability distances from P to each of its k-nearest neighbors
	            2. Divide k by this sum to get the Local Reachability Density
	            3. LRD(P) = k /∑N in k nearest neighbors of Preachability distance(P,N)k 
	
    6.Calculate the LOF Score
	    a. Find all the Local Reachability Densities of all P’s k-nearest neighbors
	    b. Use this formula to calculate the Local Outlier Factor for P 
	        i. Average the ratio of the LRDs of P’s neighbors to the Local Reachability Density of P 
	        ii. LOF(P) = 1/k N in k nearest neighbors of P∑ LRD(N)/LRD(P) 
	
    7.Identify Anomalies
	    a. Review the LOF scores
	    b. Set a threshold for outlet detection (for example, points with an LOF score of 1.5-2 or above can be considered outliers) 
	    c. Points with LOF scores that are above the set threshold can be outliers. 


There were definitely anomalies, only 24 each for the price-based ones, but 5000 for the user activity based ones.

I think for the first set of anomalies, the anomalies that were based on the price given brand attributes and the price of an item an action was done to at a time of day, it makes sense that they would be so low. For the price given brand attributes, it makes sense most brands would keep a consistent pricing structure. I thought some items may be varied in price, but it seems that variance is accounted for. As for the price relating to actions at a time of day, this one makes sense to have so few anomalies as well. The time of day you purchase an item wouldn't have much correlation with the price of the item you're chosing to buy.

It was interesting to see 5,000 anomalies (from a sample of 50,000) for determining oddities in user behavior. A few explanations is that for some users, a low amount of entries may account for skew. So if they only view, then finally bought (as if scouting out product), then it makes sense. New users would be especially susceptible to this. Only one deviant action would account for an outlier. Another possibility is that some users use their accounts far more often, where as some don't. It is possible some accounts also spend far more than others, due to larger households or perhaps for business purposes.