In [19]:
import time
import json
import numpy as np
import pandas as pd
from pandas import json_normalize

In [20]:
def load_df(csv_path, nrows = None, json_cols = None):
    """Read a sessions CSV and flatten its JSON-encoded columns.

    Each column listed in ``json_cols`` is parsed with ``json.loads`` while
    the file is read, then expanded with ``json_normalize`` into one column
    per (possibly nested) key. The new columns are named
    ``"<column>.<key>"`` for traceability and the original JSON column is
    dropped. Flattened columns are appended after the remaining columns, in
    the order given by ``json_cols``.

    Parameters
    ----------
    csv_path : str or path-like
        CSV file to read.
    nrows : int, optional
        Number of rows to read; ``None`` (default) reads the whole file.
    json_cols : iterable of str, optional
        Names of the JSON-encoded columns to flatten. Defaults to
        ``['device', 'geoNetwork', 'totals', 'trafficSource']``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with every JSON column replaced by its flattened
        sub-columns.
    """
    if json_cols is None:
        json_cols = ['device', 'geoNetwork', 'totals', 'trafficSource']
    else:
        json_cols = list(json_cols)

    df = pd.read_csv(
        csv_path,
        # converters maps a column label to a function applied to every raw
        # cell of that column; json.loads turns the JSON text into a dict.
        converters = {col: json.loads for col in json_cols},
        # Important: keep the visitor id as text so it is not parsed as a number.
        dtype = {'fullVisitorId': 'str'},
        nrows = nrows,
    )

    flat_frames = []
    for col in json_cols:
        # One row per session, one column per key found in the dicts.
        flat_col = json_normalize(df[col].tolist())
        # Prefix with the source column name for traceability.
        flat_col.columns = [f"{col}.{subcol}" for subcol in flat_col.columns]
        # Rows are in the same order as df, so reuse its index for alignment.
        flat_col.index = df.index
        flat_frames.append(flat_col)

    # Drop all JSON columns and attach all flattened frames in one pass,
    # instead of copying the whole frame once per JSON column.
    return pd.concat([df.drop(columns = json_cols)] + flat_frames, axis = 1)


# Location of the test split, relative to the notebook's working directory.
csv_test_path = './test_v2.csv'
# nrows keeps its default (None), so the whole file is read and flattened.
test = load_df(csv_test_path)


In [21]:
import matplotlib.pyplot as plt
import json
import ast

# Purpose of this cell: print every session that has an unusually high hit
# count (>= mean + 2 standard deviations) AND comes from a country that
# contributes less than 0.01 % of all sessions, together with its hits.

# Sessions per country, most frequent first.
cnt = test['geoNetwork.country'].value_counts(sort=True)

# Share of each country in percent. Computed on the whole Series: cnt is
# labelled by country name, so integer keys such as cnt[i] only worked
# through pandas' deprecated positional fallback.
total_sessions = cnt.sum()  # named so the built-in sum() is not shadowed
country_share_pct = (cnt / total_sessions) * 100

# Identify the Non Operating Countries
lowCountries = country_share_pct[country_share_pct < 0.01].index.tolist()

# Hit-count threshold: two standard deviations above the mean.
test['totals.hits'] = pd.to_numeric(test['totals.hits'])
mean = test['totals.hits'].mean()
std = test['totals.hits'].std()
threshold = mean + 2.0*std

# Boolean mask of the sessions that come from a low-traffic country.
is_low_country = test['geoNetwork.country'].isin(lowCountries)

# Number of sessions per low-traffic country, keyed in order of first appearance.
hitWithCountry = {}
for country in test.loc[is_low_country, 'geoNetwork.country']:
    hitWithCountry[country] = hitWithCountry.get(country, 0) + 1

# Only the flagged sessions get their 'hits' payload parsed; evaluating the
# literal for every row of the frame is what made this cell too slow to finish.
anomalous = test.loc[is_low_country & (test['totals.hits'] >= threshold)]
for indx, country, hit, raw_hits, keyword in zip(
        anomalous.index,
        anomalous['geoNetwork.country'],
        anomalous['totals.hits'],
        anomalous['hits'],
        anomalous['trafficSource.keyword']):
    print(f"Index Number : {indx} Anomalous traffic Info: {country} Total Hits : {int(hit)}")
    # 'hits' is stored as the text of a Python literal (a list of dicts).
    hitsInfo = ast.literal_eval(raw_hits)
    for hit_info in hitsInfo:
        print(f"Hit Number: {hit_info['hitNumber']} Visited Page: {hit_info['appInfo']['screenName']} Keyword: {keyword}")

#newArr = test['hits'].to_numpy()


Index Number : 4867 Anomalous traffic Info: Papua New Guinea Total Hits : 47
Hit Number: 1 Visited Page: shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube Keyword: (not set)
Hit Number: 2 Visited Page: shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube Keyword: (not set)
Hit Number: 3 Visited Page: shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube/quickview Keyword: (not set)
Hit Number: 4 Visited Page: shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube/quickview Keyword: (not set)
Hit Number: 5 Visited Page: shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube Keyword: (not set)
Hit Number: 6 Visited Page: shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube Keyword: (not set)
Hit Number: 7 Visited Page: shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube/quickview Keyword: (not set)
Hit Number: 8 Visited Page: shop.googlemerchandisestore.com/google+redesig

Index Number : 23066 Anomalous traffic Info: Namibia Total Hits : 31
Hit Number: 1 Visited Page: shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube Keyword: (not set)
Hit Number: 2 Visited Page: shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube/quickview Keyword: (not set)
Hit Number: 3 Visited Page: shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube Keyword: (not set)
Hit Number: 4 Visited Page: shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube Keyword: (not set)
Hit Number: 5 Visited Page: shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube/quickview Keyword: (not set)
Hit Number: 6 Visited Page: shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube/quickview Keyword: (not set)
Hit Number: 7 Visited Page: shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube Keyword: (not set)
Hit Number: 8 Visited Page: shop.googlemerchandisestore.com/google+redesign/shop+b

KeyboardInterrupt: 

In [24]:
import ast
# Dump the page sequence of the first session (row label 0).
# The 'hits' cell holds the text of a Python literal: a list of per-hit dicts.
oneIndex = test.at[0, 'hits']
result = ast.literal_eval(oneIndex)
for hit_info in result:
    hit_number = hit_info['hitNumber']
    screen_name = hit_info['appInfo']['screenName']
    print(f"Hit Number: {hit_number} Visited Page: {screen_name}")

Hit Number: 1 Visited Page: shop.googlemerchandisestore.com/home
Hit Number: 2 Visited Page: shop.googlemerchandisestore.com/google+redesign/accessories
Hit Number: 3 Visited Page: shop.googlemerchandisestore.com/home
Hit Number: 4 Visited Page: shop.googlemerchandisestore.com/home


In [None]:
import matplotlib.pyplot as plt
import scipy.stats as stats

# Convert the page-view counts to numbers, store them back on the frame,
# then take the mean and standard deviation of the converted column.
pageviews = pd.to_numeric(test['totals.pageviews'])
test['totals.pageviews'] = pageviews
meanPageView = pageviews.mean()
stdPageView = pageviews.std()
#print(test['totals.pageviews'].min())
## Rule based on the Total PageView
# for i in range(len(test)):
#     pageView = test.loc[i , 'totals.pageviews']
#     if pageView > meanPageView + 3*stdPageView:
#         visitorId = test.loc[i, 'fullVisitorId']

# # Calculating probability density function (PDF) and plot normal curve
# pdf = stats.norm.pdf(test['totals.pageviews'].sort_values(), meanPageView, stdPageView)
# plt.plot(test['totals.pageviews'].sort_values(), pdf)
# plt.xlim([0,500])  
# plt.xlabel("Page Views", size=12)    
# plt.ylabel("Frequency", size=12)                
# plt.grid(True, alpha=0.3, linestyle="--")
# plt.show()



In [None]:
# Print the fullVisitorId of every row whose id already appeared in an
# earlier row (duplicated() flags the repeats, not the first occurrence).
# The mask is computed once and applied in a single selection instead of
# calling duplicated() twice and looking each row up individually.
repeat_mask = test['fullVisitorId'].duplicated()
for visitor_id in test.loc[repeat_mask, 'fullVisitorId']:
    print(visitor_id)

In [None]:
# Summary statistics of the number of hits per session.
test['totals.hits'] = pd.to_numeric(test['totals.hits'])
mean = test['totals.hits'].mean()
std = test['totals.hits'].std()
maximum = test['totals.hits'].max()
print(f"mean is {mean} and std is {std} and max is {maximum}")

# Count the sessions with exactly 500 hits, once, without shadowing the
# built-in sum(). The previous loop printed the running counter before
# incrementing it on every match, so the final count was never shown.
# NOTE(review): 500 looks like a per-session hit cap in this data - confirm.
HITS_CAP = 500
capped_sessions = int((test['totals.hits'] == HITS_CAP).sum())
print(f"sessions with exactly {HITS_CAP} hits: {capped_sessions}")

# Distribution of hits per session.
plt.hist(test['totals.hits'])
plt.xlabel("Hits per session")
plt.ylabel("Number of sessions")
plt.title("Distribution of totals.hits")
#plt.xlim(xmin=0, xmax = 501)
plt.show()

In [23]:
# Display the first row of the flattened frame.
test.head(1)

Unnamed: 0,channelGrouping,customDimensions,date,fullVisitorId,hits,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,...,trafficSource.medium,trafficSource.keyword,trafficSource.adContent,trafficSource.isTrueDirect,trafficSource.adwordsClickInfo.criteriaParameters,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.isVideoAd
0,Organic Search,"[{'index': '4', 'value': 'APAC'}]",20180511,7460955084541987166,"[{'hitNumber': '1', 'time': '0', 'hour': '21',...",Not Socially Engaged,1526099341,2,1526099341,Chrome,...,organic,(not provided),(not set),True,not available in demo dataset,,,,,
