# Feature Engineering
## Loading Libraries & Datasets

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import math
import matplotlib.pyplot as plt
from datetime import datetime
import re
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
item_metadata_filepath = '../raw_data/item_metadata.csv'
item_metadata = pd.read_csv(item_metadata_filepath)

train_filepath = '../clean_data/train.csv'
train = pd.read_csv(train_filepath)

## Item Global Features
### Number of Properties

In [3]:
item_metadata.properties = item_metadata.properties.apply(lambda x: x.split('|'))
item_metadata['NumberOfProperties'] = item_metadata.properties.apply(lambda x: len(x))
item_metadata.head()

Unnamed: 0,item_id,properties,NumberOfProperties
0,5101,"[Satellite TV, Golf Course, Airport Shuttle, C...",62
1,5416,"[Satellite TV, Cosmetic Mirror, Safe (Hotel), ...",46
2,5834,"[Satellite TV, Cosmetic Mirror, Safe (Hotel), ...",40
3,5910,"[Satellite TV, Sailing, Cosmetic Mirror, Telep...",41
4,6066,"[Satellite TV, Sailing, Diving, Cosmetic Mirro...",85


In [12]:
#getting total number of unique properties across all items
AllPropertiesList = item_metadata.properties.tolist()

AllPropertiesFlatList = []
for sublist in AllPropertiesList:
    for item in sublist:
        AllPropertiesFlatList.append(item)
        
print('Number of unique properties is', len(set(AllPropertiesFlatList)))

Number of unique properties is 157


### Items Properties Similarities
The purpose is to get cosine similarity between items. The maximum 25 items out of the list can be extracted as a dataframe and cosine similarity can be obtained in order to get the similar items to the ones the user had interacted with to be listed on the top of the list.  

In [6]:
item_metadata.properties = item_metadata.properties.apply(lambda x: tuple(x))

one_hot = MultiLabelBinarizer()

properties_encoded = one_hot.fit_transform(item_metadata.properties.values.tolist())

properties_encodedDF = pd.DataFrame(properties_encoded)

#changing column names
properties_list = one_hot.classes_.tolist()
for i in range(len(properties_list)):
    properties_encodedDF = properties_encodedDF.rename(columns={i:properties_list[i]})

#creating a column of the item id to get the similarity between items
item_metadata.item_id = item_metadata.item_id.apply(lambda x: str(x))
properties_encodedDF['item_id'] = item_metadata.item_id

properties_encodedDF.head()

Unnamed: 0,1 Star,2 Star,3 Star,4 Star,5 Star,Accessible Hotel,Accessible Parking,Adults Only,Air Conditioning,Airport Hotel,...,Terrace (Hotel),Theme Hotel,Towels,Very Good Rating,Volleyball,Washing Machine,Water Slide,Wheelchair Accessible,WiFi (Public Areas),WiFi (Rooms)
0,0,0,0,1,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,1,1
1,0,0,0,1,0,0,0,0,0,0,...,1,0,0,1,0,0,0,1,1,1
2,0,0,1,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,1,1
3,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,1,0,1,1,0,0,0,...,1,0,1,0,1,0,0,1,1,1


In [137]:
cosine_similarity(properties_encodedDF.set_index('item_id').iloc[0:25])[0]

array([1.        , 0.69283044, 0.6024145 , 0.69419307, 0.74385528,
       0.62604751, 0.7220817 , 0.75202407, 0.72586619, 0.77491695,
       0.68469194, 0.56796183, 0.65036141, 0.7573052 , 0.68861713,
       0.73636183, 0.5528638 , 0.66628253, 0.65991202, 0.68250015,
       0.74691014, 0.756971  , 0.57473697, 0.61833711, 0.5265603 ])

It needs to be discussed for further implementation.

## Number of Times in Impressions
The purpose of this feature is to check how many times an item had been shown to users in the list.

In engineer global features from the train dataset, which will be most of the cases around either clickouts or final click out examples (instances), two dataframes of them will be created.

In [3]:
train.drop(columns='Unnamed: 0', inplace=True)
ClickoutDF = train[train.action_type=='clickout item']
ClickoutDF.head(1)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
13,00RL8Z82B2Z1,aff3928535f48,1541037543,14,clickout item,109038,AU,"Sydney, Australia",mobile,,3400638|1253714|3367857|5100540|1088584|666916...,95|66|501|112|95|100|101|72|82|56|56|143|70|25...
15,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,55109|129343|54824|2297972|109014|1257342|1031...,162|25|150|143|101|49|118|131|18|100|101|143|5...
115,02SRUT1NQYH1,3599a6f709eab,1541063864,35,clickout item,2795374,FI,"Krakow, Poland",mobile,,2795374|5582964|1088390|2781070|1258068|127196...,64|54|36|121|76|81|92|40|73|52|98|104|56|414|6...
121,03K8AXBL4BX2,ec139e10b9238,1541100322,6,clickout item,1032816,UK,"London, United Kingdom",desktop,,12693|46363|81657|18448|47687|152913|18417|927...,104|92|100|103|102|104|72|85|81|75|107|86|98|8...
122,03K8AXBL4BX2,ec139e10b9238,1541100652,7,clickout item,1032816,UK,"London, United Kingdom",desktop,,12693|46363|81657|18448|47687|152913|18417|927...,104|92|100|103|102|104|72|85|81|75|107|86|98|8...


In [5]:
FinalClickoutDF = train[train.action_type=='clickout item'].groupby('session_id').tail(1)
FinalClickoutDF.head(1)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
15,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,55109|129343|54824|2297972|109014|1257342|1031...,162|25|150|143|101|49|118|131|18|100|101|143|5...


Finding number of times an item has been mentioned in a list.  
In order to get representative global values to the items, duplicated  must be removed. Rows would not all be duplicated because of some attributes such as timestamp, reference, and step.

In [6]:
ClickoutUniqueDF = ClickoutDF.drop_duplicates(subset=['session_id', 'impressions'], keep='first')

In [10]:
AllImpressionsList = ClickoutUniqueDF.impressions.apply(lambda x:x.split('|'))

AllImpressionsFlatList = []
for sublist in AllImpressionsList:
    for item in sublist:
        AllImpressionsFlatList.append(item)

InImpressionsCounter = Counter(AllImpressionsFlatList)
InImpressionsDF = pd.DataFrame.from_dict(InImpressionsCounter, orient='index').reset_index()\
                              .rename(columns={'index':'item_id', 0:'NumberInImpressions'})
InImpressionsDF.head()

Unnamed: 0,item_id,NumberInImpressions
0,3400638,492
1,1253714,84
2,3367857,161
3,5100540,179
4,1088584,147


In [11]:
InImpressionsDF.item_id.nunique(), len(InImpressionsDF), item_metadata.item_id.nunique()

(815092, 815092, 927142)

Number of items in this dataframe is less than the number of items in item_metadata, that's because some of the items had not been mentioned in the impressions list.

In [12]:
#left joining
item_metadata = item_metadata.merge(InImpressionsDF, on='item_id', how='left')

#filling NaN values with zeros
item_metadata.NoInImpressions = item_metadata.NumberInImpressions.fillna(0)

item_metadata.head()

Unnamed: 0,item_id,properties,NumberOfProperties,NumberInImpressions
0,5101,"(Satellite TV, Golf Course, Airport Shuttle, C...",62,89.0
1,5416,"(Satellite TV, Cosmetic Mirror, Safe (Hotel), ...",46,63.0
2,5834,"(Satellite TV, Cosmetic Mirror, Safe (Hotel), ...",40,157.0
3,5910,"(Satellite TV, Sailing, Cosmetic Mirror, Telep...",41,112.0
4,6066,"(Satellite TV, Sailing, Diving, Cosmetic Mirro...",85,54.0


## Number of Times in Reference
The purpose of this feature is to check how many times an item has been mentioned in the Reference attribute in the whole train set. 

In [15]:
InReferencesCounter = Counter(train.reference.values.tolist())
InReferencesDF = pd.DataFrame.from_dict(InReferencesCounter, orient='index').reset_index()\
                              .rename(columns={'index':'item_id', 0:'NumberInReferences'})

InReferencesDF.head()

Unnamed: 0,item_id,NumberInReferences
0,Newtown,16
1,666856,26
2,109038,253
3,Surry Hills,21
4,1257342,74


In [17]:
#left joining
item_metadata = item_metadata.merge(InReferencesDF, on='item_id', how='left')

#filling NaN values with zeros
item_metadata.NumberInReferences = item_metadata.NumberInReferences.fillna(0)

item_metadata.head()

Unnamed: 0,item_id,properties,NumberOfProperties,NumberInImpressions,NumberInReferences
0,5101,"(Satellite TV, Golf Course, Airport Shuttle, C...",62,89.0,14.0
1,5416,"(Satellite TV, Cosmetic Mirror, Safe (Hotel), ...",46,63.0,43.0
2,5834,"(Satellite TV, Cosmetic Mirror, Safe (Hotel), ...",40,157.0,102.0
3,5910,"(Satellite TV, Sailing, Cosmetic Mirror, Telep...",41,112.0,96.0
4,6066,"(Satellite TV, Sailing, Diving, Cosmetic Mirro...",85,54.0,47.0


## Number of Times in Clickout
The purpose of this feature is to check how many times an item has been clicked out

In [19]:
InClickoutCounter = Counter(ClickoutDF.reference.values.tolist())
InClickoutDF = pd.DataFrame.from_dict(InClickoutCounter, orient='index').reset_index()\
                              .rename(columns={'index':'item_id', 0:'NumberAsClickout'})

InClickoutDF.head()

Unnamed: 0,item_id,NumberInClickout
0,109038,53
1,1257342,20
2,2795374,42
3,1032816,4
4,65685,3


In [20]:
#left joining
item_metadata = item_metadata.merge(InClickoutDF, on='item_id', how='left')

#filling NaN values with zeros
item_metadata.NumberInClickout = item_metadata.NumberInClickout.fillna(0)

item_metadata.head()

Unnamed: 0,item_id,properties,NumberOfProperties,NumberInImpressions,NumberInReferences,NumberInClickout
0,5101,"(Satellite TV, Golf Course, Airport Shuttle, C...",62,89.0,14.0,7.0
1,5416,"(Satellite TV, Cosmetic Mirror, Safe (Hotel), ...",46,63.0,43.0,6.0
2,5834,"(Satellite TV, Cosmetic Mirror, Safe (Hotel), ...",40,157.0,102.0,12.0
3,5910,"(Satellite TV, Sailing, Cosmetic Mirror, Telep...",41,112.0,96.0,5.0
4,6066,"(Satellite TV, Sailing, Diving, Cosmetic Mirro...",85,54.0,47.0,8.0


## Number of Time in Final Clickout
The purpose of this feature is to get the number of times an item had been mentioned as a final clickout.

In [24]:
FinalClickoutDF
InFinalClickoutCounter = Counter(FinalClickoutDF.reference.values.tolist())
InFinalClickoutDF = pd.DataFrame.from_dict(InFinalClickoutCounter, orient='index').reset_index()\
                              .rename(columns={'index':'item_id', 0:'NumberAsFinalClickout'})

InFinalClickoutDF.head()

Unnamed: 0,item_id,NumberAsFinalClickout
0,1257342,7
1,2795374,18
2,1032816,1
3,1320460,1
4,3143258,11


In [25]:
#left joining
item_metadata = item_metadata.merge(InFinalClickoutDF, on='item_id', how='left')

#filling NaN values with zeros
item_metadata.NumberAsFinalClickout = item_metadata.NumberAsFinalClickout.fillna(0)

item_metadata.head()

Unnamed: 0,item_id,properties,NumberOfProperties,NumberInImpressions,NumberInReferences,NumberInClickout,NumberInFinalClickout,NumberAsFinalClickout
0,5101,"(Satellite TV, Golf Course, Airport Shuttle, C...",62,89.0,14.0,7.0,4.0,4.0
1,5416,"(Satellite TV, Cosmetic Mirror, Safe (Hotel), ...",46,63.0,43.0,6.0,2.0,2.0
2,5834,"(Satellite TV, Cosmetic Mirror, Safe (Hotel), ...",40,157.0,102.0,12.0,1.0,1.0
3,5910,"(Satellite TV, Sailing, Cosmetic Mirror, Telep...",41,112.0,96.0,5.0,3.0,3.0
4,6066,"(Satellite TV, Sailing, Diving, Cosmetic Mirro...",85,54.0,47.0,8.0,6.0,6.0


In [27]:
item_metadata.head()

Unnamed: 0,item_id,properties,NumberOfProperties,NumberInImpressions,NumberInReferences,NumberInClickout,NumberAsFinalClickout
0,5101,"(Satellite TV, Golf Course, Airport Shuttle, C...",62,89.0,14.0,7.0,4.0
1,5416,"(Satellite TV, Cosmetic Mirror, Safe (Hotel), ...",46,63.0,43.0,6.0,2.0
2,5834,"(Satellite TV, Cosmetic Mirror, Safe (Hotel), ...",40,157.0,102.0,12.0,1.0
3,5910,"(Satellite TV, Sailing, Cosmetic Mirror, Telep...",41,112.0,96.0,5.0,3.0
4,6066,"(Satellite TV, Sailing, Diving, Cosmetic Mirro...",85,54.0,47.0,8.0,6.0


The next step would be dividing the NumberAsFinalClickout by the other 3, in order to get FinalClickout Relativity

## Final Clickout To Impressions
The purpose of this feature is to get item's rate of clicking out when listed to the user.

In [32]:
FClickoutToImpressions = item_metadata.NumberAsFinalClickout/item_metadata.NumberInImpressions
FClickoutToImpressions.head()

0    0.044944
1    0.031746
2    0.006369
3    0.026786
4    0.111111
dtype: float64

In [34]:
#adding attribute
item_metadata['FClickoutToImpressions'] = FClickoutToImpressions
item_metadata.head(1)

Unnamed: 0,item_id,properties,NumberOfProperties,NumberInImpressions,NumberInReferences,NumberInClickout,NumberAsFinalClickout,FClickoutToImpressions
0,5101,"(Satellite TV, Golf Course, Airport Shuttle, C...",62,89.0,14.0,7.0,4.0,0.044944


## Final Clickout To References
The purpose of this feature is to get item's rate of clicking out when it was interacted with.

In [35]:
FClickoutToReferences = item_metadata.NumberAsFinalClickout/item_metadata.NumberInReferences
FClickoutToReferences.head()

0    0.285714
1    0.046512
2    0.009804
3    0.031250
4    0.127660
dtype: float64

In [36]:
#adding attribute
item_metadata['FClickoutToReferences'] = FClickoutToReferences
item_metadata.head(1)

Unnamed: 0,item_id,properties,NumberOfProperties,NumberInImpressions,NumberInReferences,NumberInClickout,NumberAsFinalClickout,FClickoutToImpressions,FClickoutToReferences
0,5101,"(Satellite TV, Golf Course, Airport Shuttle, C...",62,89.0,14.0,7.0,4.0,0.044944,0.285714


## Final Clickout To Clickout
The purpose of this feature is to get item's rate of clickout when it was clicked out before.

In [37]:
FClickoutToClickout = item_metadata.NumberAsFinalClickout/item_metadata.NumberInClickout
FClickoutToClickout.head()

0    0.571429
1    0.333333
2    0.083333
3    0.600000
4    0.750000
dtype: float64

In [38]:
#adding attribute
item_metadata['FClickoutToClickout'] = FClickoutToClickout
item_metadata.head(1)

Unnamed: 0,item_id,properties,NumberOfProperties,NumberInImpressions,NumberInReferences,NumberInClickout,NumberAsFinalClickout,FClickoutToImpressions,FClickoutToReferences,FClickoutToClickout
0,5101,"(Satellite TV, Golf Course, Airport Shuttle, C...",62,89.0,14.0,7.0,4.0,0.044944,0.285714,0.571429


## Item's Average Rank
The purpose of this feature is to get item's position in the list provided to the user across the train set.  
Since the purpose is to get the average rank across the lists shown to users, an important notice which is that some sessions have different reference and click outs, which provides the same impression list. Duplicated impressions lists in each session should be dropped. (Same thing applies to price as well.)

In [46]:
# using All Clickout dataframe, but the one with the unique impressions for each session for this feature
ClickoutUniqueDF.head(1)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
13,00RL8Z82B2Z1,aff3928535f48,1541037543,14,clickout item,109038,AU,"Sydney, Australia",mobile,,3400638|1253714|3367857|5100540|1088584|666916...,95|66|501|112|95|100|101|72|82|56|56|143|70|25...


In [None]:
UniqueSessionImpressionsDF = ClickoutUniqueDF[['session_id', 'impressions']]
UniqueSessionImpressionsDF.impressions = UniqueSessionImpressionsDF.impressions.apply(lambda x: x.split('|'))
pd.DataFrame({'session_id':UniqueSessionImpressionsDF.session_id.repeat(UniqueSessionImpressionsDF.impressions.str.len()),
              'item_id':UniqueSessionImpressionsDF.impressions.sum()})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [54]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [50]:
pd.DataFrame({'session_id':ClickoutUniqueDF[['session_id', 'impressions']].impressions.apply(lambda x: x.split('|'))

13          [3400638, 1253714, 3367857, 5100540, 1088584, ...
15          [55109, 129343, 54824, 2297972, 109014, 125734...
115         [2795374, 5582964, 1088390, 2781070, 1258068, ...
121         [12693, 46363, 81657, 18448, 47687, 152913, 18...
176         [1306936, 56482, 2842358, 6881276, 65685, 6325...
180         [1258184, 3866722, 8929970, 2315702, 116619, 1...
181         [2349076, 2552514, 7159866, 3898458, 1846017, ...
184         [6721, 6724, 40109, 147227, 80983, 6719, 40718...
188         [22953, 22951, 22947, 22949, 8774118, 45527, 8...
196         [129881, 36161, 36249, 464421, 302607, 36401, ...
230         [8561, 8564, 8568, 8586, 8588, 102080, 8613, 5...
236         [2603858, 1765201, 1844107, 99445, 99438, 2219...
237         [921237, 147262, 42194, 9051714, 42109, 42125,...
240                      [2035675, 4095738, 4933410, 9167996]
260         [104499, 104710, 104724, 164677, 960133, 58165...
264         [9172186, 6816668, 4497388, 1827505, 2066390, ...
280     

## Item's Average Price Final Clickout
The purpose of this feature is to get item's average price accross the train set.

**Unkown cells

In [None]:
#loading train set
ReferenceTrain = pd.read_csv('/content/drive/My Drive/Trivago/Clean Dataset/train.csv')
ReferenceTrain = ReferenceTrain.drop(columns='Unnamed: 0')

#converting words values in referenc into NaN
ReferenceTrain.reference = ReferenceTrain.reference.apply(lambda x: pd.to_numeric(x, errors='coerce')).dropna().astype(int).apply(lambda x: str(x))

#dropping NaN in reference
ReferenceTrain = ReferenceTrain.drop(ReferenceTrain[ReferenceTrain.reference.isna()].index.tolist())

#what I meant by the previous step
train.reference, ReferenceTrain.reference

In [None]:
ReferenceTimeSpent = (ReferenceTrain.groupby(['session_id', 'reference']).timestamp.max() 
                      - ReferenceTrain.groupby(['session_id', 'reference']).timestamp.min())
ReferenceTimeSpent

In [None]:
ReferenceTrain.to_csv('train_ref.csv')                  #DO NOT RUN BEFORE INVESTIGATING FOR THE BEST PRACTICES
!cp train_ref.csv '/content/drive/My Drive/Trivago/Clean Dataset/'

##Feature Engineering

###Items Interacted With & Number of Interactions

Taking a sample of the training set to perform trials

In [None]:
trainlet = train.iloc[0:10000]

#getting session_id with no clickouts
session_idNoClickouts = list(set(trainlet.session_id.unique()) - set(trainlet[trainlet.action_type == 'clickout item'].session_id.unique()))

#dropping sessions with no clickouts
trainlet = trainlet[~trainlet.session_id.isin(session_idNoClickouts)].drop(columns='Unnamed: 0')

#will be working on a grouped by sample, having the count of steps for each reference got interacted with
trainlet1 = trainlet.groupby(['session_id', 'reference'], sort=False).step.count().to_frame()

#converting an session_id as index into a column
trainlet1.reset_index(level=0, inplace=True)

#converting an reference as index into a column
trainlet1.reset_index(level=0, inplace=True)

#changing order of columns
trainlet1 = trainlet1[['session_id', 'reference','step']]

#takng a look
trainlet1.reference.head(10)

In [None]:
#converting alphabetic values in reference attribute into NaN by converting all values into numeric, then again converting values into string
trainlet1.reference = trainlet1.reference.apply(lambda x: pd.to_numeric(x, errors='coerce')).dropna().astype(int).apply(lambda x: str(x))

#taking a look
trainlet1.reference.head(10)

In [None]:
#for the sake of comparison creating another version of trainlet by repeating the previous steps
trainlet0 = trainlet.groupby(['session_id', 'reference'], sort=False).step.count().to_frame()
trainlet0.reset_index(level=0, inplace=True)
trainlet0.reset_index(level=0, inplace=True)
trainlet0 = trainlet0[['session_id', 'reference','step']]

#taking a look
trainlet0

In [None]:
#changing the name of attribute step into NumberOfInteractions
trainlet0 = trainlet0.rename(columns = {'step':'NumberOfInteractions'})
trainlet1 = trainlet1.rename(columns = {'step':'NumberOfInteractions'})

#taking a look
trainlet1

In [None]:
#for future use before dropping the NaN values, getting the indeces of the word values in reference attribute
#getting the index of words values in reference attribute
WordsInReferenceIndex = list(pd.isnull(trainlet1).any(1).nonzero())[0].tolist()

#getting reference with words dataframe
ReferenceWordsDF = trainlet0.iloc[WordsInReferenceIndex, :]  #for future**

In [None]:
#dropping NaN values(rows)
trainlet1 = trainlet1.dropna()

#taking a look
trainlet1

In [None]:
#getting reference and Number of interactions lists
InteractedWithItems = trainlet1.groupby('session_id', sort=False)['reference'].apply(list)
NumberOfInteractions = trainlet1.groupby('session_id', sort=False)['NumberOfInteractions'].apply(list)

#taking a look
InteractedWithItems, NumberOfInteractions

In [None]:
#converting series into lists
InteractedWithItems = InteractedWithItems.tolist()
NumberOfInteractions = NumberOfInteractions.tolist()

Adding the two attributes to trainlet

In [None]:
#getting a list of indeces InteractedWithItems and NumberOfInteractions should be left joined to
FinalClickoutIndex = trainlet[trainlet.action_type=='clickout item'].groupby('session_id').tail(1).index.tolist()

#creating InteractedWithItems dataframe
InteractedWithItemsDF = pd.DataFrame({'FinalClickoutIndex':FinalClickoutIndex, 'InteractedWitItems':InteractedWithItems,
                                      'NumberOfInteractions':NumberOfInteractions}).set_index('FinalClickoutIndex')

#left join on dataset
trainlet = trainlet.join(InteractedWithItemsDF)
trainlet

###Time Spent

Creating a time spent attribute for each reference within a session.

In [None]:
#This should be a seperate function

#getting session_id with no clickouts
session_idNoClickouts = list(set(trainlet.session_id.unique()) - set(trainlet[trainlet.action_type == 'clickout item'].session_id.unique()))

#dropping sessions with no clickouts
trainlet = trainlet[~trainlet.session_id.isin(session_idNoClickouts)].drop(columns='Unnamed: 0')

#obtaining the seconds spent on a reference by subtracting the time started viewing the item till the time of last interaction with the item
time_spent = trainlet.groupby(['session_id', 'reference'], sort=False).timestamp.apply(lambda x:(x.max() - x.min())).to_frame()

#converting an session_id as index into a column
time_spent.reset_index(level=0, inplace=True)

#converting an reference as index into a column
time_spent.reset_index(level=0, inplace=True)

#changing order of columns
time_spent = time_spent[['session_id', 'reference','timestamp']]

#changing the timestamp into SecondsSpent
time_spent = time_spent.rename(columns = {'timestamp':'SecondsSpent'})

#converting alphabetic values in reference attribute into NaN by converting all values into numeric, then again converting values into string
time_spent.reference = time_spent.reference.apply(lambda x: pd.to_numeric(x, errors='coerce')).dropna().astype(int).apply(lambda x: str(x))

#dropping NaN values(rows)
time_spent = time_spent.dropna()

#getting time spent of interactions lists
SecondsSpent = time_spent.groupby('session_id', sort=False)['SecondsSpent'].apply(list)

#converting into list
SecondsSpent = SecondsSpent.tolist()

#getting a list of indeces SecondsSpent to be left joined
FinalClickoutIndex = trainlet[trainlet.action_type=='clickout item'].groupby('session_id').tail(1).index.tolist()

#creating Seconds dataframe
SecondsSpentDF = pd.DataFrame({'FinalClickoutIndex':FinalClickoutIndex, 'SecondsSpent':SecondsSpent}).set_index('FinalClickoutIndex')

#left join on dataset
trainlet = trainlet.join(SecondsSpentDF)
trainlet

In [None]:
time_spent.groupby('session_id', sort=False)['SecondsSpent'].apply(list)

In [None]:
SecondsSpent = time_spent.groupby('session_id', sort=False)['SecondsSpent'].apply(list)

In [None]:
#creating a function
def SecondsSpent(dataset):
  #getting session_id with no clickouts
  session_idNoClickouts = list(set(dataset.session_id.unique()) - set(dataset[dataset.action_type == 'clickout item'].session_id.unique()))

  #dropping sessions with no clickouts
  dataset = dataset[~dataset.session_id.isin(session_idNoClickouts)].drop(columns='Unnamed: 0')

  #obtaining the seconds spent on a reference by subtracting the time started viewing the item till the time of last interaction with the item
  time_spent = dataset.groupby(['session_id', 'reference'], sort=False).timestamp.apply(lambda x:(x.max() - x.min())).to_frame()

  #converting an session_id as index into a column
  time_spent.reset_index(level=0, inplace=True)

  #converting an reference as index into a column
  time_spent.reset_index(level=0, inplace=True)

  #changing order of columns
  time_spent = time_spent[['session_id', 'reference','timestamp']]

  #changing the timestamp into SecondsSpent
  time_spent = time_spent.rename(columns = {'timestamp':'SecondsSpent'})

  #converting alphabetic values in reference attribute into NaN by converting all values into numeric, then again converting values into string
  time_spent.reference = time_spent.reference.apply(lambda x: pd.to_numeric(x, errors='coerce')).dropna().astype(int).apply(lambda x: str(x))

  #dropping NaN values(rows)
  time_spent = time_spent.dropna()

  #getting time spent of interactions lists
  SecondsSpent = time_spent.groupby('session_id', sort=False)['SecondsSpent'].apply(list)

  #converting into list
  SecondsSpent = SecondsSpent.tolist()

  #getting a list of indeces SecondsSpent to be left joined
  FinalClickoutIndex = dataset[dataset.action_type=='clickout item'].groupby('session_id').tail(1).index.tolist()

  #creating Seconds dataframe
  SecondsSpentDF = pd.DataFrame({'FinalClickoutIndex':FinalClickoutIndex, 'SecondsSpent':SecondsSpent}).set_index('FinalClickoutIndex')

  #left join on dataset
  dataset = dataset.join(SecondsSpentDF)
return dataset

In [None]:
trainlet2 = train.iloc[0:10000]
trainlet2.head()

In [None]:
SecondsSpent(trainlet2)

In [None]:
pd.DataFrame({'FinalClickoutIndex':FinalClickoutIndex, 'SecondsSpent':SecondsSpent}).set_index('FinalClickoutIndex')

In [None]:
trainlet[trainlet.action_type=='clickout item'].groupby('session_id').tail(1).index.tolist()

In [None]:
#dropping NaN values(rows)
trainlet1 = trainlet1.dropna()

#taking a look
trainlet1

In [None]:
#getting a list of values of time spent on each reference in a session
time_spent_values = time_spent.values

#getting the index of the time_spent_values in dataset
index = trainlet.groupby(['session_id', 'reference'], sort=False).tail(1).index

#creating a dataframe for a left join on train set
time_spent_df = pd.DataFrame({'index':index, 'seconds_spent':time_spent_values})
time_spent_df.head(2)

In [None]:
#making the index column as the index for dataframe
time_spent_df = time_spent_df.set_index('index')

#left join to train set on index
train = train.join(time_spent_df)
train.head(15)

In [None]:
#exporting dataframe to Google drive  DO NOT RUN THIS CELL UNLESS MODIFIED
train.to_csv('train.csv')

In [None]:
#loading file                         DO NOT RUN THIS CELL UNLESS MODIFIED (SEARCH FOR BEST PRACTICES IN THIS CASE)   
train = pd.read_csv('/content/drive/My Drive/Trivago/Clean Dataset/train.csv')
train.head(15)

###Price

In [None]:
Taking a session as an example, if there are three unique references. If we had the ones that have the same impressions which are the first two, and we get a table out of the impressions and prices, we will find that they have the same price exactly.
By proceeding with the KNN on this table(features can later on be added, and analysis on how different properties matter), we can have a list of 5 items at least(which are the most important) and go from there.	

In [None]:
len(train[train.session_id == 'aff3928535f48'][train.action_type == 'clickout item'].tail(1).impressions.values[0].split('|'))

In [None]:
items = train[train.session_id == 'aff3928535f48'][train.action_type == 'clickout item'].tail(1).impressions.values[0].split('|')

In [None]:
all_interacted_with_items = train[train.session_id == 'aff3928535f48'].reference.unique().tolist()
interacted_with_items = []
interacted_with_items_prices = []
for item in all_interacted_with_items:
  for i in range(len(impression)):
    if item == impression[i]:
        interacted_with_items.append(item)
        print(item)
        interacted_with_items_prices.append(price[i])
        print(price[i])

In [None]:
for item in all_interacted_with_items:
  for i in range(len(impression)):
    if item == impression[i]:
      print(item)

In [None]:
all_interacted_with_items

In [None]:
def impression_price(session_id):               #the isssue with this function is that ignores the first clickouts
                                                #(there are valuable infromation that can be extracted)
  try:                                          #some of the functions don't apply on the some sessions
    impression = train[train.session_id == session_id][train.action_type == 'clickout item'].tail(1).impressions.values[0].split('|')
    price = train[train.session_id == session_id][train.action_type == 'clickout item'].tail(1).prices.values[0].split('|')
    price = list(map(int, price))               #converting list of strings into integers
    clickout_item = train[train.session_id == session_id][train.action_type == 'clickout item'].tail(1).reference.values[0]
    all_interacted_with_items = train[train.session_id == session_id].reference.unique().tolist()
    for i in range(len(impression)):            #getting the clickout item
      if clickout_item == impression[i]:
        rank = i
    interacted_with_items = []
    interacted_with_items_prices = []
    for item in all_interacted_with_items:      #getting interacted with items
      for i in range(len(impression)):
        if item == impression[i]:
          interacted_with_items.append(item)
          interacted_with_items_prices.append(price[i])
    plt.figure(figsize=(10,8))
    plt.title('Impressions and Prices', fontsize=30)
    plt.xlabel('Impressions', fontsize=20)
    plt.ylabel('Price', fontsize=20)
    plt.xticks(rotation=90)
    plt.plot(impression, price, 'o')
    plt.plot(interacted_with_items, interacted_with_items_prices, 'o', color='red')
    plt.plot(clickout_item, price[rank] , 'o', color='black')
  except:
    pass

listOfSessions = random.choices(train.session_id.unique(), k=10)
for session_id in listOfSessions:
  impression_price(session_id)


TRY \ to have a the code running normally in a well organized shape

In [None]:
listOfSessions = train.session_id.unique()[0:100]
for session_id in listOfSessions:
  impression_price(session_id)

Impressions shown on the graphs are put in a order of the rank provided by Trivago last list. After taking a quick look at the graphs (sample), I can see a pattern of having the black dot (clickout item) somewhere close to the red dots, at least not very far away. 
I'll need to validate that the features prices and ranks play an important role of user choice eventually.