# Data import

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from math import sqrt
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
FILES_DIR = './files/'
total = pd.read_csv(FILES_DIR + 'total.csv')


In [6]:
total.columns

Index(['Unnamed: 0', 'session_id', 'event_timestamp', 'event_name', 'user_no',
       'item_no', 'device_type', 'region', 'platform', 'rating', 'birth_date',
       'gender', 'age', 'item_name', 'image_name', 'price', 'category1_code',
       'category1_name', 'category2_code', 'category2_name', 'category3_code',
       'category3_name', 'brand_no', 'brand_name'],
      dtype='object')

In [7]:
total.platform.unique()

array(['IOS', 'ANDROID'], dtype=object)

In [8]:
male = total[total['gender'] == 'M'] #남자고객데이터
female = total[total['gender'] == 'F'] #여자고객데이터
android = total[total['platform'] == 'ANDROID'] #안드로이드고객데이터
ios = total[total['platform'] == 'IOS'] # IOS고객데이터

## 전체데이터

In [9]:
def recom_item(item):
    return total.loc[product_mean.sort_values(by='rating', ascending=False)[:item].index]['item_name']

product_mean = total.groupby(['item_no'])['rating'].mean()
product_mean = product_mean.reset_index()
recom_item(5)

7633    SWEATSHIRT THE FLOWER MARDI HEATHER-ORANGE
1194        [707-08227] FREE STYLE CARD CASE_BLACK
1140                    Dry flower earring [green]
4524                              앙떼 메리제인 - 버건디 와니
3810                 Benjamin-OB326-Crystal Yellow
Name: item_name, dtype: object

In [10]:
# train, test split
x = total.copy()
y = total['session_id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)


def RMSE(y_true, y_pred):
  return np.sqrt(np.mean(np.power(np.array(y_true)-np.array(y_pred), 2)))

def score(model):
  id_pairs = zip(x_test['session_id'], x_test['item_no'])
  y_pred = np.array([model(session_id, item) for (session_id, item) in id_pairs])
  y_true = np.array(x_test['rating'])
  return RMSE(y_true, y_pred)

In [11]:
def best_seller(session_id, item_no):
  try:
    rating = train_mean[item_no]
  except KeyError:
    rating = 2.5
  return rating

train_mean = x_train.groupby(['item_no'])['rating'].mean()
print('best-seller 방식의 정확도:', score(best_seller))

best-seller 방식의 정확도: 1.341969014456113


# 남성고객에게 추천

In [12]:
def recom_item(item):
    return male.loc[product_mean.sort_values(by='rating', ascending=False)[:item].index]['item_name']


product_mean = male.groupby(['item_no'])['rating'].mean()
product_mean = product_mean.reset_index()


recom_item(1)

# 2번째 추천 아이템에 1978번인데 sampling 하는 과정에서 아이템 1978번이 없어져서 에러가 나온다 그래서 1개만...

365    ts1201 포켓티-블랙
Name: item_name, dtype: object

In [13]:
male[male['item_name'] == 1978]

Unnamed: 0.1,Unnamed: 0,session_id,event_timestamp,event_name,user_no,item_no,device_type,region,platform,rating,...,image_name,price,category1_code,category1_name,category2_code,category2_name,category3_code,category3_name,brand_no,brand_name


In [14]:
product_mean.sort_values(by='rating', ascending=False)[:5].index

Int64Index([365, 1978, 1306, 1307, 648], dtype='int64')

In [15]:
# train, test split
x = male.copy()
y = male['session_id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)


def RMSE(y_true, y_pred):
  return np.sqrt(np.mean(np.power(np.array(y_true)-np.array(y_pred), 2)))

def score(model):
  id_pairs = zip(x_test['session_id'], x_test['item_no'])
  y_pred = np.array([model(session_id, item) for (session_id, item) in id_pairs])
  y_true = np.array(x_test['rating'])
  return RMSE(y_true, y_pred)

In [16]:
def best_seller(user_no, item_no):
  try:
    rating = train_mean[item_no]
  except KeyError:
    rating = 2.5
  return rating

train_mean = x_train.groupby(['item_no'])['rating'].mean()
print('best-seller 방식의 정확도:', score(best_seller))

best-seller 방식의 정확도: 1.4235553545932509


# 여성고객에게 추천

In [17]:
def recom_item(item):
    return female.loc[product_mean.sort_values(by='rating', ascending=False)[:item].index]['item_name']

product_mean = female.groupby(['item_no'])['rating'].mean()
product_mean = product_mean.reset_index()
recom_item(5)

9828            시스테마 벤토 런치 박스 1.65L 4colors
2693          S패드 롤업티 - WHITE, KHAKI, BLACK
9704    21SS Women Editor Shirt (Dark Navy)
688          PREVAIL Sweat Shorts (Vanilla)
7690       WAVY LAMPSHADE HAT_FOAMING CREAM
Name: item_name, dtype: object

In [18]:
# train, test split
x = female.copy()
y = female['session_id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)


def RMSE(y_true, y_pred):
  return np.sqrt(np.mean(np.power(np.array(y_true)-np.array(y_pred), 2)))

def score(model):
  id_pairs = zip(x_test['session_id'], x_test['item_no'])
  y_pred = np.array([model(session_id, item) for (session_id, item) in id_pairs])
  y_true = np.array(x_test['rating'])
  return RMSE(y_true, y_pred)

In [19]:
def best_seller(session_id, item_no):
  try:
    rating = train_mean[item_no]
  except KeyError:
    rating = 2.5
  return rating

train_mean = x_train.groupby(['item_no'])['rating'].mean()
print('best-seller 방식의 정확도:', score(best_seller))

best-seller 방식의 정확도: 1.3447435357817923


# 안드로이드 고객 추천

In [20]:
def recom_item(item):
    return android.loc[product_mean.sort_values(by='rating', ascending=False)[:item].index]['item_name']

product_mean = android.groupby(['item_no'])['rating'].mean()
product_mean = product_mean.reset_index()
recom_item(1)

# 남성고객데이터랑 똑같은 에러

1004    SAKUZAN COFFEE CUP SET GRAY ( 커피컵, 찻잔, 매트한 질감,...
Name: item_name, dtype: object

In [21]:
product_mean.sort_values(by='rating', ascending=False)[:5].index

Int64Index([1004, 4105, 2353, 2760, 2352], dtype='int64')

In [22]:
# train, test split
x = android.copy()
y = android['session_id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)


def RMSE(y_true, y_pred):
  return np.sqrt(np.mean(np.power(np.array(y_true)-np.array(y_pred), 2)))

def score(model):
  id_pairs = zip(x_test['session_id'], x_test['item_no'])
  y_pred = np.array([model(session_id, item) for (session_id, item) in id_pairs])
  y_true = np.array(x_test['rating'])
  return RMSE(y_true, y_pred)

In [23]:
def best_seller(session_id, item_no):
  try:
    rating = train_mean[item_no]
  except KeyError:
    rating = 2.5
  return rating

train_mean = x_train.groupby(['item_no'])['rating'].mean()
print('best-seller 방식의 정확도:', score(best_seller))

best-seller 방식의 정확도: 1.250605715623896


# IOS 고객 추천

In [29]:
def recom_item(item):
    return ios.loc[product_mean.sort_values(by='rating', ascending=False)[:item].index]['item_name']

product_mean = ios.groupby(['item_no'])['rating'].mean()
product_mean = product_mean.reset_index()
recom_item(2)

7297    아페쎄 여성 아이템 반팔 티셔츠 화이트 COEOP F26012 AAB
9037      [오마이걸 효정착용](CTD2) 브리즈 핀턱 코튼 블라우스_화이트
Name: item_name, dtype: object

In [25]:
product_mean.sort_values(by='rating', ascending=False)[:5].index

Int64Index([7297, 9037, 2919, 2920, 8706], dtype='int64')

In [26]:
# train, test split
x = ios.copy()
y = ios['session_id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)


def RMSE(y_true, y_pred):
  return np.sqrt(np.mean(np.power(np.array(y_true)-np.array(y_pred), 2)))

def score(model):
  id_pairs = zip(x_test['session_id'], x_test['item_no'])
  y_pred = np.array([model(session_id, item) for (session_id, item) in id_pairs])
  y_true = np.array(x_test['rating'])
  return RMSE(y_true, y_pred)

In [27]:
def best_seller(session_id, item_no):
  try:
    rating = train_mean[item_no]
  except KeyError:
    rating = 2.5
  return rating

train_mean = x_train.groupby(['item_no'])['rating'].mean()
print('best-seller 방식의 정확도:', score(best_seller))

best-seller 방식의 정확도: 1.4037116930862468
