In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
import os

In [3]:
path = "./data/"
books_df = pd.read_csv(path + "books_word2vec.csv", index_col="Unnamed: 0")
users_df = pd.read_csv(path + "users.csv")
rating_df = pd.read_csv(path + "train_ratings.csv")

In [4]:
print(books_df.columns)
print(users_df.columns)
print(rating_df.columns)

Index(['isbn', 'book_title', 'book_author', 'year_of_publication', 'publisher',
       'img_url', 'language', 'category', 'summary', 'img_path'],
      dtype='object')
Index(['user_id', 'location', 'age'], dtype='object')
Index(['user_id', 'isbn', 'rating'], dtype='object')


In [None]:
t = users_df[(users_df['age'] >= 85) | (users_df['age'] <= 10)]
users_df.drop(t.index, axis=0, inplace=True)

In [None]:
books_df.drop(columns = ["img_path","img_url"], inplace=True)

In [None]:
books_df['location'] = books_df['location'].str.replace(r'[^0-9a-zA-Z:,]', '') # 특수문자 제거

books_df['location_city'] = books_df['location'].apply(lambda x: x.split(',')[0].strip())
books_df['location_state'] = books_df['location'].apply(lambda x: x.split(',')[1].strip())
books_df['location_country'] = books_df['location'].apply(lambda x: x.split(',')[2].strip())

books_df = books_df.replace('na', np.nan) #특수문자 제거로 n/a가 na로 바뀌게 되었습니다. 따라서 이를 컴퓨터가 인식할 수 있는 결측값으로 변환합니다.
books_df = books_df.replace('', np.nan) # 일부 경우 , , ,으로 입력된 경우가 있었으므로 이런 경우에도 결측값으로 변환합니다.

books_df.drop('location', axis=1, inplace=True)

In [None]:
user_rating_df = pd.merge(users_df, rating_df, how='left', on='user_id')
data = pd.merge(user_rating_df, books_df, how='left', on='isbn')
data.head(3)

In [None]:
print(len(data))
data['user_id'].nunique()

In [None]:
def fillAge(idx):
    global data
    d2 = data.drop(idx, axis=0)
    d2 = d2.groupby('isbn').aggregate({"age":np.average})
    d2['age'] = d2['age'].astype(int)
    
    avg_age = {}
    for i, isbn in data.loc[idx,'isbn'].items():
        try: 
            avg_age[i] = d2.loc[isbn, 'age']
        except:
            pass
        
    avg_age = pd.DataFrame(avg_age, index=[0]).T
    avg_age.rename(columns={0:"avgAge"}, inplace=True)
    return avg_age

In [None]:
age_na_idx = data[data['age'].isna()].index

In [None]:
a = fillAge(age_na_idx)
a

In [None]:
data.loc[a.index, 'age'] = a['avgAge'].apply(lambda x: x)
data.loc[age_na_idx].head(5)

user 데이터 age 처리한거로 바꾸기..

In [None]:
def make_user_csv_df():
    ucpy = users_df.copy(deep=True)
    t = data[['user_id','age']]
    idx = t['age'].dropna().index
    t = t.loc[idx]
    t = t.groupby('user_id').aggregate({"age":np.max}).squeeze()
    ucpy['user_id2'] = ucpy['user_id'].copy()
    a = ucpy[['user_id', 'user_id2']].set_index('user_id2').squeeze()
    a = a.map(t).reset_index()
    a.columns = ['user_id','age']
    ucpy['age'] = a['age']
    ucpy.drop(columns = 'user_id2', inplace=True)
    return ucpy

ucpy = make_user_csv_df()

## Book

In [17]:
books_df = pd.read_csv("./data/books_word2vec.csv")
print(books_df['publisher'].nunique())
print(books_df['publisher'].isna().sum())

11571
0


In [12]:
import re
import tqdm
bcpy = books_df.copy(deep=True)
bcpy.loc[bcpy[bcpy['publisher'].notnull()].index, 'publisher'] = bcpy[bcpy['publisher'].notnull()]['publisher'].apply(lambda x: re.sub('[\W_]+',' ',x).strip())
bcpy['publisher'] = bcpy['publisher'].str.lower()
bcpy['publisher'] = bcpy['publisher'].apply(lambda x : re.sub('books?|publishing|publisher','',x).strip())
bcpy2 = bcpy.copy(deep=True)

# publisher의 리스트 만들기 위함
publisher = bcpy2.groupby("publisher").count()['isbn'].sort_values(ascending=False)
publisher = publisher[publisher.values>4]
publisher_list = []
for p in tqdm.tqdm(publisher.index):
    # p가 너무 다양한 publisher를 포함할 수 있는 단어들은 제외
    if p =='i' or p == 'pan' or p == 'roc' or p == 'que' \
               or p == 'ump' or p== 'asa' or p=='pol' or p=='rac': continue
    
    # a a 등 이상한 publisher들은 중간 공백 제거후 너무 짧은건 제외 (tv도 있었지만 그냥 제외)
    w = p.replace(" ",'')
    if len(w) < 3 : continue
    
    cont_p = bcpy2[bcpy2['publisher'].str.contains(p)]['publisher']
    idx = cont_p.index
    publisher_list.append(p)
    bcpy2.drop(idx, axis=0, inplace=True)
    
print('---finish---')

100%|██████████| 2507/2507 [00:42<00:00, 58.96it/s] 

---finish---





In [13]:
print("기존 publisher nunique : ", bcpy['publisher'].nunique())
print("수정후 : ", len(publisher_list))

기존 publisher nunique :  10938
수정후 :  2496


In [14]:
idx_list = []
# 속도 조금이라도 빠르게 하기 위해서 인덱스를 계속 삭제하기 위한 df
bcpy2 = bcpy.copy(deep=True)
for publisher in tqdm.tqdm(publisher_list):
    idx = bcpy2[bcpy2['publisher'].str.contains(publisher)].index
    idx_list.extend(idx)
    bcpy.loc[idx,'publisher'] = publisher
    bcpy2.drop(idx, axis=0, inplace=True)
    
idx = bcpy.drop(idx_list, axis=0).index
bcpy.loc[idx, 'publisher'] = 'etc'
print('---finish---')

100%|██████████| 2496/2496 [00:43<00:00, 57.90it/s] 


---finish---


In [15]:
bcpy['publisher'].value_counts()

etc                      8268
penguin                  4686
bantam                   4514
tor                      3462
harpercollins            3230
                         ... 
trafford                    5
rowman littlefield s        5
univ of new mexico pr       5
william c brown             5
masthof pr                  5
Name: publisher, Length: 1723, dtype: int64

In [170]:
b = pd.read_csv("./testdata/books_by_publisher.csv")
u = pd.read_csv("./testdata/users.csv")

b.columns

Index(['Unnamed: 0', 'isbn', 'book_author', 'year_of_publication', 'publisher',
       'category'],
      dtype='object')

In [None]:
path = "./testdata/"
ucpy.to_csv(path + 'users.csv', index=False)
bcpy.to_csv(path + 'books.csv', index=False)

NameError: name 'ucpy' is not defined

In [176]:
!python3 main.py --WANDB False --DATA_PATH "./testdata/" --MODEL FFM --EPOCHS 30 --LR 0.01

--------------- FFM Load Data ---------------
--------------- FFM Train/Valid Split ---------------
Traceback (most recent call last):
  File "main.py", line 191, in <module>
    main(args)
  File "main.py", line 55, in main
    data = context_data_loader(args, data)
  File "/opt/ml/input/code/src/data/context_data.py", line 152, in context_data_loader
    print(torch.LongTensor(data['X_train'].values))
TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.


In [None]:
!python3 main.py --DATA_PATH "./testdata/" --MODEL DeepCoNN --LR 0.01 --EPOCHS 50

In [None]:
!python3 main.py --DATA_PATH "./testdata/" --MODEL NCF --EPOCHS 30 --LR 0.01

In [None]:
!python3 main.py --DATA_PATH "./testdata/" --MODEL DCN

In [None]:
!python3 main.py --DATA_PATH "./testdata/" --MODEL DEEPFM

In [35]:
!python3 main.py -h

usage: main.py [-h] [--DATA_PATH DATA_PATH]
               [--MODEL {FM,FFM,NCF,WDN,DCN,CNN_FM,DeepCoNN,DEEPFM}]
               [--DATA_SHUFFLE DATA_SHUFFLE] [--TEST_SIZE TEST_SIZE]
               [--SEED SEED] [--WANDB WANDB] [--BATCH_SIZE BATCH_SIZE]
               [--EPOCHS EPOCHS] [--LR LR] [--WEIGHT_DECAY WEIGHT_DECAY]
               [--DEVICE {cuda,cpu}] [--FM_EMBED_DIM FM_EMBED_DIM]
               [--FFM_EMBED_DIM FFM_EMBED_DIM]
               [--DEEPFM_EMBED_DIM DEEPFM_EMBED_DIM]
               [--NCF_EMBED_DIM NCF_EMBED_DIM] [--NCF_MLP_DIMS NCF_MLP_DIMS]
               [--NCF_DROPOUT NCF_DROPOUT] [--WDN_EMBED_DIM WDN_EMBED_DIM]
               [--WDN_MLP_DIMS WDN_MLP_DIMS] [--WDN_DROPOUT WDN_DROPOUT]
               [--DCN_EMBED_DIM DCN_EMBED_DIM] [--DCN_MLP_DIMS DCN_MLP_DIMS]
               [--DCN_DROPOUT DCN_DROPOUT] [--DCN_NUM_LAYERS DCN_NUM_LAYERS]
               [--CNN_FM_EMBED_DIM CNN_FM_EMBED_DIM]
               [--CNN_FM_LATENT_DIM CNN_FM_LATENT_DIM]
               [--DE

In [None]:
!python3 ensemble.py --ENSEMBLE_FILES 20221029_134942_FM,20221029_135408_FFM,20221029_143714_DeepCoNN --ENSEMBLE_STRATEGY WEIGHTED --ENSEMBLE_WEIGHT 0.2,0.3,0.5

In [22]:
for i in $(lsof /dev/nvidia1 | grep python | awk '{print $2}' | sort -u); do kill -9 $i; done

SyntaxError: invalid syntax (1820996398.py, line 1)

In [19]:
import torch
torch.cuda.empty_cache()