In [5]:
# Download the MovieLens 10M archive and extract it.
# `wget -nc` skips the download when the zip already exists and `-P` sets the
# target directory; `unzip -n` never overwrites files that were already extracted,
# so this cell is safe to re-run.
# NOTE(review): `--no-check-certificate` disables TLS certificate validation for
# the download — confirm it is still required.
!wget -nc --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-10m.zip -P ../data_download/
!unzip -n ../data_download/ml-10m.zip -d ../data_download/

File ‘../data_download/ml-10m.zip’ already there; not retrieving.

Archive:  ../data_download/ml-10m.zip


In [6]:
import itertools

import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error

In [7]:
# Load the movie metadata: movie ID, title and the '|'-separated genre string.
# The multi-character '::' separator requires pandas' python parsing engine.
m_cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(
    '../data_download/ml-10M100K/movies.dat',
    sep='::',
    names=m_cols,
    encoding='latin-1',
    engine='python',
)

movies

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
10676,65088,Bedtime Stories (2008),Adventure|Children|Comedy
10677,65091,Manhattan Melodrama (1934),Crime|Drama|Romance
10678,65126,Choke (2008),Comedy|Drama
10679,65130,Revolutionary Road (2008),Drama|Romance


In [4]:
# Load the tags that users attached to movies.
# The multi-character '::' separator requires pandas' python parsing engine.
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
user_tagged_movies = pd.read_csv(
    '../data_download/ml-10M100K/tags.dat',
    sep='::',
    names=t_cols,
    engine='python',
)

user_tagged_movies

Unnamed: 0,user_id,movie_id,tag,timestamp
0,15,4973,excellent!,1215184630
1,20,1747,politics,1188263867
2,20,1747,satire,1188263867
3,20,2424,chick flick 212,1188263835
4,20,2424,hanks,1188263835
...,...,...,...,...
95575,71556,1377,Gothic,1188263571
95576,71556,2424,chick flick,1188263606
95577,71556,3033,comedy,1188263626
95578,71556,3081,Gothic,1188263565


In [5]:
# Load the rating records.
# The multi-character '::' separator requires pandas' python parsing engine.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    '../data_download/ml-10M100K/ratings.dat',
    sep='::',
    names=r_cols,
    engine='python',
)

# The full data set is large, so keep only the 1000 users with the smallest IDs.
valid_user_ids = sorted(ratings['user_id'].unique())[:1000]
keep_row = ratings['user_id'].isin(valid_user_ids)
ratings = ratings[keep_row]

ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392
...,...,...,...,...
132825,1053,33794,5.0,1134008301
132826,1053,34162,5.0,1134007983
132827,1053,34319,3.5,1134007773
132828,1053,35836,5.0,1134008021


In [5]:
# Write the demo CSV files into ../data (row index omitted).
demo_exports = {
    '../data/movies_demo.csv': movies,
    '../data/tags_demo.csv': user_tagged_movies,
    '../data/ratings_demo.csv': ratings,
}
for csv_path, frame in demo_exports.items():
    frame.to_csv(csv_path, index=False)

In [6]:
# Turn the '|'-separated genre string into a list of genre labels.
movies['genre'] = movies['genre'].map(lambda genre_field: genre_field.split('|'))

# Normalise tags to lower case.
user_tagged_movies['tag'] = user_tagged_movies['tag'].str.lower()

# Collect, for every movie, the list of all tags users attached to it.
movie_tags = user_tagged_movies.groupby('movie_id')['tag'].agg(list).to_frame()

# Attach the tag lists to the movie table (movies without tags keep NaN).
movies = movies.merge(movie_tags, on='movie_id', how='left')

# Join the ratings with the movie information.
movielens = ratings.merge(movies, on='movie_id')

n_users = movielens['user_id'].nunique()
n_movies = movielens['movie_id'].nunique()
print(f'unique_users={n_users}, unique_movies={n_movies}')

# Split into training and evaluation data:
# each user's 5 most recent ratings are held out for evaluation, the rest is
# used for training. First rank every user's ratings by recency, starting at 1
# for the newest; method='first' breaks timestamp ties by row order.
recency_rank = movielens.groupby('user_id')['timestamp'].rank(ascending=False, method='first')
movielens['timestamp_rank'] = recency_rank.astype(int)

is_test_row = movielens['timestamp_rank'] <= 5
sort_keys = ['user_id', 'timestamp_rank']
movielens_train = movielens[~is_test_row].sort_values(sort_keys).reset_index(drop=True)
movielens_test = movielens[is_test_row].sort_values(sort_keys).reset_index(drop=True)

unique_users=1000, unique_movies=6736


In [7]:
# Display the training split (rows whose per-user timestamp_rank is greater than 5).
movielens_train

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
0,1,594,5.0,838984679,Snow White and the Seven Dwarfs (1937),"[Animation, Children, Drama, Fantasy, Musical]","[disney, classic, disney, national film regist...",6
1,1,370,5.0,838984596,Naked Gun 33 1/3: The Final Insult (1994),"[Action, Comedy]","[police, arma nua, comedy, slapstick crap, com...",7
2,1,355,5.0,838984474,"Flintstones, The (1994)","[Children, Comedy, Fantasy]","[animation remade as live action, based on a t...",8
3,1,539,5.0,838984068,Sleepless in Seattle (1993),"[Comedy, Drama, Romance]","[girlie movie, meg ryan, tom hanks, empire sta...",9
4,1,586,5.0,838984068,Home Alone (1990),"[Children, Comedy]","[family, christmas, to see, comedy, crime, lat...",10
...,...,...,...,...,...,...,...,...
127825,1053,2194,4.0,1134007197,"Untouchables, The (1987)","[Action, Crime, Drama]","[gfei own it, gangster, seen more than once, c...",142
127826,1053,2302,4.5,1134007192,My Cousin Vinny (1992),[Comedy],"[entertaining, court, courtroom, funny, friday...",143
127827,1053,4886,5.0,1134007190,"Monsters, Inc. (2001)","[Animation, Children, Comedy, Fantasy]","[good cartoon children, pixar, pixar, dvd, pix...",144
127828,1053,3147,3.0,1134007182,"Green Mile, The (1999)","[Crime, Drama, Fantasy]","[prison, stephen king, stephen king, interesti...",145


In [8]:

# Display the evaluation split (each user's 5 most recent ratings).
movielens_test

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
0,1,122,5.0,838985046,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem...",1
1,1,616,5.0,838984941,"Aristocats, The (1970)","[Animation, Children]","[disney, disney, disney animated feature, fran...",2
2,1,362,5.0,838984885,"Jungle Book, The (1994)","[Adventure, Children, Romance]","[5, animated classic, bad remake, adapted from...",3
3,1,466,5.0,838984679,Hot Shots! Part Deux (1993),"[Action, Comedy, War]","[charlie sheen, comedy, parody, zaz, charlie s...",4
4,1,520,5.0,838984679,Robin Hood: Men in Tights (1993),[Comedy],"[parody, can't remember, very funny!, mel broo...",5
...,...,...,...,...,...,...,...,...
4995,1053,1242,5.0,1134008464,Glory (1989),"[Action, Drama, War]","[action, drama, war, c, historical lackluster,...",1
4996,1053,2501,5.0,1134008462,October Sky (1999),[Drama],"[liz should see, space program, true story, ae...",2
4997,1053,457,4.5,1134008458,"Fugitive, The (1993)",[Thriller],"[tommy lee jones, chase, tv series, excellent ...",3
4998,1053,2028,5.0,1134008444,Saving Private Ryan (1998),"[Action, Drama, War]","[world war ii, speilberg, steven spielberg, gf...",4


In [9]:
key_columns = ["user_id", "movie_id"]

# (user, movie) pairs of the training split, and their ratings as the training target.
train_keys = movielens_train[key_columns]
train_y = movielens_train["rating"]

# (user, movie) pairs whose rating should be predicted, and the true ratings
# of the evaluation split used to score those predictions.
test_keys = movielens_test[key_columns]
test_y = movielens_test["rating"]

In [10]:
# Display the (user_id, movie_id) pairs of the training split.
train_keys

Unnamed: 0,user_id,movie_id
0,1,594
1,1,370
2,1,355
3,1,539
4,1,586
...,...,...
127825,1053,2194
127826,1053,2302
127827,1053,4886
127828,1053,3147


In [11]:
# Display the (user_id, movie_id) pairs of the evaluation split.
test_keys

Unnamed: 0,user_id,movie_id
0,1,122
1,1,616
2,1,362
3,1,466
4,1,520
...,...,...
4995,1053,1242
4996,1053,2501
4997,1053,457
4998,1053,2028


In [12]:
# Start the feature frames as independent copies of the key frames, so adding
# feature columns does not touch train_keys / test_keys.
train_x, test_x = train_keys.copy(), test_keys.copy()

In [13]:
# Display the training feature frame.
train_x

Unnamed: 0,user_id,movie_id
0,1,594
1,1,370
2,1,355
3,1,539
4,1,586
...,...,...
127825,1053,2194
127826,1053,2302
127827,1053,4886
127828,1053,3147


In [14]:
# Add the min / max / mean rating of every user and of every movie as features.
# The statistics are computed on the training split only.
aggregators = ["min", "max", "mean"]
ratings_by_user = movielens_train.groupby("user_id")["rating"]
ratings_by_movie = movielens_train.groupby("movie_id")["rating"]
user_features = ratings_by_user.agg(aggregators).to_dict()
movie_features = ratings_by_movie.agg(aggregators).to_dict()

# (column prefix, key column, {statistic: {key: value}}) for each feature family.
feature_sources = (
    ("u", "user_id", user_features),
    ("m", "movie_id", movie_features),
)
for agg in aggregators:
    for prefix, key_column, lookup in feature_sources:
        feature_name = f"{prefix}_{agg}"
        train_x[feature_name] = train_x[key_column].map(lookup[agg])
        test_x[feature_name] = test_x[key_column].map(lookup[agg])

# Users or movies that occur only in the evaluation split have no statistics;
# fill those gaps with the overall mean rating of the training split.
average_rating = train_y.mean()
test_x = test_x.fillna(average_rating)

In [15]:
# Display the training features.
train_x

Unnamed: 0,user_id,movie_id,u_min,m_min,u_max,m_max,u_mean,m_mean
0,1,594,5.0,0.5,5.0,5.0,5.000000,3.604839
1,1,370,5.0,0.5,5.0,5.0,5.000000,3.009091
2,1,355,5.0,0.5,5.0,5.0,5.000000,2.534483
3,1,539,5.0,0.5,5.0,5.0,5.000000,3.610169
4,1,586,5.0,0.5,5.0,5.0,5.000000,3.069820
...,...,...,...,...,...,...,...,...
127825,1053,2194,0.5,0.5,5.0,5.0,4.117021,3.851562
127826,1053,2302,0.5,0.5,5.0,5.0,4.117021,3.588785
127827,1053,4886,0.5,1.0,5.0,5.0,4.117021,3.899225
127828,1053,3147,0.5,2.0,5.0,5.0,4.117021,4.088496


In [16]:
# Display the evaluation features.
test_x

Unnamed: 0,user_id,movie_id,u_min,m_min,u_max,m_max,u_mean,m_mean
0,1,122,5.0,1.0,5.0,4.5,5.000000,2.852941
1,1,616,5.0,2.0,5.0,5.0,5.000000,3.394737
2,1,362,5.0,0.5,5.0,5.0,5.000000,3.650000
3,1,466,5.0,0.5,5.0,5.0,5.000000,2.824074
4,1,520,5.0,0.5,5.0,5.0,5.000000,3.058511
...,...,...,...,...,...,...,...,...
4995,1053,1242,0.5,1.0,5.0,5.0,4.117021,4.120192
4996,1053,2501,0.5,2.0,5.0,5.0,4.117021,4.051282
4997,1053,457,0.5,0.5,5.0,5.0,4.117021,3.997389
4998,1053,2028,0.5,1.5,5.0,5.0,4.117021,4.171986


In [17]:
# Add one boolean feature per genre that tells whether the movie has that genre.
# Take an explicit copy: adding columns to a column slice of `movies` is what
# makes pandas emit SettingWithCopyWarning.
movie_genres = movies[["movie_id", "genre"]].copy()

# Every distinct genre label, sorted so the feature column order is deterministic.
genres = sorted(set(itertools.chain.from_iterable(movie_genres["genre"])))

for genre in genres:
    movie_genres[f"is_{genre}"] = movie_genres["genre"].apply(lambda labels: genre in labels)
movie_genres = movie_genres.drop(columns="genre")

train_x = train_x.merge(movie_genres, on="movie_id")
test_x = test_x.merge(movie_genres, on="movie_id")

# The feature rows are matched to the targets by position, so the joins must
# neither drop nor duplicate rows.
assert len(train_x) == len(train_y), "genre merge changed the number of training rows"
assert len(test_x) == len(test_y), "genre merge changed the number of evaluation rows"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_genres[f"is_{genre}"] = movie_genres.genre.apply(lambda x: genre in x)


In [18]:
# Display the sorted list of genre labels.
genres

['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [19]:
# Display the training features.
train_x

Unnamed: 0,user_id,movie_id,u_min,m_min,u_max,m_max,u_mean,m_mean,is_(no genres listed),is_Action,...,is_Film-Noir,is_Horror,is_IMAX,is_Musical,is_Mystery,is_Romance,is_Sci-Fi,is_Thriller,is_War,is_Western
0,1,594,5.0,0.5,5.0,5.0,5.000000,3.604839,False,False,...,False,False,False,True,False,False,False,False,False,False
1,1,370,5.0,0.5,5.0,5.0,5.000000,3.009091,False,True,...,False,False,False,False,False,False,False,False,False,False
2,1,355,5.0,0.5,5.0,5.0,5.000000,2.534483,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1,539,5.0,0.5,5.0,5.0,5.000000,3.610169,False,False,...,False,False,False,False,False,True,False,False,False,False
4,1,586,5.0,0.5,5.0,5.0,5.000000,3.069820,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127825,1053,2194,0.5,0.5,5.0,5.0,4.117021,3.851562,False,True,...,False,False,False,False,False,False,False,False,False,False
127826,1053,2302,0.5,0.5,5.0,5.0,4.117021,3.588785,False,False,...,False,False,False,False,False,False,False,False,False,False
127827,1053,4886,0.5,1.0,5.0,5.0,4.117021,3.899225,False,False,...,False,False,False,False,False,False,False,False,False,False
127828,1053,3147,0.5,2.0,5.0,5.0,4.117021,4.088496,False,False,...,False,False,False,False,False,False,False,False,False,False


In [20]:
# Remove the identifier columns, which are not used as model features.
id_columns = ["user_id", "movie_id"]
train_x = train_x.drop(columns=id_columns)
test_x = test_x.drop(columns=id_columns)

In [21]:
# Display the training feature matrix.
train_x

Unnamed: 0,u_min,m_min,u_max,m_max,u_mean,m_mean,is_(no genres listed),is_Action,is_Adventure,is_Animation,...,is_Film-Noir,is_Horror,is_IMAX,is_Musical,is_Mystery,is_Romance,is_Sci-Fi,is_Thriller,is_War,is_Western
0,5.0,0.5,5.0,5.0,5.000000,3.604839,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
1,5.0,0.5,5.0,5.0,5.000000,3.009091,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,5.0,0.5,5.0,5.0,5.000000,2.534483,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,5.0,0.5,5.0,5.0,5.000000,3.610169,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
4,5.0,0.5,5.0,5.0,5.000000,3.069820,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127825,0.5,0.5,5.0,5.0,4.117021,3.851562,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
127826,0.5,0.5,5.0,5.0,4.117021,3.588785,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
127827,0.5,1.0,5.0,5.0,4.117021,3.899225,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
127828,0.5,2.0,5.0,5.0,4.117021,4.088496,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [22]:
# Display the training targets (ratings of the training split).
train_y

0         5.0
1         5.0
2         5.0
3         5.0
4         5.0
         ... 
127825    4.0
127826    4.5
127827    5.0
127828    3.0
127829    0.5
Name: rating, Length: 127830, dtype: float64

In [23]:
# Display the evaluation feature matrix.
test_x

Unnamed: 0,u_min,m_min,u_max,m_max,u_mean,m_mean,is_(no genres listed),is_Action,is_Adventure,is_Animation,...,is_Film-Noir,is_Horror,is_IMAX,is_Musical,is_Mystery,is_Romance,is_Sci-Fi,is_Thriller,is_War,is_Western
0,5.0,1.0,5.0,4.5,5.000000,2.852941,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1,5.0,2.0,5.0,5.0,5.000000,3.394737,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2,5.0,0.5,5.0,5.0,5.000000,3.650000,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False
3,5.0,0.5,5.0,5.0,5.000000,2.824074,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False
4,5.0,0.5,5.0,5.0,5.000000,3.058511,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.5,1.0,5.0,5.0,4.117021,4.120192,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False
4996,0.5,2.0,5.0,5.0,4.117021,4.051282,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4997,0.5,0.5,5.0,5.0,4.117021,3.997389,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4998,0.5,1.5,5.0,5.0,4.117021,4.171986,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False


In [24]:
# Display the evaluation targets (ratings of the evaluation split).
test_y

0       5.0
1       5.0
2       5.0
3       5.0
4       5.0
       ... 
4995    5.0
4996    5.0
4997    4.5
4998    5.0
4999    4.5
Name: rating, Length: 5000, dtype: float64

In [25]:
# Wrap the feature matrices and targets in LightGBM datasets.
lgb_train = lgb.Dataset(train_x, train_y)
# The validation dataset is tied to the training dataset through `reference`.
lgb_eval = lgb.Dataset(test_x, test_y, reference=lgb_train)

# Hyperparameter settings.
# The boosting-round limit is intentionally NOT part of this dict: LightGBM
# lets a `num_boost_round` entry in params (an alias of `num_iterations`)
# override the `num_boost_round` argument of `lgb.train`, so listing it here
# would silently replace the value given at the training call.
lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'rmse',
        'metric': 'rmse',
        'learning_rate': 0.03,
        'num_leaves': 31,
        'min_data_in_leaf': 20,
        'feature_fraction': 1.0,
        'max_bin': 255,
        'random_state': 42,
        'verbose': -1,
    }

In [26]:
# モデルの学習
model = lgb.train(lgb_params,
                  lgb_train,
                  num_boost_round=10000,
                  valid_sets=[lgb_train, lgb_eval],
                  valid_names=['train', 'valid'],
                  callbacks=[lgb.early_stopping(10),
                             lgb.log_evaluation(10),
                             ])



Training until validation scores don't improve for 10 rounds
[10]	train's rmse: 0.961778	valid's rmse: 1.00979
[20]	train's rmse: 0.907821	valid's rmse: 0.974516
[30]	train's rmse: 0.875909	valid's rmse: 0.956718
[40]	train's rmse: 0.857063	valid's rmse: 0.94846
[50]	train's rmse: 0.845851	valid's rmse: 0.94529
[60]	train's rmse: 0.839025	valid's rmse: 0.944347
Early stopping, best iteration is:
[59]	train's rmse: 0.839564	valid's rmse: 0.944331


In [27]:
# Predict ratings for the evaluation rows with the best iteration found by
# early stopping, then report the root mean squared error.
best_round = model.best_iteration
y_valid_pred = model.predict(test_x, num_iteration=best_round)
mse = mean_squared_error(test_y, y_valid_pred)
score = mse ** 0.5
print(f'RMSE valid: {score:.6f}')

RMSE valid: 0.944331


In [28]:
# Display the predicted ratings for the evaluation rows.
y_valid_pred

array([3.99675348, 4.18541456, 4.41173519, ..., 4.19293879, 4.25814938,
       4.05252798])