## 더미 유저 만들기

In [1]:
pip install faker





[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
from faker import Faker
import random
from datetime import datetime, timedelta

In [8]:
# Use the Korean locale so Faker produces Korean-style names.
fake = Faker('ko_KR')

current_date = datetime.now()
data = []

# Age-group label -> youngest age in that group.
age_group_start = {
    '10대': 10,
    '20대': 20,
    '30대': 30,
    '40대': 40,
    '50대': 50,
    '60대': 60,
}
# Sampling weight of each age group, in the same order as the labels above
# (adjust to fit the scenario).
age_group_weights = [0.20, 0.30, 0.20, 0.20, 0.05, 0.05]

for user_number in range(1, 1001):  # number of dummy users to generate
    user_id = str(user_number)
    username = fake.name()

    # Login id: five random letters followed by the zero-padded user number.
    id_prefix = fake.lexify('?????')
    user_id_suffix = user_id.zfill(5)
    user_id_combined = id_prefix + user_id_suffix

    # Password: a random four-digit number stored as a string.
    password = str(random.randint(1000, 9999))

    # Pick an age group by weight, then a birthdate inside that group.
    age_distribution = random.choices(list(age_group_start),
                                      weights=age_group_weights)[0]
    youngest = age_group_start[age_distribution]
    # The upper bound stops one day short of the next decade so that every age
    # in the group (e.g. 10 through 19) gets the same number of candidate days
    # once the day count is floor-divided by 365 below. Stopping at
    # (youngest + 9) * 365 would leave the oldest age with a single day.
    days_old = random.randint(youngest * 365, (youngest + 10) * 365 - 1)
    birthdate = current_date - timedelta(days=days_old)

    # Age in whole 365-day years (leap days are ignored).
    age = days_old // 365

    gender = random.choice(['남', '여'])

    # liked_movie and movie_score start out as NaN and are filled in later.
    liked_movie = movie_score = float('nan')

    user_data = {
        'user_id': user_id,
        'username': username,
        'id': user_id_combined,
        'password': password,
        'birthdate': birthdate.strftime('%Y-%m-%d'),
        'age': age,
        'gender': gender,
        'liked_movie': liked_movie,
        'movie_score': movie_score
    }
    data.append(user_data)

df = pd.DataFrame(data)

# An all-NaN column is inferred as float64; liked_movie will hold title
# strings, so store it as object dtype from the start (values stay NaN).
df['liked_movie'] = df['liked_movie'].astype(object)

df.sample(10)

Unnamed: 0,user_id,username,id,password,birthdate,age,gender,liked_movie,movie_score
612,613,황지후,VDrFf00613,2169,2005-11-04,18,여,,
336,337,주주원,cFsYe00337,5580,1980-07-04,43,여,,
723,724,송성현,teKHS00724,3984,1987-05-29,36,여,,
220,221,문윤서,duVir00221,3521,1977-05-07,46,여,,
318,319,이건우,FZmQv00319,8205,2006-10-04,17,여,,
966,967,최우진,NETaX00967,9647,1990-10-07,33,남,,
502,503,김지연,zJzEL00503,1214,1986-08-13,37,남,,
720,721,이경자,MJlye00721,6688,2003-11-18,20,남,,
39,40,홍중수,XdUhM00040,9976,1982-08-21,41,여,,
972,973,김은영,oqfaM00973,7065,1997-08-26,26,여,,


In [9]:
# Number of users in their teens (ages 10 through 19, both ends included)
is_teen = df['age'].between(10, 19)
df[is_teen].count()

user_id        224
username       224
id             224
password       224
birthdate      224
age            224
gender         224
liked_movie      0
movie_score      0
dtype: int64

## 유저에게 랜덤하게 좋아하는 영화, 영화 평점 부여하기

#### 영화 정보 데이터
 - https://grouplens.org/datasets/movielens/latest/

In [10]:
# Load the MovieLens CSV files; the relative paths are resolved against the
# current working directory.
df_mv = pd.read_csv('movies.csv') # movie information
df_rt = pd.read_csv('ratings.csv') # movie ratings

In [19]:
# Print the column names, non-null counts and dtypes of the movie table.
df_mv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [20]:
# Print the column names, non-null counts and dtypes of the ratings table.
df_rt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [21]:
# Compute the average rating of every movie.

# Attach movie information to each rating row, matching on 'movieId'.
merged_data = df_rt.merge(df_mv, on='movieId', how='inner')

# Mean rating per title, highest-rated first.
avg_ratings = (
    merged_data.groupby('title')['rating']
    .mean()
    .reset_index()
    .sort_values(by='rating', ascending=False)
)
avg_ratings

Unnamed: 0,title,rating
3354,Gena the Crocodile (1969),5.0
8951,True Stories (1986),5.0
1986,Cosmic Scrat-tastrophe (2015),5.0
5269,Love and Pigeons (1985),5.0
7033,Red Sorghum (Hong gao liang) (1987),5.0
...,...,...
2482,Don't Look Now (1973),0.5
4617,Journey 2: The Mysterious Island (2012),0.5
4581,Joe Dirt 2: Beautiful Loser (2015),0.5
4556,Jesus Christ Vampire Hunter (2001),0.5


In [22]:
# Find movies that have no rating at all: a left join keeps every movie, and
# movies without a matching rating row end up with a null 'rating'.
df_nrt = df_mv.merge(df_rt, on='movieId', how='left')
has_no_rating = df_nrt['rating'].isna()
df_nrt[has_no_rating].info()

<class 'pandas.core.frame.DataFrame'>
Index: 18 entries, 22820 to 92639
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   movieId    18 non-null     int64  
 1   title      18 non-null     object 
 2   genres     18 non-null     object 
 3   userId     0 non-null      float64
 4   rating     0 non-null      float64
 5   timestamp  0 non-null      float64
dtypes: float64(3), int64(1), object(2)
memory usage: 1008.0+ bytes


## 더미 유저가 좋아하는 영화 랜덤으로 추가하기

In [27]:
# Keep only movies whose average rating is 4.0 or higher.
liked_movie = avg_ratings[avg_ratings['rating'] >= 4.0]

# Give every dummy user one randomly chosen movie. A single draw with
# replacement yields one independent uniform pick per user, like sampling once
# per row, but without a Python-level loop over the DataFrame.
random_movies = liked_movie.sample(n=len(df), replace=True)

# Assign whole columns positionally (to_numpy() drops the sampled index so no
# index alignment happens). Replacing the column also avoids writing strings
# cell by cell into a placeholder column that was created as float NaN.
df['liked_movie'] = random_movies['title'].to_numpy()
df['movie_score'] = random_movies['rating'].to_numpy()

df

Unnamed: 0,user_id,username,id,password,birthdate,age,gender,liked_movie,movie_score
0,1,오영진,njKdk00001,6849,1997-03-25,26,여,All the Vermeers in New York (1990),5.000000
1,2,장경숙,VlZBl00002,7537,2012-11-21,11,여,12 Angry Men (1997),5.000000
2,3,김영순,bOiaP00003,2073,1999-11-15,24,여,"Killer, The (Die xue shuang xiong) (1989)",4.000000
3,4,차은정,UOMxA00004,6157,1977-04-06,46,여,Killing Me Softly (2002),4.000000
4,5,이시우,cexha00005,6782,1989-02-03,35,여,Nebraska (2013),4.125000
...,...,...,...,...,...,...,...,...,...
995,996,양준호,QziRk00996,6413,2008-03-03,15,남,Bliss (2012),4.500000
996,997,황예지,ukNfx00997,1424,1957-02-04,67,남,Everest (1998),4.250000
997,998,정미영,CuGGZ00998,3503,1972-04-11,51,남,Why Man Creates (1968),4.000000
998,999,손광수,rozIJ00999,6497,2001-02-19,23,남,Frank (2014),4.333333
