In [None]:
import numpy as np
import pandas as pd
import os

from tqdm import tqdm
#import torch
#import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')
from collections import Counter

# Working directory of the kernel at the time this cell runs.
cwd = os.getcwd()

In [None]:
# Configuration shared by ``my_model`` and its private helpers.
_DATA_FOLDER = 'UX선행기획팀'          # folder that holds the CSV files
_ID_COLUMN = 'A게임 유저ID'            # id column written to the returned frame
_DAYS_IN_MONTH = 31                    # on_count threshold for "played every day"
# Upper bound for a predicted monthly play time (presumably minutes -- TODO confirm unit).
_MAX_MONTHLY_PLAYTIME = _DAYS_IN_MONTH * 24 * 60
_CONV_KERNEL_SIZE = 8
_MAX_EPOCHS = 1000
_EARLY_STOP_PATIENCE = 100
_SMOTE_SEED = 42
_SMOTE_NEIGHBORS = 7


def _resolve_data_dir(cwd):
    """Return the directory holding the CSV files.

    ``cwd`` may be the data folder itself or the directory that contains it.
    """
    nested = os.path.join(cwd, _DATA_FOLDER)
    return nested if os.path.isdir(nested) else cwd


def _sorted_by_date(game):
    """Return a copy of a game log with ``sdate`` parsed as datetime and sorted by it."""
    return game.assign(sdate=pd.to_datetime(game['sdate'])).sort_values(by='sdate', ascending=True)


def _per_user_sum(game, column, users):
    """Sum ``column`` per ``id`` and align to ``users``; users without rows get NaN."""
    return game.groupby('id')[column].sum().reindex(users).to_numpy(dtype=float)


def _build_user_features(game_A, game_B, game_C, user_A, user_AnB, user_AnC):
    """Aggregate the raw logs into one feature row per game-A user.

    Column order is kept stable because the 1-D convolutions used later slide
    over neighbouring feature columns.
    """
    value_cols = [c for c in game_A.columns if c not in ('id', 'sdate')]
    feats = pd.DataFrame(index=user_A)

    # Number of log rows (with a valid date) per user.
    feats['on_count'] = (
        game_A.groupby('id')['sdate'].count().reindex(user_A).to_numpy(dtype=float)
    )

    # Cumulative value of every game-A measure; a missing sum counts as 0.
    for col in value_cols:
        sums = _per_user_sum(game_A, col, user_A)
        feats['cum_' + col] = np.where(np.isnan(sums), 0.0, sums)

    # Weekday (Mon-Fri) vs. weekend cumulative play time in game A.
    weekday_no = game_A['sdate'].dt.weekday
    feats['weekday_cum_playtime'] = _per_user_sum(game_A[weekday_no < 5], 'totalplaytime', user_A)
    feats['weekend_cum_playtime'] = _per_user_sum(game_A[weekday_no >= 5], 'totalplaytime', user_A)

    # Cumulative play time in games B and C (NaN for users who never played them).
    feats['total_cum_playtime_B'] = _per_user_sum(game_B, 'totalplaytime', user_A)
    feats['total_cum_playtime_C'] = _per_user_sum(game_C, 'totalplaytime', user_A)

    feats['is_allday_player'] = (feats['on_count'] >= _DAYS_IN_MONTH).astype(float)

    # Per-active-day averages.
    for col in value_cols:
        feats['avg_' + col] = feats['cum_' + col] / feats['on_count']
    feats['weekday_day_playtime'] = feats['weekday_cum_playtime'] / feats['on_count']
    feats['weekend_day_playtime'] = feats['weekend_cum_playtime'] / feats['on_count']

    # Label 1 = cumulative play time in the top 25% (>= 75th percentile) of the
    # users who played both games, else 0. The label columns are written as
    # whole columns, so no stale 'passionate_*' column has to be dropped first.
    for game, members in (('B', user_AnB), ('C', user_AnC)):
        total = feats['total_cum_playtime_' + game]
        threshold = total.loc[members].quantile(.75)
        feats['passionate_' + game] = (total >= threshold).astype(float)

    return feats.fillna(0)


def _add_play_style_flags(A):
    """Return a copy of ``A`` with four 0/1 play-style indicator columns added."""
    def q(col, p):
        return A[col].quantile(p)

    # 1. Bored at game A: logs in often, but low level and low (non-zero) quest activity.
    bored = (
        (A['on_count'] > q('on_count', .75))
        & (A['avg_max_level'] < q('avg_max_level', .50))
        & (A['avg_quest_count'] < q('avg_quest_count', .5))
        & (A['avg_quest_count'] > 0.0)
    )
    # 2. Hardcore player: high level but also a high death count.
    hardcore = (
        (A['avg_max_level'] > q('avg_max_level', .90))
        & (A['avg_dead_count'] >= q('avg_dead_count', .95))
    )
    # 3. Story player: many NPC interactions and many quests.
    story = (
        (A['avg_npc_count'] >= q('avg_npc_count', .90))
        & (A['avg_quest_count'] >= q('avg_quest_count', .90))
    )
    # 4. Game-money spender: very high game-money use count.
    spender = A['avg_gamemoneyuse_count'] >= q('avg_gamemoneyuse_count', .95)

    flagged = A.copy()
    flagged['bored_at_A'] = bored.astype(float)
    flagged['hardcore_player'] = hardcore.astype(float)
    flagged['story_player'] = story.astype(float)
    flagged['gamemoney_spender'] = spender.astype(float)
    return flagged


def _split_cohort(frame, members):
    """Split ``frame`` into (rows whose index is in ``members``, remaining rows)."""
    in_cohort = frame.index.isin(members)
    return frame.loc[in_cohort].copy(), frame.loc[~in_cohort].copy()


def _scale_features(train, test, y_col):
    """Standardise the features; the scaler is fitted on the training rows only.

    Returns ``(train_x, train_y, test_x)``. Re-fitting the scaler on the test
    rows would put train and test on different scales, so ``transform`` is used.
    """
    scaler = StandardScaler()
    train_x = scaler.fit_transform(train.drop(columns=[y_col]))
    test_x = scaler.transform(test.drop(columns=[y_col]))
    return train_x, train[y_col].copy(), test_x


def _as_sequences(x):
    """Reshape a (rows, features) matrix to (rows, features, 1) for Conv1D."""
    x = np.asarray(x)
    return x.reshape(x.shape[0], x.shape[1], 1)


def _build_cnn(n_features, conv_filters, n_conv_layers, optimizer):
    """Build and compile the Conv1D -> Dense regression network (MSE loss)."""
    from tensorflow.keras.layers import Activation, Conv1D, Dense, Dropout, Flatten
    from tensorflow.keras.models import Sequential

    model = Sequential()
    for layer_no in range(n_conv_layers):
        if layer_no == 0:
            model.add(Conv1D(conv_filters, _CONV_KERNEL_SIZE, input_shape=(n_features, 1)))
        else:
            model.add(Conv1D(conv_filters, _CONV_KERNEL_SIZE))
        model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(2048, activation='relu'))
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer=optimizer, loss='mse')
    return model


def _fit_with_early_stopping(model, x, y, validation_split, batch_size):
    """Train ``model`` with early stopping on the validation loss.

    Keras takes the validation rows from the *end* of the arrays before any
    shuffling. SMOTE appends its synthetic minority samples at the end, so the
    rows are shuffled (fixed seed) first to get a representative validation set.
    """
    from tensorflow.keras.callbacks import EarlyStopping

    x = np.asarray(x)
    y = np.asarray(y)
    order = np.random.RandomState(0).permutation(len(x))
    early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=2,
                               patience=_EARLY_STOP_PATIENCE)
    return model.fit(x=x[order],
                     y=y[order],
                     validation_split=validation_split,
                     batch_size=batch_size,
                     epochs=_MAX_EPOCHS,
                     callbacks=[early_stop])


def my_model(cwd):
    """Predict, for every game-A user, play time and "passionate" score in games B and C.

    Parameters
    ----------
    cwd : str
        Either the data folder itself or the directory containing it. The
        folder must hold ``output_sample.csv``, ``game_A.csv``, ``game_B.csv``
        and ``game_C.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per game-A user. The first column is the user id; the other
        columns take their names from ``output_sample.csv`` and hold, in order,
        predicted play time in B, passionate score for B, predicted play time
        in C and passionate score for C.

    Notes
    -----
    Pipeline: (1) aggregate logs into per-user features, (2) random forest
    fills the ``passionate_*`` label for users without B/C history, (3) a CNN
    produces a 0-1 ``prob_passionate_*`` score, (4) gradient boosting (B) and a
    CNN (C) regress the cumulative play time.
    """
    # Heavy dependencies are imported here so that importing this cell stays cheap.
    from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
    from tensorflow.keras.optimizers import Adam, SGD

    ### Data read
    data_dir = _resolve_data_dir(cwd)
    output = pd.read_csv(os.path.join(data_dir, 'output_sample.csv'))
    game_A = _sorted_by_date(pd.read_csv(os.path.join(data_dir, 'game_A.csv')))
    game_B = _sorted_by_date(pd.read_csv(os.path.join(data_dir, 'game_B.csv')))
    game_C = _sorted_by_date(pd.read_csv(os.path.join(data_dir, 'game_C.csv')))

    # Ordered id collections (not sets): sets are rejected as ``.loc`` indexers
    # by pandas >= 2.0 and make row order non-deterministic.
    user_A = pd.Index(game_A['id'].unique())
    user_AnB = user_A[user_A.isin(game_B['id'].unique())]
    user_AnC = user_A[user_A.isin(game_C['id'].unique())]
    cohorts = (('B', user_AnB), ('C', user_AnC))

    ### Feature engineering
    A = _build_user_features(game_A, game_B, game_C, user_A, user_AnB, user_AnC)
    A = _add_play_style_flags(A)

    ### Stage 1: random forest labels users that have no history in game B / C
    for game, members in cohorts:
        y_col = 'passionate_' + game
        train, test = _split_cohort(A.drop(columns=['total_cum_playtime_' + game]), members)
        train_x, train_y, test_x = _scale_features(train, test, y_col)

        # Oversample the minority class.
        smote = SMOTE(random_state=_SMOTE_SEED, k_neighbors=_SMOTE_NEIGHBORS)
        train_x, train_y = smote.fit_resample(train_x, train_y)

        rf = RandomForestClassifier(random_state=0)
        rf.fit(train_x, train_y)
        A.loc[test.index, y_col] = rf.predict(test_x)

    ### Stage 2: CNN score in [0, 1] for being a passionate player
    for game, members in cohorts:
        y_col = 'passionate_' + game
        train, test = _split_cohort(A.drop(columns=['total_cum_playtime_' + game]), members)
        train_x, train_y, test_x = _scale_features(train, test, y_col)

        # Train on the oversampled data ...
        smote = SMOTE(random_state=_SMOTE_SEED, k_neighbors=_SMOTE_NEIGHBORS)
        smote_x, smote_y = smote.fit_resample(train_x, train_y)

        dnn = _build_cnn(train_x.shape[1], conv_filters=128, n_conv_layers=1,
                         optimizer=SGD(0.01))
        history = _fit_with_early_stopping(dnn, _as_sequences(smote_x), smote_y,
                                           validation_split=0.3, batch_size=128)
        pd.DataFrame(history.history).plot(title=y_col + ' CNN loss')

        # ... but score the original (non-oversampled) rows.
        raw_train = dnn.predict(_as_sequences(train_x))
        raw_test = dnn.predict(_as_sequences(test_x))

        # Map raw outputs to [0, 1] with one scaler fitted on the training
        # scores so train and test scores are comparable; clip the test scores
        # that fall outside the training range.
        score_scaler = MinMaxScaler()
        prob_train = score_scaler.fit_transform(raw_train)
        prob_test = np.clip(score_scaler.transform(raw_test), 0.0, 1.0)

        feat_name = 'prob_' + y_col
        A.loc[test.index, feat_name] = prob_test.reshape(-1)
        A.loc[train.index, feat_name] = prob_train.reshape(-1)

    ### Stage 3: regress the cumulative play time in B (GBR) and C (CNN)
    # NOTE(review): the features used here include passionate_* / prob_passionate_*,
    # which for the training rows are derived from the target itself -- confirm
    # this leakage is intended.
    for game, members in cohorts:
        y_col = 'total_cum_playtime_' + game
        train, test = _split_cohort(A, members)
        train_x, train_y, test_x = _scale_features(train, test, y_col)

        if game == 'B':
            # Gradient boosting regressor
            gdb = GradientBoostingRegressor(random_state=0, n_estimators=500)
            gdb.fit(train_x, train_y)
            pred_train = gdb.predict(train_x)
            pred_test = gdb.predict(test_x)
        else:
            # Two-layer CNN
            train_seq = _as_sequences(train_x)
            test_seq = _as_sequences(test_x)
            cnn = _build_cnn(train_x.shape[1], conv_filters=64, n_conv_layers=2,
                             optimizer=Adam(.0001))
            _fit_with_early_stopping(cnn, train_seq, train_y,
                                     validation_split=0.2, batch_size=512)
            pred_train = cnn.predict(train_seq)
            pred_test = cnn.predict(test_seq)

        # Remove negative times and cap at the maximum time available in a month.
        pred_train = np.clip(pred_train, 0, _MAX_MONTHLY_PLAYTIME).reshape(-1)
        pred_test = np.clip(pred_test, 0, _MAX_MONTHLY_PLAYTIME).reshape(-1)

        feat_name = 'pred_' + y_col
        A.loc[train.index, feat_name] = pred_train
        A.loc[test.index, feat_name] = pred_test

    ### Assemble the result using the sample file's column names
    result = A[['pred_total_cum_playtime_B', 'prob_passionate_B',
                'pred_total_cum_playtime_C', 'prob_passionate_C']]
    # Built from the header only, so the sample file does not need to contain
    # exactly one row per user.
    submission = pd.DataFrame(result.to_numpy(), columns=list(output.columns[1:]))
    submission.insert(0, _ID_COLUMN, result.index.to_numpy())

    return submission

In [None]:
# Run the whole pipeline, resolving the data folder from the kernel's
# current working directory; `output` holds the returned DataFrame.
cwd = os.getcwd()
output = my_model(cwd=cwd)