In [1]:
import joblib
# from sympy.core.parameters import distribute

# Load the previously persisted model from a path relative to the current working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load model artefacts that come from a trusted source.
model = joblib.load('model/mlp_model.pkl')

In [2]:
# Display the hyperparameters stored on the loaded model as a sanity check of what was persisted.
model.get_params()

{'activation': 'relu',
 'alpha': 0.01,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (700,),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 300,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': 42,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 3,
 'warm_start': False}

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the review dataset (one review per row).
df = pd.read_csv("../final_dataset.csv")

# Convert the review text into a TF-IDF document-term matrix whose vocabulary
# is capped at `num_features` terms.
# NOTE(review): the vectorizer is fitted here instead of being loaded alongside
# the model; the feature columns only line up with the model if it was trained
# on the same vocabulary — confirm.
num_features = 3000
X = df["review_text"]
vect = TfidfVectorizer(max_features=num_features)
X_train_dtm = vect.fit_transform(X)

# Expose each review's original row number as an explicit `index` column.
df = df.reset_index()

# Group by app_id and compute, per group, the number of reviews and the
# range of row numbers (first and last) that the group occupies.
reviews_by_app = df.groupby("app_id")
comment_summary = reviews_by_app.agg(
    count=("index", "size"),
    start_index=("index", "min"),
    end_index=("index", "max"),
)
comment_summary = comment_summary.reset_index()

# Print the summary table.
print(comment_summary)

    app_id  count  start_index  end_index
0       10  11505            0      11504
1      240  17181       872609     889789
2      400  16144      1403849    1419992
3      440  12112      1447712    1459823
4      550  50306      1518978    1569283
..     ...    ...          ...        ...
81  359550  17529      1287924    1305452
82  363970  14105      1305453    1319557
83  381210  18428      1319558    1337985
84  383870  14370      1337986    1352355
85  391540  51493      1352356    1403848

[86 rows x 4 columns]


In [20]:
# Emotion distributions: for every game, count how many of its reviews fall
# into each of ten equal-width buckets of the model's predicted probability
# for the first class (column 0 of predict_proba).
# NOTE(review): which label column 0 stands for depends on model.classes_ — confirm.
# Each resulting row is [app_id, n[0.0,0.1), n[0.1,0.2), ..., n[0.9,1.0]].
import numpy as np

N_BINS = 10
BATCH_SIZE = 10000  # rows scored per predict_proba call; bounds peak memory

emotion_distributes = []
for app_id, start, end, count in zip(
    comment_summary['app_id'],
    comment_summary['start_index'],
    comment_summary['end_index'],
    comment_summary['count'],
):
    start, end, count = int(start), int(end), int(count)

    # The slice below is only valid if this app's reviews occupy one
    # contiguous run of rows; fail loudly instead of scoring the wrong rows.
    if end - start + 1 != count:
        raise ValueError(
            "reviews for app_id {} are not contiguous: rows {}..{} but count={}".format(
                app_id, start, end, count
            )
        )

    bucket_counts = np.zeros(N_BINS, dtype=np.int64)
    stop = start + count
    # Score the rows in batches rather than one predict_proba call per review.
    for lo in range(start, stop, BATCH_SIZE):
        hi = min(lo + BATCH_SIZE, stop)
        proba = model.predict_proba(X_train_dtm[lo:hi])[:, 0]
        # Truncate p*10 to a bucket index; clamp so p == 1.0 lands in the
        # last bucket ([0.9, 1.0]) instead of overflowing past it.
        bins = np.minimum((proba * N_BINS).astype(np.int64), N_BINS - 1)
        bucket_counts += np.bincount(bins, minlength=N_BINS)

    # Store plain Python ints so the rows look the same as before.
    emotion_distributes.append([int(app_id)] + bucket_counts.tolist())

[[10, 8090, 1869, 796, 485, 126, 51, 50, 14, 11, 13],
 [240, 12746, 2422, 925, 495, 262, 129, 83, 51, 51, 17],
 [400, 12487, 2319, 772, 318, 105, 72, 51, 14, 5, 1],
 [440, 9232, 1493, 634, 323, 199, 94, 64, 45, 19, 9],
 [550, 40385, 5871, 2135, 925, 443, 270, 148, 80, 37, 12],
 [570, 49409, 11653, 4417, 2600, 1348, 827, 370, 209, 149, 67],
 [620, 30199, 5027, 1837, 820, 337, 170, 89, 51, 7, 9],
 [1250, 17878, 2300, 827, 379, 192, 90, 48, 31, 13, 2],
 [4000, 21739, 3552, 1109, 645, 315, 183, 128, 35, 11, 2],
 [8190, 11743, 1485, 501, 249, 144, 80, 45, 18, 4, 2],
 [8870, 25729, 3212, 1054, 476, 198, 109, 66, 32, 9, 1],
 [22380, 26993, 3408, 1171, 704, 245, 110, 66, 23, 16, 3],
 [48700, 21776, 2720, 894, 471, 171, 99, 63, 43, 15, 5],
 [49520, 17368, 2260, 797, 390, 148, 68, 58, 25, 16, 0],
 [50300, 9724, 1243, 420, 180, 93, 52, 34, 14, 7, 0],
 [104900, 9309, 1343, 411, 203, 107, 58, 27, 13, 10, 3],
 [105600, 66223, 10201, 3858, 2042, 902, 457, 293, 160, 75, 23],
 [107100, 12889, 1106, 367

In [22]:
# Normalization: convert each game's ten bucket counts into shares that sum
# to 1, keeping the app_id in the first position of every row.
# New rows are built instead of overwriting `emotion_distributes`, so the raw
# counts stay available and re-running this cell gives the same result.
emotion_distributes_pro = []
for row in emotion_distributes:
    app_id = row[0]
    counts = row[1:11]  # the ten bucket counts following the app_id
    total = sum(counts)
    if total:
        shares = [c / total for c in counts]
    else:
        # No reviews counted for this game: report an all-zero distribution
        # rather than dividing by zero.
        shares = [0.0] * len(counts)
    emotion_distributes_pro.append([app_id] + shares)
emotion_distributes_pro

[[10,
  0.7031725336810083,
  0.16245110821382008,
  0.06918730986527596,
  0.042155584528465885,
  0.010951760104302478,
  0.004432855280312908,
  0.004345936549326379,
  0.0012168622338113862,
  0.0009561060408518036,
  0.0011299435028248588],
 [240,
  0.7418660147837728,
  0.14096967580466796,
  0.05383854257610151,
  0.028810895756940808,
  0.015249403410744426,
  0.00750829404574821,
  0.004830917874396135,
  0.002968395320412083,
  0.002968395320412083,
  0.0009894651068040278],
 [400,
  0.77347621407334,
  0.14364469772051536,
  0.047819623389494546,
  0.019697720515361745,
  0.00650396432111001,
  0.0044598612487611496,
  0.0031590683845391476,
  0.000867195242814668,
  0.0003097125867195243,
  6.194251734390486e-05],
 [440,
  0.76221928665786,
  0.1232661822985469,
  0.0523447820343461,
  0.02666776750330251,
  0.01642998678996037,
  0.007760898282694848,
  0.005284015852047556,
  0.0037153236459709377,
  0.0015686922060766182,
  0.0007430647291941875],
 [550,
  0.802786943903

In [23]:
import pandas as pd
# Column labels for the ten probability buckets, in the same order as the
# values that follow app_id in each row.
bucket_columns = [
    "0.0-0.1", "0.1-0.2", "0.2-0.3", "0.3-0.4", "0.4-0.5",
    "0.5-0.6", "0.6-0.7", "0.7-0.8", "0.8-0.9", "0.9-1.0",
]

# Assemble the per-app distribution table and write it out without the row index.
df_result = pd.DataFrame(
    emotion_distributes_pro,
    columns=["app_id"] + bucket_columns,
)
df_result.to_csv('../model_1_output.csv', index=False)