In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve, roc_auc_score, confusion_matrix, \
    ConfusionMatrixDisplay, balanced_accuracy_score, auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import itertools
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier, XGBRegressor
from scipy.stats import uniform, randint
import xgboost as xgb
from sklearn import metrics


In [2]:
# Load the raw interaction tables from the parent directory (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("../train.csv", "../test.csv")
)

# Plain-text preview of the first rows of each table, one after the other.
print(train_data.head(), test_data.head(), sep="\n")

                                                user  track  time
0  ee8621197232afef4ae573079d64480ba7640c9eb91a7e...  41378   1.0
1  9b19f9ab816598a0809e4afd5d60800f2dbef9cbb9b03a...  44158   0.0
2  be3d629f02589a093027c0b917aa7668f22b2f89e83328...   1263   0.8
3  3d810b358ef3f88619945df4182acda92a24e99c8b28e8...   3781   0.0
4  2261b761b06d13e4ca4118d58d91eb7312ae864b2916d7...  32711   0.0
                                                user  track
0  6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...   4218
1  6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...  19007
2  6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...    346
3  6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...   8822
4  6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...  29271


In [3]:
# Aggregate features derived from the training interactions only.
# Each result is a two-column frame (grouping key + feature) ready for a key merge.

# Per user: mean of `time` over that user's training rows.
user_avg_time = (
    train_data.groupby('user')['time']
    .mean()
    .reset_index(name='user_avg_time')
)

# Per user: number of non-null `track` entries in the training rows.
user_track_count = (
    train_data.groupby('user')['track']
    .count()
    .reset_index(name='user_track_count')
)

# Per track: mean of `time` over that track's training rows.
track_avg_time = (
    train_data.groupby('track')['time']
    .mean()
    .rename('track_avg_time')
    .reset_index()
)

# Per track: number of training rows referencing the track (size() counts rows, nulls included).
track_listen_count = (
    train_data.groupby('track')
    .size()
    .rename('track_listen_count')
    .reset_index()
)



In [4]:
# Attach the per-user and per-track training aggregates to every test row.
# - Any previously merged feature columns are dropped first so the cell is
#   idempotent: re-running it would otherwise add suffixed duplicates
#   (`user_avg_time_x`, `user_avg_time_y`, ...).
# - how='left' keeps every test row; a user/track with no match in a feature
#   table gets NaN for that feature.
# - validate='many_to_one' makes pandas raise MergeError if a feature table
#   ever contains duplicate keys, which would silently multiply test rows.
test_data = (
    test_data
    .drop(
        columns=['user_avg_time', 'user_track_count', 'track_avg_time', 'track_listen_count'],
        errors='ignore',
    )
    .merge(user_avg_time, on='user', how='left', validate='many_to_one')
    .merge(user_track_count, on='user', how='left', validate='many_to_one')
    .merge(track_avg_time, on='track', how='left', validate='many_to_one')
    .merge(track_listen_count, on='track', how='left', validate='many_to_one')
)
test_data

Unnamed: 0,user,track,user_avg_time,user_track_count,track_avg_time,track_listen_count
0,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,4218,0.206739,92,0.482623,61
1,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,19007,0.206739,92,0.175429,35
2,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,346,0.206739,92,0.209697,33
3,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,8822,0.206739,92,0.137083,24
4,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,29271,0.206739,92,0.111463,41
...,...,...,...,...,...,...
43255,888df25ae35772424a560c7152a1de794440e0ea5cfee6...,36867,0.137009,117,0.183939,33
43256,888df25ae35772424a560c7152a1de794440e0ea5cfee6...,5948,0.137009,117,0.228485,33
43257,888df25ae35772424a560c7152a1de794440e0ea5cfee6...,46988,0.137009,117,0.104444,18
43258,888df25ae35772424a560c7152a1de794440e0ea5cfee6...,18083,0.137009,117,0.163000,30


In [5]:
# Attach the per-user and per-track aggregates to every training row.
# - Any previously merged feature columns are dropped first so the cell is
#   idempotent: re-running it would otherwise add suffixed duplicates
#   (`user_avg_time_x`, `user_avg_time_y`, ...).
# - how='left' keeps every training row.
# - validate='many_to_one' makes pandas raise MergeError if a feature table
#   ever contains duplicate keys, which would silently multiply training rows.
#
# NOTE(review): `user_avg_time` and `track_avg_time` are means of `time` over
# these same training rows, so each row's features include its own `time`
# value. If `time` is the prediction target, this leaks the label into the
# training features -- consider out-of-fold or leave-one-out aggregates.
# TODO confirm.
train_data = (
    train_data
    .drop(
        columns=['user_avg_time', 'user_track_count', 'track_avg_time', 'track_listen_count'],
        errors='ignore',
    )
    .merge(user_avg_time, on='user', how='left', validate='many_to_one')
    .merge(user_track_count, on='user', how='left', validate='many_to_one')
    .merge(track_avg_time, on='track', how='left', validate='many_to_one')
    .merge(track_listen_count, on='track', how='left', validate='many_to_one')
)
train_data

Unnamed: 0,user,track,time,user_avg_time,user_track_count,track_avg_time,track_listen_count
0,ee8621197232afef4ae573079d64480ba7640c9eb91a7e...,41378,1.00,0.169500,180,0.214737,38
1,9b19f9ab816598a0809e4afd5d60800f2dbef9cbb9b03a...,44158,0.00,0.148226,124,0.182069,29
2,be3d629f02589a093027c0b917aa7668f22b2f89e83328...,1263,0.80,0.201677,167,0.523857,70
3,3d810b358ef3f88619945df4182acda92a24e99c8b28e8...,3781,0.00,0.262101,138,0.201351,37
4,2261b761b06d13e4ca4118d58d91eb7312ae864b2916d7...,32711,0.00,0.189217,166,0.173600,25
...,...,...,...,...,...,...,...
1671406,9e558a7dc777d285f1384223cdb889ab060244c1826ec7...,4729,0.00,0.179826,172,0.096364,22
1671407,7a84ae249fa744b8c1acb6c5247c2cf443e31870aa7217...,20548,0.00,0.165222,180,0.148214,28
1671408,1763888c1a5b2655976fd20c3e898415b42a3afa38bd28...,35971,0.00,0.183864,132,0.143571,28
1671409,3c4dcf6dfc899bd68a7f7961e7ca5a61d2d71d500f9785...,34927,0.03,0.287872,141,0.144167,36


In [8]:
# Sanity check: total number of missing cells in the enriched test table
# (per-column counts first, then their sum). A non-zero total would indicate,
# for example, test users or tracks with no match in the feature tables.
test_data.isna().sum(axis=0).sum()

0

In [9]:
# Sanity check: total number of missing cells in the enriched training table
# (per-column counts first, then their sum).
train_data.isna().sum(axis=0).sum()

0

In [10]:
# Persist the enriched tables to the current working directory. These are
# different files from the "../train.csv" / "../test.csv" inputs read earlier.
# NOTE(review): index=True (the pandas default, spelled out here) writes the row
# index as an extra unnamed first column; readers will see it as "Unnamed: 0"
# unless they pass index_col=0. Switch to index=False if no consumer relies on it.
train_data.to_csv("train.csv", index=True)
test_data.to_csv("test.csv", index=True)