In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the training split and show how many rows it holds.
train = pd.read_csv(filepath_or_buffer='Data/train_age_dataset.csv')
train.shape[0]

488877

In [3]:
# Read the test split and show how many rows it holds.
test = pd.read_csv(filepath_or_buffer='Data/test_age_dataset.csv')
test.shape[0]

54320

In [4]:
# Read the sample submission and preview its first rows.
sample = pd.read_csv(filepath_or_buffer='Data/sample_submission.csv')
sample.head(n=5)

Unnamed: 0,prediction
0,1
1,1
2,1
3,1
4,1


## Cleaning

In [5]:
# Per-column count of missing entries in the training frame.
train.isna().sum()

Unnamed: 0                         0
userId                             0
tier                               0
gender                             0
following_rate                     0
followers_avg_age                  0
following_avg_age                  0
max_repetitive_punc                0
num_of_hashtags_per_action         0
emoji_count_per_action             0
punctuations_per_action            0
number_of_words_per_action         0
avgCompletion                      0
avgTimeSpent                       0
avgDuration                        0
avgComments                        0
creations                          0
content_views                      0
num_of_comments                    0
weekends_trails_watched_per_day    0
weekdays_trails_watched_per_day    0
slot1_trails_watched_per_day       0
slot2_trails_watched_per_day       0
slot3_trails_watched_per_day       0
slot4_trails_watched_per_day       0
avgt2                              0
age_group                          0
d

In [6]:
# Per-column count of missing entries in the test frame.
test.isna().sum()

Unnamed: 0                         0
userId                             0
tier                               0
gender                             0
following_rate                     0
followers_avg_age                  0
following_avg_age                  0
max_repetitive_punc                0
num_of_hashtags_per_action         0
emoji_count_per_action             0
punctuations_per_action            0
number_of_words_per_action         0
avgCompletion                      0
avgTimeSpent                       0
avgDuration                        0
avgComments                        0
creations                          0
content_views                      0
num_of_comments                    0
weekends_trails_watched_per_day    0
weekdays_trails_watched_per_day    0
slot1_trails_watched_per_day       0
slot2_trails_watched_per_day       0
slot3_trails_watched_per_day       0
slot4_trails_watched_per_day       0
avgt2                              0
dtype: int64

### Why are there no null values?

In [7]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0.1,Unnamed: 0,userId,tier,gender,following_rate,followers_avg_age,following_avg_age,max_repetitive_punc,num_of_hashtags_per_action,emoji_count_per_action,...,content_views,num_of_comments,weekends_trails_watched_per_day,weekdays_trails_watched_per_day,slot1_trails_watched_per_day,slot2_trails_watched_per_day,slot3_trails_watched_per_day,slot4_trails_watched_per_day,avgt2,age_group
0,265153,48958844,2,1,0.0,0.0,0.0,0,0.0,0.0,...,0.2,0.0,0.041667,0.025,0.0,0.0,0.175,0.033333,0.0,1
1,405231,51100441,2,2,0.0,0.0,0.0,0,0.0,0.0,...,0.09322,0.0,0.012712,0.018644,0.0,0.084746,0.0,0.033898,82.5,2
2,57867,6887426,2,1,0.0,0.0,0.0,0,0.0,0.0,...,0.002786,0.0,0.0,0.000557,0.0,0.002786,0.0,0.0,0.0,1
3,272618,50742404,2,1,0.0,0.0,0.0,0,0.0,0.0,...,0.008403,0.0,0.0,0.001681,0.0,0.0,0.0,0.008403,0.0,1
4,251123,45589200,2,2,0.0,0.0,0.0,0,0.0,0.0,...,0.204918,0.0,0.0,0.04918,0.0,0.008197,0.057377,0.180328,0.0,1


It looks like the null values have been filled with zeros.

In [11]:
# Count how often each 'Unnamed: 0' value occurs, then tally those counts:
# a single bucket at 1 means no value is repeated.
unnamed = train.loc[:, 'Unnamed: 0'].value_counts()
unnamed.value_counts()

1    488877
Name: Unnamed: 0, dtype: int64

Every value in `Unnamed: 0` occurs exactly once, so the column is redundant; it is dropped when the feature matrix is built.

Checking whether zero is being used as the replacement for null values.

In [14]:
# Number of rows per distinct followers_avg_age value, ordered by value.
train.groupby(by=['followers_avg_age']).size()

followers_avg_age
0.000000    406543
1.000000      7734
1.071429         1
1.090909         2
1.100000         4
             ...  
3.600000         8
3.666667        70
3.750000        10
3.800000         2
4.000000      2301
Length: 3495, dtype: int64

In [16]:
# Same tally as the groupby, but ordered by frequency (most common first).
train.loc[:, 'followers_avg_age'].value_counts()

0.000000    406543
2.000000     15724
1.000000      7734
3.000000      7446
1.500000      4342
             ...  
2.072119         1
1.916049         1
2.186813         1
1.880435         1
1.830846         1
Name: followers_avg_age, Length: 3495, dtype: int64

Zero definitely looks like the replacement for null values. Apart from that, this column also appears to have been rescaled: its values have been shifted to a new mean and its range has been shrunk (the non-zero values lie between 1 and 4).

In [15]:
# Class balance of the age_group column.
train.loc[:, 'age_group'].value_counts()

1    308315
4     60803
3     60404
2     59355
Name: age_group, dtype: int64

Judging by the age group categories given here, I am inclined to believe that people will generally follow people in their own age group

I can also see that some of the values in the avg_age column look like clean divisions by 3 and others by 5, so my guess is that the range was shrunk by dividing by 30. To re-center the values, let's assume the lowest possible age is 10 years: the smallest non-zero value, 1, becomes 30 after multiplying by 30, so we subtract 20 from everything (corrected = 30 * value - 20).

In [17]:
# Undo the guessed scaling of followers_avg_age: multiply by 30, then shift by -20.
# Rows whose original value is 0 end up at -20 under this mapping.
train['followers_avg_age_corrected'] = train['followers_avg_age'].mul(30).sub(20)

In [18]:
# Number of rows per distinct corrected value, to sanity-check the rescaling.
train.groupby(by=['followers_avg_age_corrected']).size()

followers_avg_age_corrected
-20.000000     406543
 10.000000       7734
 12.142857          1
 12.727273          2
 13.000000          4
                ...  
 88.000000          8
 90.000000         70
 92.500000         10
 94.000000          2
 100.000000      2301
Length: 3495, dtype: int64

In [40]:
# Apply the same rescaling to the test frame (x30, then -20) so it carries the
# same derived column as the training frame.
test['followers_avg_age_corrected'] = test['followers_avg_age'].mul(30).sub(20)

## Model Testing (quick, informal baseline)

In [34]:
from sklearn.metrics import f1_score
import lightgbm as lgb

In [26]:
# Features: every column except 'Unnamed: 0', the user id and the target.
X = train.drop(columns=['Unnamed: 0', 'userId', 'age_group'])
# Target: the age_group column.
Y = train.loc[:, 'age_group']

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for validation; the fixed seed makes the split repeatable.
X_Train, X_CV, Y_Train, Y_CV = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=0,
)

In [49]:
# Row counts per age_group class (same figures as the value_counts output).
# NOTE(review): this dict is not passed to the classifier in this cell — confirm
# whether it is meant to be used; as raw counts it would favour the largest class.
class_weights = {1:308315, 2:59355, 3:60404, 4:60803}

# LightGBM multiclass model; both the training and the hold-out split are
# evaluated each round, and training halts after 200 rounds without improvement.
clf = lgb.LGBMClassifier(
    objective='multiclass',
    learning_rate=0.17,
    n_estimators=1000,
    n_jobs=8,
)
clf.fit(
    X_Train,
    Y_Train,
    eval_set=[(X_Train, Y_Train), (X_CV, Y_CV)],
    early_stopping_rounds=200,
    verbose=True,
)

[1]	training's multi_logloss: 0.958286	valid_1's multi_logloss: 0.955577
Training until validation scores don't improve for 200 rounds
[2]	training's multi_logloss: 0.882877	valid_1's multi_logloss: 0.88027
[3]	training's multi_logloss: 0.826215	valid_1's multi_logloss: 0.823755
[4]	training's multi_logloss: 0.781994	valid_1's multi_logloss: 0.779647
[5]	training's multi_logloss: 0.747292	valid_1's multi_logloss: 0.745082
[6]	training's multi_logloss: 0.718455	valid_1's multi_logloss: 0.716364
[7]	training's multi_logloss: 0.695343	valid_1's multi_logloss: 0.693318
[8]	training's multi_logloss: 0.676224	valid_1's multi_logloss: 0.67428
[9]	training's multi_logloss: 0.66061	valid_1's multi_logloss: 0.658728
[10]	training's multi_logloss: 0.646809	valid_1's multi_logloss: 0.644909
[11]	training's multi_logloss: 0.635898	valid_1's multi_logloss: 0.634153
[12]	training's multi_logloss: 0.626164	valid_1's multi_logloss: 0.624478
[13]	training's multi_logloss: 0.617532	valid_1's multi_loglos

[112]	training's multi_logloss: 0.524198	valid_1's multi_logloss: 0.536953
[113]	training's multi_logloss: 0.523922	valid_1's multi_logloss: 0.53684
[114]	training's multi_logloss: 0.523742	valid_1's multi_logloss: 0.536843
[115]	training's multi_logloss: 0.523527	valid_1's multi_logloss: 0.536777
[116]	training's multi_logloss: 0.523284	valid_1's multi_logloss: 0.53669
[117]	training's multi_logloss: 0.523111	valid_1's multi_logloss: 0.536656
[118]	training's multi_logloss: 0.522893	valid_1's multi_logloss: 0.536603
[119]	training's multi_logloss: 0.522667	valid_1's multi_logloss: 0.536574
[120]	training's multi_logloss: 0.522296	valid_1's multi_logloss: 0.536336
[121]	training's multi_logloss: 0.522089	valid_1's multi_logloss: 0.536304
[122]	training's multi_logloss: 0.521891	valid_1's multi_logloss: 0.536282
[123]	training's multi_logloss: 0.521592	valid_1's multi_logloss: 0.53613
[124]	training's multi_logloss: 0.521148	valid_1's multi_logloss: 0.53585
[125]	training's multi_loglos

[221]	training's multi_logloss: 0.501189	valid_1's multi_logloss: 0.531434
[222]	training's multi_logloss: 0.501022	valid_1's multi_logloss: 0.531435
[223]	training's multi_logloss: 0.500852	valid_1's multi_logloss: 0.531424
[224]	training's multi_logloss: 0.500662	valid_1's multi_logloss: 0.531414
[225]	training's multi_logloss: 0.500522	valid_1's multi_logloss: 0.531404
[226]	training's multi_logloss: 0.500369	valid_1's multi_logloss: 0.531397
[227]	training's multi_logloss: 0.500217	valid_1's multi_logloss: 0.531387
[228]	training's multi_logloss: 0.500046	valid_1's multi_logloss: 0.531366
[229]	training's multi_logloss: 0.499853	valid_1's multi_logloss: 0.531302
[230]	training's multi_logloss: 0.499685	valid_1's multi_logloss: 0.531309
[231]	training's multi_logloss: 0.499525	valid_1's multi_logloss: 0.531314
[232]	training's multi_logloss: 0.499301	valid_1's multi_logloss: 0.531261
[233]	training's multi_logloss: 0.499112	valid_1's multi_logloss: 0.531209
[234]	training's multi_lo

[330]	training's multi_logloss: 0.482931	valid_1's multi_logloss: 0.529157
[331]	training's multi_logloss: 0.482774	valid_1's multi_logloss: 0.529102
[332]	training's multi_logloss: 0.482622	valid_1's multi_logloss: 0.529122
[333]	training's multi_logloss: 0.482476	valid_1's multi_logloss: 0.529073
[334]	training's multi_logloss: 0.482321	valid_1's multi_logloss: 0.529063
[335]	training's multi_logloss: 0.482157	valid_1's multi_logloss: 0.529075
[336]	training's multi_logloss: 0.481992	valid_1's multi_logloss: 0.529065
[337]	training's multi_logloss: 0.481843	valid_1's multi_logloss: 0.529027
[338]	training's multi_logloss: 0.481709	valid_1's multi_logloss: 0.529005
[339]	training's multi_logloss: 0.481551	valid_1's multi_logloss: 0.529
[340]	training's multi_logloss: 0.481394	valid_1's multi_logloss: 0.529043
[341]	training's multi_logloss: 0.481257	valid_1's multi_logloss: 0.529026
[342]	training's multi_logloss: 0.48114	valid_1's multi_logloss: 0.529035
[343]	training's multi_loglos

[441]	training's multi_logloss: 0.466844	valid_1's multi_logloss: 0.528273
[442]	training's multi_logloss: 0.466708	valid_1's multi_logloss: 0.528271
[443]	training's multi_logloss: 0.466571	valid_1's multi_logloss: 0.528296
[444]	training's multi_logloss: 0.46639	valid_1's multi_logloss: 0.528204
[445]	training's multi_logloss: 0.466235	valid_1's multi_logloss: 0.52817
[446]	training's multi_logloss: 0.466128	valid_1's multi_logloss: 0.528177
[447]	training's multi_logloss: 0.465995	valid_1's multi_logloss: 0.528198
[448]	training's multi_logloss: 0.465818	valid_1's multi_logloss: 0.528144
[449]	training's multi_logloss: 0.465667	valid_1's multi_logloss: 0.52816
[450]	training's multi_logloss: 0.465524	valid_1's multi_logloss: 0.528162
[451]	training's multi_logloss: 0.46541	valid_1's multi_logloss: 0.528199
[452]	training's multi_logloss: 0.465279	valid_1's multi_logloss: 0.528215
[453]	training's multi_logloss: 0.465146	valid_1's multi_logloss: 0.528206
[454]	training's multi_loglos

[551]	training's multi_logloss: 0.451861	valid_1's multi_logloss: 0.528163
[552]	training's multi_logloss: 0.451728	valid_1's multi_logloss: 0.528154
[553]	training's multi_logloss: 0.451597	valid_1's multi_logloss: 0.528112
[554]	training's multi_logloss: 0.451465	valid_1's multi_logloss: 0.528135
[555]	training's multi_logloss: 0.45134	valid_1's multi_logloss: 0.528129
[556]	training's multi_logloss: 0.451228	valid_1's multi_logloss: 0.528134
[557]	training's multi_logloss: 0.451097	valid_1's multi_logloss: 0.528111
[558]	training's multi_logloss: 0.450981	valid_1's multi_logloss: 0.528116
[559]	training's multi_logloss: 0.450871	valid_1's multi_logloss: 0.528127
[560]	training's multi_logloss: 0.450744	valid_1's multi_logloss: 0.528142
[561]	training's multi_logloss: 0.450583	valid_1's multi_logloss: 0.528102
[562]	training's multi_logloss: 0.450435	valid_1's multi_logloss: 0.528111
[563]	training's multi_logloss: 0.450318	valid_1's multi_logloss: 0.528104
[564]	training's multi_log

[661]	training's multi_logloss: 0.438516	valid_1's multi_logloss: 0.528217
[662]	training's multi_logloss: 0.43838	valid_1's multi_logloss: 0.528214
[663]	training's multi_logloss: 0.438257	valid_1's multi_logloss: 0.528218
[664]	training's multi_logloss: 0.438138	valid_1's multi_logloss: 0.528225
[665]	training's multi_logloss: 0.438017	valid_1's multi_logloss: 0.528215
[666]	training's multi_logloss: 0.437922	valid_1's multi_logloss: 0.528239
[667]	training's multi_logloss: 0.437814	valid_1's multi_logloss: 0.528271
[668]	training's multi_logloss: 0.437714	valid_1's multi_logloss: 0.528257
[669]	training's multi_logloss: 0.43758	valid_1's multi_logloss: 0.528247
[670]	training's multi_logloss: 0.437472	valid_1's multi_logloss: 0.528252
[671]	training's multi_logloss: 0.437353	valid_1's multi_logloss: 0.528254
[672]	training's multi_logloss: 0.437218	valid_1's multi_logloss: 0.52825
[673]	training's multi_logloss: 0.437119	valid_1's multi_logloss: 0.528263
[674]	training's multi_loglo

LGBMClassifier(learning_rate=0.17, n_estimators=1000, n_jobs=8,
               objective='multiclass')

In [50]:
# Weighted F1 of the LightGBM model on the hold-out split.
pred = clf.predict(X_CV)
lgb_cv_f1 = f1_score(y_true=Y_CV, y_pred=pred, average='weighted')
print(lgb_cv_f1)

0.7482203984345434


In [51]:
from xgboost import XGBClassifier

# XGBoost multiclass model with the same tree count, learning rate and thread
# count as the LightGBM classifier.
# 'multiclass' is LightGBM's objective name and is not an XGBoost objective; the
# fitted estimator reports 'multi:softprob', so request that explicitly instead
# of relying on the wrapper to replace an invalid value.
# NOTE(review): newer XGBoost releases may expect class labels 0..K-1 while Y
# here holds 1..4 — confirm against the installed version.
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.17, n_jobs=8, objective = 'multi:softprob')

# Evaluate on both the training and hold-out splits each round; stop once the
# hold-out metric has not improved for 200 rounds.
xgb.fit(X_Train, Y_Train, early_stopping_rounds = 200, eval_set = [(X_Train, Y_Train), (X_CV, Y_CV)], verbose = True)

[0]	validation_0-merror:0.29528	validation_1-merror:0.29403
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 200 rounds.
[1]	validation_0-merror:0.29435	validation_1-merror:0.29369
[2]	validation_0-merror:0.29180	validation_1-merror:0.29128
[3]	validation_0-merror:0.28944	validation_1-merror:0.28935
[4]	validation_0-merror:0.28881	validation_1-merror:0.28927
[5]	validation_0-merror:0.28686	validation_1-merror:0.28719
[6]	validation_0-merror:0.28468	validation_1-merror:0.28566
[7]	validation_0-merror:0.28440	validation_1-merror:0.28578
[8]	validation_0-merror:0.28383	validation_1-merror:0.28528
[9]	validation_0-merror:0.28364	validation_1-merror:0.28496
[10]	validation_0-merror:0.28194	validation_1-merror:0.28266
[11]	validation_0-merror:0.28130	validation_1-merror:0.28224
[12]	validation_0-merror:0.28052	validation_1-merror:0.28211
[13]	validation_0-merror:0.27895	validation_1-merror:

[132]	validation_0-merror:0.22788	validation_1-merror:0.24840
[133]	validation_0-merror:0.22764	validation_1-merror:0.24824
[134]	validation_0-merror:0.22747	validation_1-merror:0.24829
[135]	validation_0-merror:0.22725	validation_1-merror:0.24836
[136]	validation_0-merror:0.22716	validation_1-merror:0.24831
[137]	validation_0-merror:0.22702	validation_1-merror:0.24817
[138]	validation_0-merror:0.22691	validation_1-merror:0.24839
[139]	validation_0-merror:0.22674	validation_1-merror:0.24844
[140]	validation_0-merror:0.22654	validation_1-merror:0.24844
[141]	validation_0-merror:0.22615	validation_1-merror:0.24849
[142]	validation_0-merror:0.22602	validation_1-merror:0.24866
[143]	validation_0-merror:0.22581	validation_1-merror:0.24855
[144]	validation_0-merror:0.22564	validation_1-merror:0.24854
[145]	validation_0-merror:0.22550	validation_1-merror:0.24839
[146]	validation_0-merror:0.22517	validation_1-merror:0.24842
[147]	validation_0-merror:0.22492	validation_1-merror:0.24839
[148]	va

[265]	validation_0-merror:0.20536	validation_1-merror:0.24536
[266]	validation_0-merror:0.20531	validation_1-merror:0.24527
[267]	validation_0-merror:0.20516	validation_1-merror:0.24509
[268]	validation_0-merror:0.20505	validation_1-merror:0.24515
[269]	validation_0-merror:0.20498	validation_1-merror:0.24516
[270]	validation_0-merror:0.20484	validation_1-merror:0.24520
[271]	validation_0-merror:0.20472	validation_1-merror:0.24532
[272]	validation_0-merror:0.20466	validation_1-merror:0.24543
[273]	validation_0-merror:0.20459	validation_1-merror:0.24540
[274]	validation_0-merror:0.20443	validation_1-merror:0.24535
[275]	validation_0-merror:0.20431	validation_1-merror:0.24523
[276]	validation_0-merror:0.20405	validation_1-merror:0.24521
[277]	validation_0-merror:0.20390	validation_1-merror:0.24516
[278]	validation_0-merror:0.20384	validation_1-merror:0.24525
[279]	validation_0-merror:0.20371	validation_1-merror:0.24531
[280]	validation_0-merror:0.20353	validation_1-merror:0.24535
[281]	va

[398]	validation_0-merror:0.18750	validation_1-merror:0.24527
[399]	validation_0-merror:0.18738	validation_1-merror:0.24524
[400]	validation_0-merror:0.18721	validation_1-merror:0.24532
[401]	validation_0-merror:0.18711	validation_1-merror:0.24544
[402]	validation_0-merror:0.18704	validation_1-merror:0.24538
[403]	validation_0-merror:0.18691	validation_1-merror:0.24532
[404]	validation_0-merror:0.18679	validation_1-merror:0.24519
[405]	validation_0-merror:0.18661	validation_1-merror:0.24517
[406]	validation_0-merror:0.18648	validation_1-merror:0.24519
[407]	validation_0-merror:0.18635	validation_1-merror:0.24521
[408]	validation_0-merror:0.18628	validation_1-merror:0.24515
[409]	validation_0-merror:0.18615	validation_1-merror:0.24531
[410]	validation_0-merror:0.18603	validation_1-merror:0.24524
[411]	validation_0-merror:0.18594	validation_1-merror:0.24515
[412]	validation_0-merror:0.18578	validation_1-merror:0.24516
[413]	validation_0-merror:0.18564	validation_1-merror:0.24499
[414]	va

[531]	validation_0-merror:0.17268	validation_1-merror:0.24490
[532]	validation_0-merror:0.17255	validation_1-merror:0.24485
[533]	validation_0-merror:0.17244	validation_1-merror:0.24479
[534]	validation_0-merror:0.17241	validation_1-merror:0.24487
[535]	validation_0-merror:0.17235	validation_1-merror:0.24479
[536]	validation_0-merror:0.17223	validation_1-merror:0.24475
[537]	validation_0-merror:0.17212	validation_1-merror:0.24478
[538]	validation_0-merror:0.17199	validation_1-merror:0.24476
[539]	validation_0-merror:0.17190	validation_1-merror:0.24474
[540]	validation_0-merror:0.17181	validation_1-merror:0.24465
[541]	validation_0-merror:0.17179	validation_1-merror:0.24465
[542]	validation_0-merror:0.17165	validation_1-merror:0.24476
[543]	validation_0-merror:0.17154	validation_1-merror:0.24478
[544]	validation_0-merror:0.17139	validation_1-merror:0.24480
[545]	validation_0-merror:0.17131	validation_1-merror:0.24483
[546]	validation_0-merror:0.17128	validation_1-merror:0.24483
[547]	va

[664]	validation_0-merror:0.15930	validation_1-merror:0.24460
[665]	validation_0-merror:0.15909	validation_1-merror:0.24457
[666]	validation_0-merror:0.15900	validation_1-merror:0.24446
[667]	validation_0-merror:0.15882	validation_1-merror:0.24449
[668]	validation_0-merror:0.15870	validation_1-merror:0.24442
[669]	validation_0-merror:0.15855	validation_1-merror:0.24444
[670]	validation_0-merror:0.15844	validation_1-merror:0.24441
[671]	validation_0-merror:0.15837	validation_1-merror:0.24448
[672]	validation_0-merror:0.15827	validation_1-merror:0.24450
[673]	validation_0-merror:0.15818	validation_1-merror:0.24446
[674]	validation_0-merror:0.15810	validation_1-merror:0.24459
[675]	validation_0-merror:0.15802	validation_1-merror:0.24457
[676]	validation_0-merror:0.15795	validation_1-merror:0.24453
[677]	validation_0-merror:0.15788	validation_1-merror:0.24463
[678]	validation_0-merror:0.15783	validation_1-merror:0.24468
[679]	validation_0-merror:0.15773	validation_1-merror:0.24468
Stopping

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.17, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [52]:
# Weighted F1 of the XGBoost model on the hold-out split.
pred = xgb.predict(X_CV)
xgb_cv_f1 = f1_score(y_true=Y_CV, y_pred=pred, average='weighted')
print(xgb_cv_f1)

0.7537626945798784


In [56]:
# Drop the same non-feature columns that were removed from the training frame
# (the test frame has no age_group column to drop), then predict with XGBoost.
test_features = test.drop(columns=['Unnamed: 0', 'userId'])
# Wrap the predicted classes in a one-column frame named 'prediction'.
test_pred = pd.DataFrame({'prediction': xgb.predict(test_features)})

In [57]:
# Preview the first five predictions.
test_pred.head(n=5)

Unnamed: 0,prediction
0,1
1,1
2,1
3,2
4,1


In [58]:
# Write the predictions to disk without the index column.
# Create the output folder first: to_csv does not create missing directories,
# and failing here would discard the result of the long training run above.
Path('Preds').mkdir(parents=True, exist_ok=True)
test_pred.to_csv('Preds/Submission3.csv', index = False)