# Baseline 

Features not included in this baseline model: ['os', 'osv', 'lan', 'sid', 'timestamp', 'version']

Score = 86.714

## 1 Import Packages

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

## 2 Load Data

In [2]:
# Read the training and test CSVs. The first column of each file is the
# 'Unnamed' column, so it is sliced off by position right after loading.
train, test = (
    pd.read_csv(csv_path).iloc[:, 1:]
    for csv_path in ('train.csv', 'test1.csv')
)

# Preview the first rows of the training frame
train.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,lan,media_id,ntt,os,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type
0,316361,1199,46000.0,0.0,0.0,0.0,1,,104,6.0,android,9,18,1438873,1559893000000.0,8,2135019403,0,2329670524,601
1,135939,893,0.0,0.0,0.0,0.0,1,,19,6.0,android,8.1,0,1185582,1559994000000.0,4,2782306428,1,2864801071,1000
2,399254,821,0.0,760.0,0.0,360.0,1,,559,0.0,android,8.1.0,0,1555716,1559837000000.0,0,1392806005,2,628911675,696
3,68983,1004,46000.0,2214.0,0.0,1080.0,0,,129,2.0,android,8.1.0,0,1093419,1560042000000.0,0,3562553457,3,1283809327,753
4,288999,1076,46000.0,2280.0,0.0,1080.0,1,zh-CN,64,2.0,android,8.0.0,0,1400089,1559867000000.0,5,2364522023,4,1510695983,582


In [3]:
# Copy the dataset.
# A bare assignment would only bind a second name to the same DataFrame, so an
# in-place edit of `features` / `test_features` would also change the raw
# `train` / `test` frames. `.copy()` gives independent working frames.
# The 'label' column stays in `features` at this point.
features = train.copy()
test_features = test.copy()

## 3 Explore the Dataset

In [4]:
# Print a concise summary of the training frame: number of rows, column names,
# non-null counts and dtypes
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 20 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   android_id  500000 non-null  int64  
 1   apptype     500000 non-null  int64  
 2   carrier     500000 non-null  float64
 3   dev_height  500000 non-null  float64
 4   dev_ppi     500000 non-null  float64
 5   dev_width   500000 non-null  float64
 6   label       500000 non-null  int64  
 7   lan         316720 non-null  object 
 8   media_id    500000 non-null  int64  
 9   ntt         500000 non-null  float64
 10  os          500000 non-null  object 
 11  osv         493439 non-null  object 
 12  package     500000 non-null  int64  
 13  sid         500000 non-null  int64  
 14  timestamp   500000 non-null  float64
 15  version     500000 non-null  object 
 16  fea_hash    500000 non-null  object 
 17  location    500000 non-null  int64  
 18  fea1_hash   500000 non-null  int64  
 19  cu

It can be seen that there are several features of type 'object'.

We want to convert these features to numeric types before training the model. 

In [5]:
# Show which columns are stored with dtype 'object'
print(
    features
    .select_dtypes(include=['object'])
    .columns
)

Index(['lan', 'os', 'osv', 'version', 'fea_hash'], dtype='object')


In [6]:
# Generate summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns
features.describe()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,media_id,ntt,package,sid,timestamp,location,fea1_hash,cus_type
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,261359.275126,986.64011,40028.788034,1264.986626,72.027966,703.486166,0.48448,124.08762,3.089808,38.465876,1500335.0,1559814000000.0,96.040504,2300866000.0,730.824682
std,233616.172774,128.956348,15460.788899,853.37133,167.66493,505.751343,0.49976,164.25454,1.843088,136.321129,288429.2,168073500.0,85.65274,1236593000.0,331.946854
min,0.0,95.0,-1.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1000005.0,1559491000000.0,-1.0,12400.0,297.0
25%,0.0,917.0,46000.0,720.0,0.0,360.0,0.0,29.0,2.0,0.0,1250850.0,1559664000000.0,23.0,1376752000.0,411.0
50%,228563.0,1001.0,46000.0,1280.0,0.0,720.0,0.0,64.0,2.0,7.0,1500358.0,1559816000000.0,64.0,2490131000.0,658.0
75%,465701.5,1076.0,46000.0,2040.0,0.0,1080.0,1.0,139.0,5.0,24.0,1750028.0,1559964000000.0,154.0,3062465000.0,1019.0
max,709898.0,1241.0,46003.0,9024.0,720.0,8832.0,1.0,1544.0,7.0,2327.0,1999999.0,1560096000000.0,330.0,4291920000.0,1380.0


In [7]:
# Count the missing values in every column
features.isna().sum()

android_id         0
apptype            0
carrier            0
dev_height         0
dev_ppi            0
dev_width          0
label              0
lan           183280
media_id           0
ntt                0
os                 0
osv             6561
package            0
sid                0
timestamp          0
version            0
fea_hash           0
location           0
fea1_hash          0
cus_type           0
dtype: int64

In [8]:
# Print the number of distinct (non-null) values held by every column
for column_name, distinct_count in features.nunique().items():
    print(column_name, distinct_count)

android_id 362258
apptype 89
carrier 5
dev_height 798
dev_ppi 92
dev_width 346
label 2
lan 21
media_id 284
ntt 8
os 2
osv 154
package 1950
sid 500000
timestamp 500000
version 22
fea_hash 402980
location 332
fea1_hash 4959
cus_type 58


In [9]:
# 'carrier' has only 5 distinct values: show how often each one occurs
features['carrier'].value_counts()

 46000.0    359409
 46001.0     43390
 0.0         40652
 46003.0     32294
-1.0         24255
Name: carrier, dtype: int64

In [10]:
# 'ntt' has only 8 distinct values: show how often each one occurs
features['ntt'].value_counts()

2.0    318597
6.0    116548
5.0     35021
0.0     20617
3.0      4463
4.0      4446
7.0       306
1.0         2
Name: ntt, dtype: int64

In [11]:
# 'os' has only 2 distinct values: show how often each one occurs
features['os'].value_counts()

android    303175
Android    196825
Name: os, dtype: int64

## 4 Data Processing

### 4.1 'sid'

Feature 'sid' is the user id. It is unique for every row (500,000 distinct values in 500,000 rows), so it carries no signal for classification, and we remove it from both the training set and the test set.

In [12]:
# Remove the 'sid' column from the training and the test features
features = features.drop(columns=['sid'])
test_features = test_features.drop(columns=['sid'])

### 4.2 'os'

In [13]:
# Count how often each distinct value of 'os' occurs
features['os'].value_counts()

android    303175
Android    196825
Name: os, dtype: int64

It can be seen that the only values in 'os' are 'android' and 'Android', which denote the same operating system. The column is therefore treated as constant and meaningless for classification, so we remove 'os' from both the training set and the test set.

In [14]:
# Remove the 'os' column from the training and the test features
features = features.drop(columns=['os'])
test_features = test_features.drop(columns=['os'])

### 4.3 'fea_hash'
'fea_hash' is the user characteristic code (its specific physical meaning is omitted) and needs to be encoded into something useful.

In [15]:
# Frequency of each distinct 'fea_hash' value, most common first
features['fea_hash'].value_counts()

68083895      110
235856055      99
51306679       91
2815114810     77
16777343       60
             ... 
3992223082      1
2689929508      1
809700314       1
3429811483      1
139485476       1
Name: fea_hash, Length: 402980, dtype: int64

In [16]:
# Summary of 'fea_hash': row count, number of unique values, the most frequent
# value and how often it occurs
features['fea_hash'].describe()

count       500000
unique      402980
top       68083895
freq           110
Name: fea_hash, dtype: object

In [17]:
# Encode 'fea_hash' through the length of its entries: the new 'fea_hash_len'
# column holds the number of characters in the string form of every value.
# The same column is added to the training and the test frame.
for frame in (features, test_features):
    frame['fea_hash_len'] = [len(str(code)) for code in frame['fea_hash']]

In [18]:
# Distribution of the 'fea_hash' string lengths
features['fea_hash_len'].value_counts()

10    378925
9     108904
8      11235
7        740
6         93
38        37
39        28
37        16
5         11
36         3
1          2
32         2
33         2
30         1
31         1
Name: fea_hash_len, dtype: int64

In [19]:
# Clean 'fea_hash': entries whose string form is longer than 16 characters are
# the abnormal, low-frequency codes and are replaced by 0; every other entry
# is cast to int.
def _clean_fea_hash(code):
    """Return 0 for codes longer than 16 characters, otherwise ``int(code)``."""
    if len(str(code)) > 16:
        return 0
    return int(code)


features['fea_hash'] = features['fea_hash'].map(_clean_fea_hash)
test_features['fea_hash'] = test_features['fea_hash'].map(_clean_fea_hash)

### 4.4 'fea1_hash'

Similar to 'fea_hash', 'fea1_hash' is also a user characteristic code (its specific physical meaning is omitted) and needs to be encoded into something useful.

In [20]:
# Encode 'fea1_hash' through the length of its entries: the new
# 'fea1_hash_len' column holds the number of characters in the string form of
# every value.
def _str_length(code):
    """Return the number of characters in ``str(code)``."""
    return len(str(code))


features['fea1_hash_len'] = features['fea1_hash'].apply(_str_length)
test_features['fea1_hash_len'] = test_features['fea1_hash'].apply(_str_length)

In [21]:
# Distribution of the 'fea1_hash' string lengths
features['fea1_hash_len'].value_counts()

10    391669
9      99347
8       8977
7          6
5          1
Name: fea1_hash_len, dtype: int64

In [22]:
# Clean 'fea1_hash': entries whose string form is shorter than 8 characters
# are the abnormal, low-frequency codes and are replaced by 0; every other
# entry is cast to int.
def _clean_fea1_hash(code):
    """Return 0 for codes shorter than 8 characters, otherwise ``int(code)``."""
    if len(str(code)) < 8:
        return 0
    return int(code)


for frame in (features, test_features):
    frame['fea1_hash'] = frame['fea1_hash'].map(_clean_fea1_hash)

## 5 Training

In [23]:
# use LGBM to train
import lightgbm as lgb

# Columns left out of this baseline model. Listed once so the training and
# test matrices are always built from the same exclusion list.
excluded_columns = ['osv', 'lan', 'timestamp', 'version']

# Take the target from `features` itself, so X and y come from the same frame
# and are guaranteed to share the same rows in the same order.
X_train = features.drop(excluded_columns + ['label'], axis = 1)
y_train = features['label']

# Select the test columns by name, in the training column order. This raises a
# KeyError if the test frame lacks a training feature, instead of silently
# feeding the model misaligned columns.
X_test = test_features[X_train.columns]

model = lgb.LGBMClassifier()
# model training
model.fit(X_train, y_train)

# predicted class label for every test row
result = model.predict(X_test)
result

array([0, 1, 0, ..., 1, 1, 1])

In [24]:
# Assemble the submission table: the 'sid' column of the raw test set paired
# with the predicted label of each row
res = pd.DataFrame({'sid': test['sid'], 'label': result})

In [25]:
# Write the results to the CSV file 'baseline.csv' (index not written), then
# display the table
res.to_csv('baseline.csv', index = False)
res

Unnamed: 0,sid,label
0,1440682,0
1,1606824,1
2,1774642,0
3,1742535,0
4,1689686,1
...,...,...
149995,1165373,1
149996,1444115,1
149997,1134378,1
149998,1700238,1
