# Drive Mount

In [1]:
# Mount Google Drive into the Colab filesystem; the CSV paths used below live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Library Imports

In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Preprocess

In [3]:
# Location of the exam feature table on the shared drive.
X_exam_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_exam.csv"

# Read the whole CSV into memory with pandas' default parsing options.
X_exam = pd.read_csv(filepath_or_buffer = X_exam_file_path)

In [4]:
# Peek at the first five rows to sanity-check the loaded columns.
X_exam.head(n = 5)

Unnamed: 0,gender,age_code,region_code,c20220101,c20220102,c20220103,c20220104,c20220105,c20220106,c20220107,...,t20220817,t20220818,t20220819,t20220820,t20220821,t20220822,t20220823,t20220824,t20220825,t20220826
0,2,4,4,,,,,,,,...,,,,,,,,,,
1,2,10,15,,,,,,,,...,,,,,,,,,,
2,2,4,2,,,,,,,,...,,,,,,,,,,
3,1,8,1,,,,,,,,...,,,,,,,,,,
4,2,8,2,,,,,,,,...,,,,,,,,,,


## Missing Value Handling

In [5]:
# Column-by-column summary with non-null counts, to see how sparse each column is.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2 and removed in 2.0.
X_exam.info(verbose = True, show_counts = True)

  X_exam.info(verbose = True, null_counts = True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 717 columns):
 #    Column       Non-Null Count   Dtype  
---   ------       --------------   -----  
 0    gender       200000 non-null  int64  
 1    age_code     200000 non-null  int64  
 2    region_code  200000 non-null  int64  
 3    c20220101    23715 non-null   float64
 4    c20220102    23122 non-null   float64
 5    c20220103    36589 non-null   float64
 6    c20220104    32773 non-null   float64
 7    c20220105    36020 non-null   float64
 8    c20220106    31705 non-null   float64
 9    c20220107    31065 non-null   float64
 10   c20220108    22177 non-null   float64
 11   c20220109    21822 non-null   float64
 12   c20220110    39134 non-null   float64
 13   c20220111    33653 non-null   float64
 14   c20220112    32429 non-null   float64
 15   c20220113    31769 non-null   float64
 16   c20220114    33327 non-null   float64
 17   c20220115    23499 non-null   float64
 18   c2

In [6]:
# Replace every missing entry with 0.
X_exam = X_exam.fillna(value = 0)

# Guard: no missing value may remain anywhere in the frame after the fill.
assert not X_exam.isna().any().any()

# Calculate the probability of at least one login within 5 days

In [7]:
# The demographic attributes are not used in the login-probability calculation; remove them.
demographic_columns = ['gender', 'age_code', 'region_code']
X_exam = X_exam.drop(columns = demographic_columns)
X_exam

Unnamed: 0,c20220101,c20220102,c20220103,c20220104,c20220105,c20220106,c20220107,c20220108,c20220109,c20220110,...,t20220817,t20220818,t20220819,t20220820,t20220821,t20220822,t20220823,t20220824,t20220825,t20220826
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0
199997,0.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199998,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Keep only the columns whose name begins with 'c' (the c<YYYYMMDD> columns).
# A prefix test is used instead of a substring search so that a column that merely
# contains the letter 'c' somewhere in its name is never selected by accident.
columns_login_num = [c for c in X_exam.columns if c.startswith('c')]
X_exam = X_exam[columns_login_num]
X_exam

Unnamed: 0,c20220101,c20220102,c20220103,c20220104,c20220105,c20220106,c20220107,c20220108,c20220109,c20220110,...,c20220817,c20220818,c20220819,c20220820,c20220821,c20220822,c20220823,c20220824,c20220825,c20220826
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0
199997,0.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
199998,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [9]:
# Collapse each value to a 0/1 flag: any non-zero value becomes 1, zero stays 0.
X_exam = (
    X_exam
    .astype(bool)
    .astype(int)
)
X_exam

Unnamed: 0,c20220101,c20220102,c20220103,c20220104,c20220105,c20220106,c20220107,c20220108,c20220109,c20220110,...,c20220817,c20220818,c20220819,c20220820,c20220821,c20220822,c20220823,c20220824,c20220825,c20220826
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199996,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,1,0,0,0,0,1
199997,0,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
199998,0,0,0,0,1,0,0,1,1,1,...,0,0,0,0,0,1,0,0,0,0


In [10]:
# Per-row login rate: the share of day columns whose flag is 1.
# The day columns are found by excluding the columns this notebook derives, so that
# re-running this cell does not fold 'login_proba' (or 'login_proba_in_5days') into
# both the row sum and the denominator.
derived_columns = ['login_proba', 'login_proba_in_5days']
day_columns = [c for c in X_exam.columns if c not in derived_columns]
X_exam['login_proba'] = X_exam[day_columns].sum(axis = 1) / len(day_columns)
X_exam['login_proba']

0         0.058824
1         0.008403
2         0.130252
3         0.016807
4         0.004202
            ...   
199995    0.067227
199996    0.231092
199997    0.138655
199998    0.075630
199999    0.029412
Name: login_proba, Length: 200000, dtype: float64

In [11]:
# Probability of at least one login over 5 days, treating each day as an independent
# trial with success probability `login_proba`: 1 - P(no login on any of the 5 days).
no_login_proba = 1 - X_exam['login_proba']
X_exam['login_proba_in_5days'] = 1 - no_login_proba ** 5

# Copy the result into a standalone single-column frame with a fresh default index.
X_exam_login_proba_in_5days = pd.DataFrame(
    {'login_proba_in_5days': X_exam['login_proba_in_5days'].to_numpy()}
)
X_exam_login_proba_in_5days

Unnamed: 0,login_proba_in_5days
0,0.261492
1,0.041317
2,0.502301
3,0.081256
4,0.020833
...,...
199995,0.293877
199996,0.731236
199997,0.525884
199998,0.325117


In [12]:
# Persist the 5-day login probabilities to the shared drive.
# The index is written too (pandas default), as an unnamed first column.
X_exam_login_proba_in_5days.to_csv(
    path_or_buf = "/content/drive/Shareddrives/Intro-data-science/data/X_exam_login_proba_in_5days.csv"
)