In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.seasonal import seasonal_decompose

In [2]:
#load training data
train_df = pd.read_csv('data/train.csv')

#load feature data
features_df = pd.read_csv('data/features.csv')

### General overview of the data

In [3]:
train_df.head(5)

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,...,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id
0,0,0.0,0.009916,0.014079,0.008773,0.00139,0.00627,1,-1.872746,-2.191242,...,,1.168391,8.313583,1.782433,14.018213,2.653056,12.600292,2.301488,11.445807,0
1,0,16.673515,-0.002828,-0.003226,-0.007319,-0.011114,-0.009792,-1,-1.349537,-1.704709,...,,-1.17885,1.777472,-0.915458,2.831612,-1.41701,2.297459,-1.304614,1.898684,1
2,0,0.0,0.025134,0.027607,0.033406,0.03438,0.02397,-1,0.81278,-0.256156,...,,6.115747,9.667908,5.542871,11.671595,7.281757,10.060014,6.638248,9.427299,2
3,0,0.0,-0.00473,-0.003273,-0.000461,-0.000476,-0.0032,-1,1.174378,0.34464,...,,2.838853,0.499251,3.033732,1.513488,4.397532,1.266037,3.856384,1.013469,3
4,0,0.138531,0.001252,0.002165,-0.001215,-0.006219,-0.002604,1,-3.172026,-3.093182,...,,0.34485,4.101145,0.614252,6.623456,0.800129,5.233243,0.362636,3.926633,4


In [4]:
features_df.head(5)

Unnamed: 0,feature,tag_0,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,tag_8,...,tag_19,tag_20,tag_21,tag_22,tag_23,tag_24,tag_25,tag_26,tag_27,tag_28
0,feature_0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,feature_1,False,False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
2,feature_2,False,False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
3,feature_3,False,False,False,False,False,False,True,False,True,...,False,False,False,False,False,False,False,False,False,False
4,feature_4,False,False,False,False,False,False,True,False,True,...,False,False,False,False,False,False,False,False,False,False


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2390491 entries, 0 to 2390490
Columns: 138 entries, date to ts_id
dtypes: float64(135), int64(3)
memory usage: 2.5 GB


In [6]:
features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 30 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   feature  130 non-null    object
 1   tag_0    130 non-null    bool  
 2   tag_1    130 non-null    bool  
 3   tag_2    130 non-null    bool  
 4   tag_3    130 non-null    bool  
 5   tag_4    130 non-null    bool  
 6   tag_5    130 non-null    bool  
 7   tag_6    130 non-null    bool  
 8   tag_7    130 non-null    bool  
 9   tag_8    130 non-null    bool  
 10  tag_9    130 non-null    bool  
 11  tag_10   130 non-null    bool  
 12  tag_11   130 non-null    bool  
 13  tag_12   130 non-null    bool  
 14  tag_13   130 non-null    bool  
 15  tag_14   130 non-null    bool  
 16  tag_15   130 non-null    bool  
 17  tag_16   130 non-null    bool  
 18  tag_17   130 non-null    bool  
 19  tag_18   130 non-null    bool  
 20  tag_19   130 non-null    bool  
 21  tag_20   130 non-null    bool  
 22  ta

In [7]:
# print the shape of train and features data
print('shape of train data: ', train_df.shape)
print('shape of feature_hot data: ', features_hot.shape)
print('brief description of the data:\n')
train_df.describe().transpose()

shape of train data:  (2390491, 138)


NameError: name 'features_hot' is not defined

In [None]:
#percentage of missing value in train_df per column
train_columns = train_df.columns.values
percentage = []
for column in train_columns:
    total_values = train_df['feature_0'].values.shape[0]
    nan_values = train_df[column].isnull().sum()
    percentage.append((column,nan_values/total_values * 100))

#dataframe, presenting the percentage of missing values
labels = []
values = []
for label,value in percentage:
    labels.append(label)
    values.append(value)
missing_df = pd.DataFrame({'missing%':values},labels).sort_values(by='missing%',ascending=True)

In [None]:
missing_df.tail(5)

In [None]:
#plotting columns with missing percentage above 2.5%
missing_df.loc[missing_df['missing%'] >=2.5].plot.bar(figsize=(10,8))
plt.show()

In [None]:
missing_df.loc[(missing_df['missing%'] <=2.5) & (missing_df['missing%'] != 0)].plot.bar(figsize=(10,8))
plt.show()

In [None]:
#Label encode features (True---> 1 and Flase---> 0)
feature_column = features_df[['feature']]
features_hot = features_df.apply(LabelEncoder().fit_transform)
features_hot.drop(columns=['feature'],inplace=True)
features_hot = pd.concat([feature_column,features_hot],axis=1)
features_hot.tail(5)

### Distribution plot of all the columns in train data