In [26]:
# Import the required libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Read the train and test CSV files from the working directory.
train_df = pd.read_csv('./train.csv')
test_df=pd.read_csv('./test.csv')

In [3]:
print(train_df.shape)
print(test_df.shape)

(4209, 378)
(4209, 377)


In [4]:
train_df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [5]:
test_df.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [6]:
# Feature columns are the ones whose name contains 'X'; report how many there
# are in the training set and how their dtypes are distributed.
col_train = [name for name in train_df.columns if 'X' in name]
n_train_features = len(col_train)
print('Number of features: {} in train set'.format(n_train_features))
print('Feature types:')
train_df[col_train].dtypes.value_counts()

Number of features: 376 in train set
Feature types:


int64     368
object      8
dtype: int64

In [7]:
# Same exploration for the test set: select the feature columns (name contains
# 'X') and show how their dtypes are distributed. The dtypes are read from
# test_df itself, not from train_df.
col_test = [a for a in test_df.columns if 'X' in a]
print('Number of features: {} in test set'.format(len(col_test)))
print('Feature types:')
test_df[col_test].dtypes.value_counts()

Number of features: 376 in train set
Feature types:


int64     368
object      8
dtype: int64

In [8]:
def info_features(df,col):
    """Print how many of the given columns are constant, binary or categorical.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect.
    col : iterable of str
        Names of the columns to classify.

    A column with a single distinct value is "constant"; an int64 column with
    exactly two distinct values is "binary"; everything else is counted as
    "categorical". The function only prints and returns None.
    """
    constant, binary, categorical = [], [], []
    for name in col:
        column = df[name]
        n_distinct = len(np.unique(column))
        if n_distinct == 1:
            bucket = constant
        elif n_distinct == 2 and column.dtype == np.int64:
            bucket = binary
        else:
            bucket = categorical
        bucket.append(name)

    summary = 'Constant features: {}   Binary features: {}   Categorical features: {}\n'
    print(summary.format(len(constant), len(binary), len(categorical)))
    print('Constant features:', constant)
    print('Categorical features:', categorical)

In [9]:
# Summarise constant / binary / categorical feature counts for the training set.
info_features(train_df,col_train)

Constant features: 12   Binary features: 356   Categorical features: 8

Constant features: ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']
Categorical features: ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']


In [10]:
# Number of distinct values taken by X0 in the training set.
train_df['X0'].value_counts().count()

47

In [11]:
# Summarise constant / binary / categorical feature counts for the test set.
info_features(test_df,col_test)

Constant features: 5   Binary features: 363   Categorical features: 8

Constant features: ['X257', 'X258', 'X295', 'X296', 'X369']
Categorical features: ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']


# 1. If the variance of any column is zero, remove that column.
# 3. Apply a label encoder.

In [12]:
def drop_var0_columns(df,col):
    """Drop zero-variance columns and integer-encode non-numeric ones, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    col : iterable of str
        Candidate column names. Names that are not (or no longer) present in
        ``df`` are skipped, so running the function again on an already
        cleaned frame does nothing instead of raising ``KeyError``.

    Behaviour
    ---------
    * A column with a single distinct value is dropped.
    * Any remaining non-numeric column is replaced by an integer code: the
      sum of the code points of the characters of each value. This applies
      whatever the number of distinct values, so a two-level string column
      is encoded as well.
    * Numeric columns are left untouched.

    NOTE: the character-sum code is not injective (for example ``'ab'`` and
    ``'ba'`` both map to 195), so distinct categories can share a code.

    Returns None; the number of dropped columns is printed.
    """
    removed = 0
    for v in col:
        if v not in df.columns:
            # Already dropped by an earlier call (or a duplicate name in col).
            continue
        series = df[v]
        if series.nunique(dropna=False) == 1:
            df.drop(v, axis=1, inplace=True)
            removed += 1
        elif not pd.api.types.is_numeric_dtype(series.dtype):
            # Stateless encoding: the same string gets the same code in every frame.
            df[v] = series.apply(lambda value: sum(ord(ch) for ch in value))
    print ("no.of columns removed={}".format(removed))

In [13]:
# Modifies train_df in place: constant columns are dropped and string columns
# are replaced by integer codes.
drop_var0_columns(train_df,col_train)

no.of columns removed=12


In [14]:
train_df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,107,118,213,97,100,117,106,111,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,107,116,215,101,100,121,108,111,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,219,119,110,99,100,120,106,120,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,219,116,110,102,100,120,108,101,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,219,118,110,102,100,104,100,110,...,0,0,0,0,0,0,0,0,0,0


In [15]:
train_df.dtypes.value_counts()

int64      365
float64      1
dtype: int64

In [16]:
# Modifies test_df in place: constant columns are dropped and string columns
# are replaced by integer codes.
drop_var0_columns(test_df,col_test)

no.of columns removed=5


test_df.head()

# 2. Check for null and unique values in the train and test sets.

In [17]:
def check_missing_values(df):
    """Print whether the dataframe contains at least one null entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    The function only prints a one-line verdict and returns None.
    """
    has_missing = df.isnull().any().any()
    message = (
        "There are missing values in the dataframe"
        if has_missing
        else "There are no missing values in the dataframe"
    )
    print(message)

In [18]:
check_missing_values(train_df)

There are no missing values in the dataframe


In [19]:
check_missing_values(test_df)

There are no missing values in the dataframe


# 4. Perform dimensionality reduction.


In [20]:
# Keep only the columns present in both frames, in train_df's column order.
# Index.intersection is used because `Index & Index` no longer performs a set
# intersection in current pandas releases.
# NOTE(review): 'ID' exists in both frames, so it is kept as a feature here -
# confirm that is intended before it is fed to PCA.
common_columns = list(train_df.columns.intersection(test_df.columns))
x_train = train_df[common_columns]
x_test = test_df[common_columns]
y_train = train_df['y']
x_train.head(2)

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,107,118,213,97,100,117,106,111,0,...,0,0,1,0,0,0,0,0,0,0
1,6,107,116,215,101,100,121,108,111,0,...,1,0,0,0,0,0,0,0,0,0


In [21]:
x_test.head(2)

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,219,118,110,102,100,116,97,119,0,...,0,0,0,1,0,0,0,0,0,0
1,2,116,98,202,97,100,98,103,121,0,...,0,0,1,0,0,0,0,0,0,0


In [22]:
# Fit PCA on the training features only, then apply that same fitted
# projection to the test features.
n_comp = 12  # number of principal components to keep
pca = PCA(n_components=n_comp, random_state=420)  # fixed random_state
pca2_results_train = pca.fit_transform(x_train)
pca2_results_test = pca.transform(x_test)

# 5. Predict the test_df values using XGBoost.
