In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

In [19]:
train= pd.read_csv('/kaggle/input/tabular-playground-series-aug-2022/train.csv')
test= pd.read_csv('../input/tabular-playground-series-aug-2022/test.csv')

In [None]:
train.head()

In [30]:
float_cols= test.select_dtypes(include=['float','int']).columns

In [4]:
cat_cols= train.select_dtypes(include=['object']).columns
cat_cols

Index(['product_code', 'attribute_0', 'attribute_1'], dtype='object')

**Feature Engineering**

In [None]:
train.isnull().sum()

Lots of missing values. There are may ways we could tackle this. https://www.kaggle.com/code/azminetoushikwasi/ml-foundation-imputation-all-techniques

In [None]:
null_cols = train.columns[train.isnull().any()]

In [None]:
null_cols

In [None]:
null= train[null_cols].select_dtypes(include=['float','int'])
null

In [None]:
plt.subplots(figsize=(25,35))
for i, column in enumerate(null.columns):
    plt.subplot(6,4,i+1)
    sns.histplot(data=null, x=column)
    plt.title(column)

All of the variables with missing values appear to be normally distributed. In this case, the simplest way would be to replace the missing values with the mean. 

In [15]:
from sklearn.model_selection import GroupKFold

In [16]:
kf = GroupKFold(n_splits=5) 
for fold, (idx_tr, idx_va) in enumerate(kf.split(train, train.failure, train.product_code)):
    print(fold,(idx_tr, idx_va))

0 (array([    0,     1,     2, ..., 26567, 26568, 26569]), array([10350, 10351, 10352, ..., 16112, 16113, 16114]))
1 (array([    0,     1,     2, ..., 21224, 21225, 21226]), array([21227, 21228, 21229, ..., 26567, 26568, 26569]))
2 (array([    0,     1,     2, ..., 26567, 26568, 26569]), array([ 5100,  5101,  5102, ..., 10347, 10348, 10349]))
3 (array([    0,     1,     2, ..., 26567, 26568, 26569]), array([16115, 16116, 16117, ..., 21224, 21225, 21226]))
4 (array([ 5100,  5101,  5102, ..., 26567, 26568, 26569]), array([   0,    1,    2, ..., 5097, 5098, 5099]))


In [20]:
for fold, (idx_tr, idx_va) in enumerate(kf.split(train, train.failure, train.product_code)):
    X_train = train.iloc[idx_tr][test.columns]
    X_valid = train.iloc[idx_va][test.columns]
    y_train = train.iloc[idx_tr].failure
    y_valid = train.iloc[idx_va].failure
    
for df in [X_train, X_valid, test]:
    pd.get_dummies(data=df, columns=['attribute_0', 'attribute_1'], drop_first=True)

In [32]:
imputer= SimpleImputer(strategy='mean')
imputer= imputer.fit(train[float_cols])
for df in [X_train, X_valid, test]:
    df[float_cols]= imputer.transform(df[float_cols])

In [33]:
X_train

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
5100,5100.0,B,81.46,material_5,material_5,8.0,8.0,12.0,12.0,4.0,...,18.179,12.261000,14.961,18.640,11.267,16.658,15.403,14.786,17.417000,761.784
5101,5101.0,B,92.59,material_5,material_5,8.0,8.0,7.0,3.0,8.0,...,19.201,11.845000,18.434,21.331,12.767,14.080,17.255,15.344,16.296000,569.667
5102,5102.0,B,179.82,material_5,material_5,8.0,8.0,5.0,11.0,10.0,...,19.649,12.171000,17.084,18.645,10.618,18.361,16.001,18.012,18.445000,589.286
5103,5103.0,B,129.10,material_5,material_5,8.0,8.0,3.0,3.0,9.0,...,18.625,10.523000,17.849,19.328,10.738,18.896,17.661,15.544,16.645000,646.248
5104,5104.0,B,130.43,material_5,material_5,8.0,8.0,13.0,14.0,17.0,...,19.822,12.318000,16.095,22.801,11.450,15.376,17.048,14.007,16.460727,638.087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26565,26565.0,E,158.95,material_7,material_6,6.0,9.0,6.0,16.0,4.0,...,19.354,11.430725,12.177,17.942,10.112,15.795,18.572,16.144,16.460727,729.131
26566,26566.0,E,146.02,material_7,material_6,6.0,9.0,10.0,12.0,8.0,...,19.563,11.242000,14.179,20.564,10.234,14.450,14.322,13.146,16.471000,853.924
26567,26567.0,E,115.62,material_7,material_6,6.0,9.0,1.0,10.0,1.0,...,19.279,11.407000,16.437,17.476,8.668,15.069,16.599,15.590,14.065000,750.364
26568,26568.0,E,106.38,material_7,material_6,6.0,9.0,2.0,9.0,4.0,...,19.358,11.392000,17.064,17.814,14.928,16.273,15.485,13.624,12.865000,730.156


In [35]:
for df in [X_train, X_valid, test]:
    df.drop('id',axis=1) 

In [45]:
features = [f for f in X_train.columns if f != 'product_code']

In [38]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [46]:
val_scores=[]
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train[features], y_train)
val_scores.append(clf.score(X_valid[features2], y_valid))

sorted_idx = clf.feature_importances_.argsort()
plt.barh(X_train.feature_names[sorted_idx], clf.feature_importances_[sorted_idx])
plt.xlabel("Random Forest Feature Importance")

ValueError: could not convert string to float: 'material_5'