In [None]:
import random
import warnings

import matplotlib.pyplot as plt
import missingno as mn
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Plot styling used by every figure in the notebook.
plt.style.use('ggplot')

# Seed NumPy's and Python's global random number generators.
random_state = 42
random.seed(random_state)
np.random.seed(random_state)

# NOTE: this silences *all* warnings for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Read the train and test CSV files from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Keep the test IDs aside; they are needed later to build the submission files.
ss = test['ID']

In [None]:
train.head()

In [None]:
test.head()

In [None]:
# ss.head()

In [None]:
train.info()

In [None]:
test.info()

In [None]:
mn.matrix(train)

In [None]:
mn.matrix(test)

In [None]:
# Remove the ID column from both frames (the test IDs were already saved in `ss`).
train, test = (frame.drop(columns='ID') for frame in (train, test))

In [None]:
# Summary statistics for the columns from Sensor1_PM2.5 through Offset_fault,
# one row per column, with the mean, std and median cells highlighted.
(
    train.loc[:, 'Sensor1_PM2.5':'Offset_fault']
    .describe()
    .T
    .style
    .bar(subset=['mean'], color='#206ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

In [None]:
# Visualize how many rows fall in each Offset_fault class.
# The column is passed by keyword: recent seaborn releases interpret the first
# positional argument as `data`, not as the `x` variable.
sns.countplot(x='Offset_fault', data=train)

In [None]:
# Extract day, month year and hour from the Datetime column
# day
def converte_dates(df):
    """Parse the ``Datetime`` column and add its components as new columns.

    The frame is modified in place and also returned. ``Datetime`` is converted
    with ``pd.to_datetime`` and the following columns are added, in this order:
    ``Datetime_day``, ``Datetime_month``, ``Datetime_year``, ``Datetime_hour``,
    ``Datetime_minute`` and ``Datetime_dayofweek``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Datetime`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the extra columns.
    """
    df['Datetime'] = pd.to_datetime(df['Datetime'])
    stamps = df['Datetime'].dt

    # (new column suffix, attribute read from the .dt accessor), in output order.
    components = (
        ('day', 'day'),
        ('month', 'month'),
        ('year', 'year'),
        ('hour', 'hour'),
        ('minute', 'minute'),
        ('dayofweek', 'weekday'),
    )
    for suffix, attribute in components:
        df[f'Datetime_{suffix}'] = getattr(stamps, attribute)

    return df


train = converte_dates(train)
test = converte_dates(test)

In [None]:
def more_features(df):
    """Add boolean part-of-day flags derived from ``Datetime_hour``.

    The frame is modified in place and also returned. Hour ranges:
    ``is_morning`` 6-11, ``is_afternoon`` 12-17, ``is_evening`` 18-23 and
    ``is_night`` 0-5 (both ends inclusive).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Datetime_hour`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the four flag columns added.
    """
    hour = df['Datetime_hour']

    # Flags are built up front, then assigned in the original column order.
    day_parts = {
        'is_morning': hour.ge(6) & hour.lt(12),
        'is_afternoon': hour.ge(12) & hour.lt(18),
        'is_evening': hour.ge(18) & hour.le(23),
        'is_night': hour.ge(0) & hour.lt(6),
    }
    for column, flag in day_parts.items():
        df[column] = flag

    return df


train = more_features(train)
test = more_features(test)

In [None]:
# Pairwise correlations between the numeric/boolean columns.
# `train` still contains the datetime64 `Datetime` column at this point (it is
# dropped in a later cell), so the numeric columns are selected explicitly
# instead of relying on `corr()` to discard non-numeric data.
corr = train.select_dtypes(include=['number', 'bool']).corr()
corr.style.background_gradient()

In [None]:
# Box plots of the two PM2.5 sensors, temperature and humidity.
# NOTE: the rcParams change applies to every later figure as well.
plt.rcParams['figure.figsize'] = (15, 8)
train.boxplot(
    column=['Sensor1_PM2.5', 'Sensor2_PM2.5', 'Temperature', 'Relative_Humidity'],
)
plt.show()

From the plot above we can see that **Sensor1_PM2.5** and **Sensor2_PM2.5** contain a large number of extreme outliers. I first tried dropping them, but that would lose too much data, so I decided to keep them as they are and scale the features later on.

In [None]:
train.shape,test.shape

In [None]:
# The raw timestamp is removed from both frames; its components were already
# extracted into the Datetime_* columns.
train, test = (frame.drop(columns='Datetime') for frame in (train, test))

In [None]:
# feature_names = train.drop(['Offset_fault','Datetime_month','Datetime_year'], axis=1).columns
# for i in range(len(feature_names)-1):
#     figure = plt.figure()
#     ax = sns.boxplot(x='Offset_fault', y=feature_names[i], data=train)

In [None]:
train.shape

In [None]:
# Separate the target column (Offset_fault) from the feature columns.
y = train['Offset_fault']
X = train.drop(columns='Offset_fault')



Here I count the NaN values in each row and store the count as a new feature.<br>
I do this because missingness can itself carry signal, so we pass it to the model explicitly.


In [None]:
def feature_engineering(df):
    """Add per-row summary features.

    The frame is modified in place and also returned. Two columns are added:

    * ``NaN_row`` - number of missing values in the row.
    * ``std`` - row-wise standard deviation. It is computed after ``NaN_row``
      has been added, so ``NaN_row`` is part of that calculation.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two extra columns.
    """
    missing_per_row = df.isna().sum(axis=1)
    df['NaN_row'] = missing_per_row

    # Must run after the assignment above to keep NaN_row in the calculation.
    row_spread = df.std(axis=1)
    df['std'] = row_spread

    return df

X = feature_engineering(X)
test = feature_engineering(test)

# added code

In [None]:
def feature_engineering(df):
    """Add group-mean and sensor-sum features.

    NOTE: this reuses the name ``feature_engineering`` that an earlier cell
    defined for the NaN_row/std features, so from here on the name refers to
    this function.

    The frame is modified in place and also returned. Columns added:

    * ``AverageTemperature`` - mean ``Temperature`` of the rows sharing the
      same ``Relative_Humidity`` value.
    * ``AverageHumidity`` - mean ``Relative_Humidity`` of the rows sharing the
      same ``Temperature`` value.
    * ``Total_sensor`` - ``Sensor1_PM2.5 + Sensor2_PM2.5`` (missing if either
      reading is missing).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Temperature``, ``Relative_Humidity``, ``Sensor1_PM2.5``
        and ``Sensor2_PM2.5`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three extra columns.
    """
    # (new column, grouping column, averaged column)
    group_means = (
        ('AverageTemperature', 'Relative_Humidity', 'Temperature'),
        ('AverageHumidity', 'Temperature', 'Relative_Humidity'),
    )
    for new_column, key, value in group_means:
        df[new_column] = df.groupby(key)[value].transform('mean')

    df['Total_sensor'] = df['Sensor1_PM2.5'].add(df['Sensor2_PM2.5'])

    return df

X = feature_engineering(X)
test = feature_engineering(test)

X.head()

In [None]:
X.shape

### Pipeline for data transformation 

In [None]:
# Preprocessing: fill missing values with the column mean, then standardise.
pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
])

# The pipeline is fitted on the training features and the fitted statistics are
# reused for the test features. Each frame keeps its original index so that the
# rows of `X` stay aligned with `y` in every index-based operation.
# NOTE(review): fitting happens before the train/validation split below, so the
# validation rows contribute to the imputation/scaling statistics.
X = pd.DataFrame(pipeline.fit_transform(X), columns=X.columns, index=X.index)
test = pd.DataFrame(pipeline.transform(test), columns=test.columns, index=test.index)

In [None]:
# Annotated correlation heatmap of the columns currently in `train`.
fig, ax = plt.subplots(figsize=(24, 10))
sns.heatmap(
    train.corr(),
    cmap='YlGnBu',
    vmin=-1.0,
    vmax=1.0,
    annot=True,
    annot_kws={"size": 15},
    ax=ax,
)
ax.set_title('Correlation between numeric features')
plt.show()

Sensor1 and Sensor2 are highly correlated. Normally we would drop one of them, but since I combined them into a total-sensor feature, I am keeping both for now.

In [None]:
# Hold out 2% of the rows for validation; `random_state` is the notebook seed (42).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.02,
    random_state=random_state,
)

In [None]:
# Hyper-parameters for the LightGBM classifier.
# NOTE(review): with max_depth=3 a tree can hold at most 2**3 = 8 leaves, so
# num_leaves=450 is presumably never the binding limit - confirm.
# NOTE(review): LightGBM appears to apply `subsample` only when
# `subsample_freq` > 0, which is not set here - confirm against the docs.
params = dict(
    n_estimators=1040,
    max_depth=3,
    reg_lambda=0.16661201237472856,
    colsample_bytree=0.9064439932687255,
    num_leaves=450,
    min_child_samples=21,
    subsample=0.5564713817638391,
    random_state=0,
)
lgb = LGBMClassifier(**params)

# Fit on the training part of the hold-out split.
lgb.fit(X_train, y_train)

In [None]:
y_pred = lgb.predict(X_test)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
print(f"Training Accuracy: {lgb.score(X_train, y_train):0.3f}")
print(f"Test Accuracy: {lgb.score(X_test, y_test):0.3f}")

In [None]:
# Predict the test set and write the submission file (ID, Offset_fault).
pred_lgb = lgb.predict(test)
sub = pd.DataFrame({'ID': ss}).assign(Offset_fault=pred_lgb)
sub.to_csv('lgb.csv', index=False)

In [None]:
# Baseline: LightGBM with default hyper-parameters.
# NOTE: this rebinds `lgb`, replacing the model fitted in the earlier cell.
lgb = LGBMClassifier()

# Fit on the DataFrame (not `.values`) so training and prediction receive the
# same input type and the model keeps the feature names.
lgb.fit(X_train, y_train)
y_pred = lgb.predict(X_test)
print(f'Accuracy score on the X_test is: {accuracy_score(y_test, y_pred)}')

In [None]:
# Feature importance
impo_df = pd.DataFrame({'feature': X.columns, 'importance': lgb.feature_importances_}).set_index('feature').sort_values(by = 'importance', ascending = False)
impo_df = impo_df[:13].sort_values(by = 'importance', ascending = True)
impo_df.plot(kind = 'barh', figsize = (10, 10), color = 'purple')
plt.legend(loc = 'center right')
plt.title('Bar chart showing feature importance', color = 'indigo', fontsize = 14)
plt.xlabel('Features', fontsize = 12, color = 'indigo')
plt.show()

In [None]:
X.columns.to_list()

In [None]:
# Columns excluded from the next model; the same columns are removed from the
# test features so both sets match.
to_drop = ['NaN_row', 'Datetime_year']
test_df = test.drop(columns=to_drop)

Now let's use a stratified split to make sure the class labels are evenly distributed between the training and validation sets.

In [None]:
# Stratified 95/5 split on the reduced feature set; `random_state` is the
# notebook seed (42).
X_train, X_test, y_train, y_test = train_test_split(
    X.drop(columns=to_drop),
    y,
    test_size=0.05,
    stratify=y,
    random_state=random_state,
)

In [None]:
# LightGBM with balanced class weights, trained on the stratified split.
# NOTE: this rebinds `lgb`, replacing the previously fitted model.
lgb = LGBMClassifier(objective='binary', learning_rate=0.1, class_weight='balanced')

# Fit on the DataFrame (not `.values`) so that training, validation and the
# later test-set prediction all receive the same input type.
lgb.fit(X_train, y_train)
y_pred = lgb.predict(X_test)
print(f'Accuracy score on the X_test is: {accuracy_score(y_test, y_pred)}')

In [None]:
# Predict the reduced test features and write the second submission file.
pred_lgb = lgb.predict(test_df)
sub = pd.DataFrame({'ID': ss}).assign(Offset_fault=pred_lgb)
sub.to_csv('lgb_model.csv', index=False)

# trying other things