In [None]:
import random
import warnings

import numpy as np

# Seed NumPy's global RNG and Python's `random` module with one shared value
# so code that draws from those generators repeats across runs.
seed = 42
np.random.seed(seed)
random.seed(seed)
import matplotlib.pyplot as plt
import missingno as mn
import pandas as pd
import seaborn as sns
from autoxgb import AutoXGB
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

In [None]:
# Read the three input files: training data, test data and the sample
# submission (used later for its ID column).
train, test, ss = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'SampleSubmission.csv')
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Preview the sample submission to see the expected output layout.
ss.head()

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Column dtypes and non-null counts for the test frame.
test.info()

In [None]:
# missingno matrix plot of the training frame, to see where values are missing.
mn.matrix(train)

In [None]:
# missingno matrix plot of the test frame, to see where values are missing.
mn.matrix(test)

In [None]:
# Remove the 'ID' column from both frames before building features; the
# submission files take their IDs from `ss` instead.
train = train.drop(columns='ID')
test = test.drop(columns='ID')

In [None]:
# Summary statistics for the columns from 'Sensor1_PM2.5' through
# 'Offset_fault' (one row per column), with a bar on the mean and colour
# gradients on the standard deviation and the median.
(
    train.loc[:, 'Sensor1_PM2.5':'Offset_fault']
    .describe()
    .T
    .style
    .bar(subset=['mean'], color='#206ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

In [None]:
# Class balance of the target: number of rows per Offset_fault value.
# The column is passed by keyword because seaborn >= 0.12 treats the only
# positional argument as `data`, not as the variable to count.
sns.countplot(x='Offset_fault', data=train);

In [None]:
# Extract day, month, year, hour, minute and day of week from the Datetime column
def converte_dates(df):
    """Parse the ``Datetime`` column and add calendar-part columns to ``df``.

    The frame is modified in place: ``Datetime`` is overwritten with its
    ``pd.to_datetime`` conversion, and the columns ``Datetime_day``,
    ``Datetime_month``, ``Datetime_year``, ``Datetime_hour``,
    ``Datetime_minute`` and ``Datetime_dayofweek`` are appended in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Datetime`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    df['Datetime'] = pd.to_datetime(df['Datetime'])
    stamps = df['Datetime'].dt

    # New column name -> attribute of the ``.dt`` accessor that supplies it.
    # Dict order determines the order in which the columns are appended.
    derived_columns = {
        'Datetime_day': 'day',
        'Datetime_month': 'month',
        'Datetime_year': 'year',
        'Datetime_hour': 'hour',
        'Datetime_minute': 'minute',
        'Datetime_dayofweek': 'weekday',
    }
    for column_name, accessor_attr in derived_columns.items():
        df[column_name] = getattr(stamps, accessor_attr)

    return df


# Add the Datetime_* columns to the training frame first, then the test frame.
train, test = converte_dates(train), converte_dates(test)

In [None]:
# Inspect the training frame now that the Datetime_* columns were added.
train.head()

In [None]:
# (rows, columns) of the training and test frames after adding the date parts.
train.shape,test.shape

In [None]:
# Inspect the test frame now that the Datetime_* columns were added.
test.head()

In [None]:
# The raw 'Datetime' column is removed from both frames once its parts have
# been extracted into the Datetime_* columns.
train = train.drop(columns='Datetime')
test = test.drop(columns='Datetime')

In [None]:
# Check the training frame after the raw Datetime column was dropped.
train.head()

In [None]:
# Separate the target column ('Offset_fault') from the predictor columns.
y = train['Offset_fault']
X = train.drop(columns='Offset_fault')

In [None]:
# Preprocessing: replace missing values with the column mean, then
# standardize each column.
pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])

# Learn the imputation means and scaling statistics from the training
# features only.
X = pd.DataFrame(columns=X.columns, data=pipeline.fit_transform(X))
# Apply the already-fitted pipeline to the test set. Calling fit_transform
# here would re-estimate the means/scales from `test`, so the models would be
# trained and used for prediction on differently scaled features.
test = pd.DataFrame(columns=test.columns, data=pipeline.transform(test))

In [None]:
# Preview the training features after imputation and scaling.
X.head()

In [None]:
# Preview the test features after they went through the same pipeline object.
test.head()

In [None]:
# Hold out 5% of the training rows for validation, stratified on the target
# so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    stratify=y,
    random_state=42,
)

In [None]:
# Random forest baseline, scored on the held-out split.
# This has to run: the submission cell that follows calls `rf.predict(test)`
# and fails with a NameError on a fresh kernel if `rf` is never fitted.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
pred = rf.predict(X_test)

accuracy_score(y_test, pred)

In [None]:
# Predict the test frame with the random forest and write the submission file,
# taking the IDs from the sample submission.
pred_rf = rf.predict(test)

sub = pd.DataFrame({'ID': ss['ID'], 'Offset_fault': pred_rf})
sub.to_csv('rfc.csv', index=False)

In [None]:
# CatBoost classifier (verbose=0), fitted on the training split and scored on
# the held-out split.
cb = CatBoostClassifier(verbose=0)
cb.fit(X_train, y_train)
pred = cb.predict(X_test)

accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# Predict the test frame with CatBoost and write the submission file, taking
# the IDs from the sample submission.
pred_cb = cb.predict(test)

sub = pd.DataFrame({'ID': ss['ID'], 'Offset_fault': pred_cb})
sub.to_csv('cb.csv', index=False)

In [None]:
# XGBoost classifier with default settings, fitted on the training split.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)

# "Test" below is the held-out split from train_test_split, not the `test`
# frame that is used for the submission.
train_accuracy = xgb.score(X_train, y_train)
holdout_accuracy = xgb.score(X_test, y_test)
print(f"Training Accuracy: {train_accuracy:0.2f}")
print(f"Test Accuracy: {holdout_accuracy:0.2f}")

In [None]:
# Predict the test frame with XGBoost and write the submission file, taking
# the IDs from the sample submission.
pred_xgb = xgb.predict(test)

sub = pd.DataFrame({'ID': ss['ID'], 'Offset_fault': pred_xgb})
sub.to_csv('xgbc.csv', index=False)