
# BuildPredMain_Classification

## Purpose
This notebook is designed to build and evaluate a classification model using machine learning techniques. The primary objectives are:
1. Load and preprocess the dataset.
2. Explore and visualize the data.
3. Train various classification models.
4. Evaluate the performance of the models.
5. Select the best-performing model for deployment.

## Steps
1. **Data Loading**: Import the dataset and perform initial data inspection.
2. **Data Preprocessing**: Handle missing values, encode categorical variables, and scale numerical features.
3. **Exploratory Data Analysis (EDA)**: Visualize the data to understand distributions and relationships.
4. **Model Training**: Train multiple classification algorithms (e.g., Logistic Regression, Decision Trees, Random Forest).
5. **Model Evaluation**: Assess model performance using metrics such as accuracy, precision, recall, and F1-score.
6. **Model Selection**: Choose the best model based on evaluation metrics and cross-validation results.
7. **Conclusion**: Summarize findings and potential next steps.

## Requirements
- Python 3.x
- Jupyter Notebook
- Libraries: pandas, numpy, matplotlib, seaborn, scikit-learn

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns",100)

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def algo_test_classification(x, y):
    # Bütün modelleri tanımlıyorum
    log_reg = LogisticRegression()
    knn = KNeighborsClassifier()
    rf = RandomForestClassifier()
    ada = AdaBoostClassifier()
    gbdt = GradientBoostingClassifier()
    svc = SVC()
    nb = GaussianNB()
    dt = DecisionTreeClassifier()
    mlp = MLPClassifier()

    algos = [log_reg, knn, rf, ada, gbdt, svc, nb, dt, mlp]
    algo_names = ['Logistic Regression', 'KNN', 'Random Forest', 'AdaBoost', 'Gradient Boosting', 
                   'SVC', 'Naive Bayes', 'Decision Tree', 'MLP']

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)
    
    accuracy = []
    precision = []
    recall = []
    f1 = []

    # Hata ve doğruluk oranlarını bir tablo haline getirmek için bir dataframe oluşturuyorum
    result = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'F1 Score'], index=algo_names)
    
    for algo in algos:
        p = algo.fit(x_train, y_train).predict(x_test)
        accuracy.append(accuracy_score(y_test, p))
        precision.append(precision_score(y_test, p, average='weighted'))
        recall.append(recall_score(y_test, p, average='weighted'))
        f1.append(f1_score(y_test, p, average='weighted'))
        
    # result adlı tabloya doğruluk ve hata oranlarımı yerleştiriyorum
    result.Accuracy = accuracy
    result.Precision = precision
    result.Recall = recall
    result.F1_Score = f1
    
    # Oluşturduğum result tablosunu doğruluk oranına (accuracy) göre sıralayıp dönüyor
    rtable = result.sort_values('Accuracy', ascending=False)
    return rtable


In [2]:
df = pd.read_csv("failure.csv")
df.head()

Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9
0,2015-01-01,S1F01085,0,215630672,56,0,52,6,407438,0,0,7
1,2015-01-01,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,2015-01-01,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,2015-01-01,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,2015-01-01,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3


In [3]:
df.shape

(124494, 12)

In [4]:
df['failure'].value_counts()

failure
0    124388
1       106
Name: count, dtype: int64

In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
failure,124494.0,0.0008514467,0.02916725,0.0,0.0,0.0,0.0,1.0
attribute1,124494.0,122386800.0,70459600.0,0.0,61276754.0,122795744.0,183308370.0,244140480.0
attribute2,124494.0,159.4848,2179.658,0.0,0.0,0.0,0.0,64968.0
attribute3,124494.0,9.940455,185.7473,0.0,0.0,0.0,0.0,24929.0
attribute4,124494.0,1.74112,22.90851,0.0,0.0,0.0,0.0,1666.0
attribute5,124494.0,14.22269,15.94302,1.0,8.0,10.0,12.0,98.0
attribute6,124494.0,260172.9,99151.01,8.0,221452.0,249799.5,310266.0,689161.0
attribute7,124494.0,0.2925282,7.436924,0.0,0.0,0.0,0.0,832.0
attribute8,124494.0,0.2925282,7.436924,0.0,0.0,0.0,0.0,832.0
attribute9,124494.0,12.45152,191.4256,0.0,0.0,0.0,0.0,18701.0


In [6]:
df.isnull().sum()

date          0
device        0
failure       0
attribute1    0
attribute2    0
attribute3    0
attribute4    0
attribute5    0
attribute6    0
attribute7    0
attribute8    0
attribute9    0
dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124494 entries, 0 to 124493
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   date        124494 non-null  object
 1   device      124494 non-null  object
 2   failure     124494 non-null  int64 
 3   attribute1  124494 non-null  int64 
 4   attribute2  124494 non-null  int64 
 5   attribute3  124494 non-null  int64 
 6   attribute4  124494 non-null  int64 
 7   attribute5  124494 non-null  int64 
 8   attribute6  124494 non-null  int64 
 9   attribute7  124494 non-null  int64 
 10  attribute8  124494 non-null  int64 
 11  attribute9  124494 non-null  int64 
dtypes: int64(10), object(2)
memory usage: 11.4+ MB


In [8]:
# df_date_device = df[['date', 'device']]
# df.drop(['date', 'device'], axis=1, inplace=True)

### we don't want to drop date and device columns

In [9]:
# first 3 byte of device is enough to identify a device
df['device'] = df['device'].apply(lambda x: x[:3])
df.device.value_counts()

device
S1F    54858
W1F    43268
Z1F    26368
Name: count, dtype: int64

In [10]:
#date (day/month/year) 
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df.drop('date', axis=1, inplace=True)

In [11]:
df.head()

Unnamed: 0,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9,year,month,day
0,S1F,0,215630672,56,0,52,6,407438,0,0,7,2015,1,1
1,S1F,0,61370680,0,3,0,6,403174,0,0,0,2015,1,1
2,S1F,0,173295968,0,0,0,12,237394,0,0,0,2015,1,1
3,S1F,0,79694024,0,0,0,6,410186,0,0,0,2015,1,1
4,S1F,0,135970480,0,0,0,15,313173,0,0,3,2015,1,1


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124494 entries, 0 to 124493
Data columns (total 14 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   device      124494 non-null  object
 1   failure     124494 non-null  int64 
 2   attribute1  124494 non-null  int64 
 3   attribute2  124494 non-null  int64 
 4   attribute3  124494 non-null  int64 
 5   attribute4  124494 non-null  int64 
 6   attribute5  124494 non-null  int64 
 7   attribute6  124494 non-null  int64 
 8   attribute7  124494 non-null  int64 
 9   attribute8  124494 non-null  int64 
 10  attribute9  124494 non-null  int64 
 11  year        124494 non-null  int32 
 12  month       124494 non-null  int32 
 13  day         124494 non-null  int32 
dtypes: int32(3), int64(10), object(1)
memory usage: 11.9+ MB


### Train the set


In [13]:
X = df.drop('failure', axis=1)
y = df['failure']

In [17]:
X = pd.get_dummies(X, columns=['device'], drop_first=True)
X.head()

Unnamed: 0,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9,year,month,day,device_W1F,device_Z1F
0,215630672,56,0,52,6,407438,0,0,7,2015,1,1,False,False
1,61370680,0,3,0,6,403174,0,0,0,2015,1,1,False,False
2,173295968,0,0,0,12,237394,0,0,0,2015,1,1,False,False
3,79694024,0,0,0,6,410186,0,0,0,2015,1,1,False,False
4,135970480,0,0,0,15,313173,0,0,3,2015,1,1,False,False


In [18]:
algo_test_classification(X, y)

In [19]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# model
#logistic regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# prediction
y_pred = log_reg.predict(X_test)

# accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)


Accuracy: 0.9992770794007791


In [20]:
#knn
knn = KNeighborsClassifier()

knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print('Accuracy:', accuracy)


Accuracy: 0.9992770794007791


### Smote

In [21]:
# transform the dataset
from imblearn.over_sampling import SMOTE

oversample = SMOTE()

In [22]:
X, y = oversample.fit_resample(X, y)

In [23]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
failure,124494.0,0.0008514467,0.02916725,0.0,0.0,0.0,0.0,1.0
attribute1,124494.0,122386800.0,70459600.0,0.0,61276754.0,122795744.0,183308370.0,244140480.0
attribute2,124494.0,159.4848,2179.658,0.0,0.0,0.0,0.0,64968.0
attribute3,124494.0,9.940455,185.7473,0.0,0.0,0.0,0.0,24929.0
attribute4,124494.0,1.74112,22.90851,0.0,0.0,0.0,0.0,1666.0
attribute5,124494.0,14.22269,15.94302,1.0,8.0,10.0,12.0,98.0
attribute6,124494.0,260172.9,99151.01,8.0,221452.0,249799.5,310266.0,689161.0
attribute7,124494.0,0.2925282,7.436924,0.0,0.0,0.0,0.0,832.0
attribute8,124494.0,0.2925282,7.436924,0.0,0.0,0.0,0.0,832.0
attribute9,124494.0,12.45152,191.4256,0.0,0.0,0.0,0.0,18701.0


In [24]:
X.head()

Unnamed: 0,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9,year,month,day,device_W1F,device_Z1F
0,215630672,56,0,52,6,407438,0,0,7,2015,1,1,False,False
1,61370680,0,3,0,6,403174,0,0,0,2015,1,1,False,False
2,173295968,0,0,0,12,237394,0,0,0,2015,1,1,False,False
3,79694024,0,0,0,6,410186,0,0,0,2015,1,1,False,False
4,135970480,0,0,0,15,313173,0,0,3,2015,1,1,False,False


In [25]:
df['failure'].value_counts()

failure
0    124388
1       106
Name: count, dtype: int64

In [26]:
df.isnull().sum()

failure       0
attribute1    0
attribute2    0
attribute3    0
attribute4    0
attribute5    0
attribute6    0
attribute7    0
attribute8    0
attribute9    0
year          0
month         0
day           0
device_W1F    0
device_Z1F    0
dtype: int64

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124494 entries, 0 to 124493
Data columns (total 15 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   failure     124494 non-null  int64
 1   attribute1  124494 non-null  int64
 2   attribute2  124494 non-null  int64
 3   attribute3  124494 non-null  int64
 4   attribute4  124494 non-null  int64
 5   attribute5  124494 non-null  int64
 6   attribute6  124494 non-null  int64
 7   attribute7  124494 non-null  int64
 8   attribute8  124494 non-null  int64
 9   attribute9  124494 non-null  int64
 10  year        124494 non-null  int32
 11  month       124494 non-null  int32
 12  day         124494 non-null  int32
 13  device_W1F  124494 non-null  bool 
 14  device_Z1F  124494 non-null  bool 
dtypes: bool(2), int32(3), int64(10)
memory usage: 11.2 MB


In [28]:
#knn

knn = KNeighborsClassifier()

knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print('Accuracy:', accuracy)



Accuracy: 0.9992770794007791
