In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Remote location of the Glass Identification CSV.
url="https://raw.githubusercontent.com/FlipRoboTechnologies/ML-Datasets/main/Glass%20Identification/Glass%20Identification.csv"

# The column labels are supplied here via `names=`; "Id" is used as the
# row index.
column_names = [
    'Id',
    'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe',
    'Type',
]

df = pd.read_csv(
    url,
    names=column_names,
    index_col="Id",
)
df

# Exploratory Data Analysis (EDA)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

The Glass Dataset contains 214 rows and 10 columns

In [None]:
# Column labels of the frame (the "Id" column was consumed as the index).
df.columns

In [None]:
# Data type of every column.
df.dtypes

 Here only numerical columns are present as (float64,int64) in dataset

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Preview the last 15 rows.
df.tail(15)

# np.nan,None,NaN,others

In [None]:
# Number of null values in each column.
df.isnull().sum()

In [None]:
# Total number of null values across the whole frame.
df.isnull().sum().sum()

In [None]:
# Summary of index, columns, non-null counts, dtypes and memory usage.
df.info()

From the output above we can see that there are no null values, that all features are numerical (float and int dtypes), and that the frame uses 18.4 KB of memory.

In [None]:
# Draw the boolean missing-value mask as a heatmap; any null cell would
# show up as a differently coloured cell.
sns.heatmap(df.isna())

we have now enough evidence that there is no null value present in the dataset

In [None]:
# Distinct class labels present in the original "Type" column.
df["Type"].unique()

In [None]:
# Number of distinct class labels in "Type".
df["Type"].nunique()

 The problem statement instruction is that the dataset can be divided into window glass (classes 1-4) and non-window glass (classes 5-7).
    
1- building_windows_float_processed
2- building_windows_non_float_processed
3- vehicle_windows_float_processed
4- vehicle_windows_non_float_processed (none in this database)
5- containers
6- tableware
7- headlamps



In [None]:
# Step 1: Create binary target variable.
# Window glass is Type 1-4 and non-window glass is Type 5-7, so the cut-off
# must be Type >= 5. The previous `x >= 4` would have labelled class 4
# (a window class) as non-window.
# 0 -> window glass, 1 -> non-window glass.
df['Glass_type'] = (df['Type'] >= 5).astype("int64")

Glass_type 1-4  now converted into categories 0  and 5-7 into categories 1

In [None]:
# Number of distinct values in the derived binary target.
df["Glass_type"].nunique()

In [None]:
# Print the frequency table of every column, separated by blank lines.
for column_name in df.columns:
    column_counts = df[column_name].value_counts()
    print(column_counts)
    print("\n")

From the output above we can see that the target column is imbalanced in favour of group 0, and that iron (Fe) and barium (Ba) are the features with the most values equal to 0.00.

In [None]:
# Column labels (index of the dtypes Series) before "Type" is dropped.
df.dtypes.index

In [None]:
# Drop "Type": Glass_type now carries the (binarised) class information.
# errors="ignore" keeps this cell re-runnable; without it a second execution
# raises KeyError because the column has already been removed.
df.drop("Type",axis=1,inplace=True,errors="ignore")

In [None]:
# Column labels after "Type" has been dropped.
df.dtypes.index

In [None]:
# Number of distinct values per column, shown as a one-column table.
df.nunique().to_frame("No. of unique value")

# Description of Dataset

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

The data distribution looks fairly uniform, and outliers are present in the Ca column.

In [None]:
# Annotated heatmap of the descriptive statistics table.
fig, ax = plt.subplots(figsize=(22, 10))
sns.heatmap(
    df.describe(),
    annot=True,
    fmt="0.2f",
    linewidth=0.2,
    linecolor="black",
    cmap="Spectral",
    ax=ax,
)
ax.set_xlabel("Features Names", fontsize=14)
ax.set_ylabel("Descriptive Stats", fontsize=14)
ax.set_title("Descriptive Graph", fontsize=20)
plt.show()

Si takes much larger values than the other columns, so the features need to be scaled (StandardScaler) before modelling.

# Data Visualization

# Univariate Analysis

In [None]:
# Bar chart of how many samples fall into each Glass_type class, followed
# by the exact counts.
sns.countplot(data=df, x='Glass_type')
target_counts = df["Glass_type"].value_counts()
print(target_counts)

In [None]:
# One count plot per column, each drawn in its own wide figure.
for column_name in df.columns:
    plt.figure(figsize=(12, 3), facecolor="white")
    sns.countplot(data=df, x=column_name)
plt.show()

In [None]:
# Distribution of every column: density histogram with a KDE overlay,
# laid out on a 2x5 grid (at most 10 panels).
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` is the
# replacement that draws the same histogram + KDE combination.
plt.figure(figsize=(10,6),facecolor="white")
for plotnum, col in enumerate(df.columns[:10], start=1):
    plt.subplot(2,5,plotnum)
    sns.histplot(df[col],kde=True,stat="density",color="m")
    plt.xlabel(col,fontsize = 10)
    plt.yticks(rotation=0, fontsize = 10)
plt.tight_layout()  # keep the panels' tick labels from overlapping
plt.show()

The features Fe, Ba, Mg and K show the most uneven (skewed) distributions.

# Bivariate Analysis

In [None]:
# Scatter of RI against the binary target with a fitted regression line.
sns.lmplot(
    data=df,
    x="Glass_type",
    y="RI",
    palette="colorblind",
)

In [None]:
# Bar plot of every column against Glass_type on a 5x4 grid
# (only the first 10 columns are drawn).
plt.figure(figsize=(20, 25), facecolor="white")
for panel, column_name in enumerate(df.columns, start=1):
    if panel > 10:
        break
    plt.subplot(5, 4, panel)
    sns.barplot(data=df, x="Glass_type", y=column_name)
    plt.xlabel("Glass Type")
    plt.ylabel(column_name)
plt.show()
    


In [None]:
# Relationship between Na and RI with a fitted regression line.
# lmplot is a figure-level function that always creates its own figure, so a
# preceding plt.figure(...) only produced an extra empty figure; it is
# removed. Use lmplot's `height=` / `aspect=` if a different size is needed.
sns.lmplot(x="Na",y="RI",data=df,palette="colorblind")
plt.xlabel("Na")
plt.ylabel("RI")

In [None]:
# Strip plot of Na values for each Glass_type class.
ax = sns.stripplot(data=df, x='Glass_type', y='Na')
ax.set_title("Comparing Glass_type and Na")
plt.show()

In [None]:
# Strip plot of every column against Glass_type on a 5x4 grid
# (only the first 10 columns are drawn).
plt.figure(figsize=(20, 25), facecolor="white")
for panel, column_name in enumerate(df.columns, start=1):
    if panel > 10:
        break
    plt.subplot(5, 4, panel)
    sns.stripplot(data=df, x="Glass_type", y=column_name)
    plt.xlabel("Glass Type")
    plt.ylabel(column_name)
plt.show()

In [None]:
# Scatter plot of every column against Glass_type on a 5x4 grid
# (only the first 10 columns are drawn).
plt.figure(figsize=(20, 25), facecolor="white")
for panel, column_name in enumerate(df.columns, start=1):
    if panel > 10:
        break
    plt.subplot(5, 4, panel)
    sns.scatterplot(data=df, x="Glass_type", y=column_name)
    plt.xlabel("Glass Type")
    plt.ylabel(column_name)
plt.show()

In [None]:
# Mg against Si, coloured by the binary glass class.
ax = sns.scatterplot(data=df, x="Mg", y="Si", hue="Glass_type", palette="bright")
ax.set_title("Comparing between Glass_type with Mg and Si")
plt.show()

In [None]:
# RI against Ca, coloured by the binary glass class.
ax = sns.scatterplot(data=df, x="RI", y="Ca", hue="Glass_type", palette="bright")
ax.set_title("Comparing between Glass_type with RI and Ca")
plt.show()

In [None]:
# Na against Si, coloured by the binary glass class.
ax = sns.scatterplot(data=df, x="Na", y="Si", hue="Glass_type", palette="bright")
ax.set_title("Comparing between Glass_type with  Na and Si")
plt.show()

Here we can analyse that high Mg value are good for 0 type and vice-versa ,for si

In [None]:
# Bivariate histogram of every column against Glass_type on a 5x4 grid
# (only the first 10 columns are drawn).
plt.figure(figsize=(20, 25), facecolor="white")
for panel, column_name in enumerate(df.columns, start=1):
    if panel > 10:
        break
    plt.subplot(5, 4, panel)
    sns.histplot(data=df, x="Glass_type", y=column_name)
    plt.xlabel("Glass Type")
    plt.ylabel(column_name)
plt.show()

# Multivariate Analysis

In [None]:
# Pairwise scatter plots (and per-column distributions on the diagonal) for all columns.
sns.pairplot(data=df)

# Checking for outliers

In [None]:
# Box plot of every column on a 2x5 grid to eyeball outliers.
# Fixes over the previous version:
#   * the panel guard now matches the 2x5 grid (it allowed 12 panels, which
#     would raise in plt.subplot once an 11th column existed),
#   * an explicit figure size and readable font sizes are used
#     (tick labels were drawn at 2 pt),
#   * `palette=` without `hue=` is replaced by a plain `color=`.
plt.figure(figsize=(15,6),facecolor="white")
for plotnumber, col in enumerate(df.columns[:10], start=1):
    plt.subplot(2,5,plotnumber)
    sns.boxplot(y=df[col], color="c")
    plt.xlabel(col, fontsize = 10)
    plt.yticks(fontsize = 8)

plt.tight_layout()
plt.show()

As we can see, outliers are present in all columns except Mg.

In [None]:
from scipy.stats import zscore
# Absolute z-score of every feature value. The binary target Glass_type is
# excluded: it is a class label, not a measurement, so it must not take part
# in outlier detection. The feature columns keep their positions, so
# positional look-ups into `z` are unaffected.
out_features=df.drop(columns="Glass_type")
z=np.abs(zscore(out_features))
z

In [None]:
# Positions (row indices, column indices) whose absolute z-score exceeds the threshold of 3.
np.where(z>3)

In [None]:
# Spot-check a single z-score by position (row 105, column 6).
z.iloc[105,6]

In [None]:
# Spot-check a single z-score by position (row 106, column 0).
z.iloc[106,0]

In [None]:
# Keep only the rows in which every z-score is below 3.
# .copy() makes df1 an independent frame: columns of df1 are reassigned
# later, and assigning into a filtered slice of df triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous whether df is touched.
df1=df[(z<3).all(axis=1)].copy()

In [None]:
# Dimensions of the frame after outlier removal.
df1.shape

In [None]:
# Row counts before and after outlier removal.
print("Old Data Frame:", len(df))
print("New Data Frame:", len(df1))

In [None]:
# Share of rows discarded by the z-score filter, in percent.
rows_before = df.shape[0]
rows_after = df1.shape[0]
print("The loss of data in percentage", ((rows_before - rows_after) / rows_before) * 100)

the loss of data is below 10% as the limited data set 

# Checking for Skewness

In [None]:
# Skewness of every column before any transformation.
df1.skew()

Removing skewness using cube root method

In [None]:
# Replace each of these columns by its cube root to reduce skewness.
for skewed_column in ("Mg", "Ca", "Ba", "Fe"):
    df1[skewed_column] = np.cbrt(df1[skewed_column])

In [None]:
# Skewness of every column after the cube-root transformation.
df1.skew()

In [None]:
# Distribution of every column after the cube-root transform, on a 2x5 grid.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` draws
# the equivalent density histogram with a KDE overlay.
plt.figure(figsize=(10,15),facecolor="red")
for plotnumber, column in enumerate(df1.columns[:10], start=1):
    plt.subplot(2,5,plotnumber)
    sns.histplot(df1[column],kde=True,stat="density",color='green')
    plt.xlabel(column,fontsize=10)
plt.show()

# Correlation b/w target variable and independent variable

# Multicollinearity

In [None]:
# Pairwise correlation matrix of all columns of the cleaned frame (target included).
cor=df1.corr()
cor

In [None]:
# Annotated heatmap of the correlation matrix.
# The title and axis labels were copied from the descriptive-statistics
# heatmap ("Descriptive Graph", "Figure"); both axes of a correlation matrix
# are features, so they are labelled accordingly.
plt.figure(figsize=(26,14))
sns.heatmap(cor,annot=True,fmt="0.2f",linewidth = 0.2,linecolor="black", cmap="Spectral")
plt.xlabel("Features",fontsize=14)
plt.ylabel("Features", fontsize=14)
plt.title("Correlation Matrix", fontsize=20)
plt.show()

In [None]:
# Correlation of every column with the target, strongest positive first.
cor["Glass_type"].sort_values(ascending=False)

# Visualizing the correlation between label and features using bar plot

In [None]:
# Bar chart of each feature's correlation with the target (the target's
# own self-correlation entry is removed), sorted from highest to lowest.
fig, ax = plt.subplots(figsize=(22, 7))
ranked_corr = cor["Glass_type"].sort_values(ascending=False)
ranked_corr.drop(["Glass_type"]).plot(kind='bar', color='b', ax=ax)
ax.set_xlabel("Feature", fontsize=15)
ax.set_ylabel("Target", fontsize=15)
ax.set_title("Visualizing the correlation between label and features using bar plot")
plt.show()

The graph shows a positive correlation of the target with Al, Ba, Na, Si and Ca (in descending order) and a negative correlation with K, Mg, Fe and RI (in ascending order).

# Separating features and label

In [None]:
# Features: the nine measurement columns; label: the binary Glass_type.
feature_columns = ["RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe"]
x = df1[feature_columns]
y = df1["Glass_type"]

In [None]:
# Display the feature matrix.
x

In [None]:
# Display the label vector.
y

# Feature Scaling using Standard Scalarization

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature to zero mean and unit variance.
scaler=StandardScaler()
# index=x.index keeps the original row labels. Without it the scaled frame
# gets a fresh 0..n-1 index while `y` keeps the original "Id" labels, so any
# index-aligned operation between x and y would silently mismatch rows.
# NOTE(review): the scaler is fitted on all rows before the train/test
# split, so test-set statistics leak into training; consider fitting it on
# the training split only.
x=pd.DataFrame(scaler.fit_transform(x),columns=x.columns,index=x.index)
x

# Checking Variance Inflation Factor(VIF)

In [None]:
# Variance inflation factor of every scaled feature column, tabulated next
# to the feature name.

from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame(
    {
        "VIF values": [
            variance_inflation_factor(x.values, column_position)
            for column_position in range(x.shape[1])
        ],
        "Features": x.columns,
    }
)

vif

Some features have a high VIF value compared to the others, which indicates multicollinearity; however, because the dataset is small, we do not drop any feature for model building.

In [None]:
# Class frequencies of the label before resampling.
y.value_counts()

Here we can observe that the data for target column is not balanced or biasness is present

# Oversampling

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the minority class with synthetic samples.
# random_state is fixed so the synthetic rows (and every score computed
# from them) are reproducible across kernel restarts.
# NOTE(review): resampling happens before the train/test split, so
# synthetic points derived from test rows can end up in training and
# inflate the reported scores; consider applying SMOTE to the training
# split only.
SM=SMOTE(random_state=42)
x1,y1=SM.fit_resample(x,y)

In [None]:
# Count how often each distinct feature row occurs in the resampled frame.
# NOTE(review): on continuous features this mostly lists every row once;
# `x1.shape` may be what was intended here - confirm.
x1.value_counts()

In [None]:
# Class frequencies of the label after resampling.
y1.value_counts()

# Modelling 

In [None]:
# Search split seeds 1..199 for the train/test split on which a random
# forest reaches the highest test accuracy.
# NOTE(review): choosing the split by its test accuracy makes every score
# reported on that split optimistically biased; treat those numbers as an
# upper bound and rely on cross-validation for comparison.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

maxAccu=0
# Was misspelled `masRS`, which left `maxRS` undefined unless the loop
# happened to assign it.
maxRS=0

for i in range(1,200):
    x_train,x_test,y_train,y_test=train_test_split(x1,y1,test_size=0.2,random_state=i)
    # Seed the forest so the search result depends only on the split seed
    # and is the same on every run.
    RFR = RandomForestClassifier(random_state=42)
    RFR.fit(x_train,y_train)
    predRFR=RFR.predict(x_test)
    acc=accuracy_score(y_test,predRFR)
    if acc>maxAccu:
        maxAccu=acc
        maxRS=i

print("Best accuracy is",maxAccu,"at random state", maxRS)


In [None]:
# Re-create the 80/20 split with the seed selected by the search above.
split_parts = train_test_split(
    x1,
    y1,
    test_size=0.20,
    random_state=maxRS,
)
x_train, x_test, y_train, y_test = split_parts

# Classification Algorithms

In [None]:
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier, BaggingClassifier
from sklearn.metrics import classification_report,confusion_matrix,roc_curve,accuracy_score
from sklearn.model_selection import cross_val_score


# RandomForestClassifier

In [None]:
# Fit a default RandomForestClassifier on the training split, then print
# accuracy, confusion matrix and classification report for the test split.
RFC = RandomForestClassifier()
RFC.fit(x_train, y_train)
predRFC = RFC.predict(x_test)
for metric_output in (
    accuracy_score(y_test, predRFC),
    confusion_matrix(y_test, predRFC),
    classification_report(y_test, predRFC),
):
    print(metric_output)

In [None]:
# Confusion matrix of the random-forest predictions drawn as an annotated
# heatmap (rows: true label, columns: predicted label).

cm = confusion_matrix(y_test, predRFC)

x_axis_label = ["0", "1"]
y_axis_label = ["0", "1"]

fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(
    cm,
    annot=True,
    linewidths=0.2,
    linecolor="black",
    fmt=".0f",
    ax=ax,
    xticklabels=x_axis_label,
    yticklabels=y_axis_label,
)

ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix for RandomForestClassifier")
plt.show()

# LogisticRegression

In [None]:
# Fit a default LogisticRegression on the training split, then print
# accuracy, confusion matrix and classification report for the test split.
LR = LogisticRegression()
LR.fit(x_train, y_train)
predLR = LR.predict(x_test)
for metric_output in (
    accuracy_score(y_test, predLR),
    confusion_matrix(y_test, predLR),
    classification_report(y_test, predLR),
):
    print(metric_output)

# Support Vector Machine Classifier

In [None]:
# Fit a default SVC on the training split, then print accuracy, confusion
# matrix and classification report for the test split.
svc = SVC()
svc.fit(x_train, y_train)
predsvc = svc.predict(x_test)
for metric_output in (
    accuracy_score(y_test, predsvc),
    confusion_matrix(y_test, predsvc),
    classification_report(y_test, predsvc),
):
    print(metric_output)


# Gradient Boosting Classifier

In [None]:
# Fit a default GradientBoostingClassifier on the training split, then
# print accuracy, confusion matrix and classification report for the test split.
GB = GradientBoostingClassifier()
GB.fit(x_train, y_train)
predGB = GB.predict(x_test)
for metric_output in (
    accuracy_score(y_test, predGB),
    confusion_matrix(y_test, predGB),
    classification_report(y_test, predGB),
):
    print(metric_output)

# AdaBoostClassifier

In [None]:
# Fit a default AdaBoostClassifier on the training split, then print
# accuracy, confusion matrix and classification report for the test split.
ABC = AdaBoostClassifier()
ABC.fit(x_train, y_train)
predABC = ABC.predict(x_test)
for metric_output in (
    accuracy_score(y_test, predABC),
    confusion_matrix(y_test, predABC),
    classification_report(y_test, predABC),
):
    print(metric_output)

# BaggingClassifier

In [None]:
# Fit a default BaggingClassifier on the training split, then print
# accuracy, confusion matrix and classification report for the test split.
BC = BaggingClassifier()
BC.fit(x_train, y_train)
predBC = BC.predict(x_test)
for metric_output in (
    accuracy_score(y_test, predBC),
    confusion_matrix(y_test, predBC),
    classification_report(y_test, predBC),
):
    print(metric_output)

# ExtraTreesClassifier

In [None]:
# Fit a default ExtraTreesClassifier on the training split, then print
# accuracy, confusion matrix and classification report for the test split.

ET = ExtraTreesClassifier()
ET.fit(x_train, y_train)
predET = ET.predict(x_test)
for metric_output in (
    accuracy_score(y_test, predET),
    confusion_matrix(y_test, predET),
    classification_report(y_test, predET),
):
    print(metric_output)

# Cross validation Score

In [None]:
from sklearn.model_selection import cross_val_score 

In [None]:
# Cross-validate the random forest on the resampled data and compare the
# mean fold score with its hold-out accuracy.

score = cross_val_score(RFC, x1, y1)
holdout_accuracy = accuracy_score(y_test, predRFC)
print(score)
print(score.mean())
print("Difference between Accuracy score and cross validation score is", holdout_accuracy - score.mean())

In [None]:
# Cross-validate logistic regression on the resampled data and compare the
# mean fold score with its hold-out accuracy.
score1 = cross_val_score(LR, x1, y1)
holdout_accuracy = accuracy_score(y_test, predLR)
print(score1)
print(score1.mean())
print("Difference between Accuracy score and Cross validation score is :", holdout_accuracy - score1.mean())

In [None]:
# Cross-validate the bagging classifier on the resampled data and compare
# the mean fold score with its hold-out accuracy.
score2 = cross_val_score(BC, x1, y1)
holdout_accuracy = accuracy_score(y_test, predBC)
print(score2)
print(score2.mean())
print("Difference between Accuracy score and Cross validation score is :", holdout_accuracy - score2.mean())

In [None]:
# Cross-validate the extra-trees classifier on the resampled data and
# compare the mean fold score with its hold-out accuracy.
score3 = cross_val_score(ET, x1, y1)
holdout_accuracy = accuracy_score(y_test, predET)
print(score3)
print(score3.mean())
print("Difference between Accuracy score and Cross validation score is :", holdout_accuracy - score3.mean())

In [None]:
# Cross-validate gradient boosting on the resampled data and compare the
# mean fold score with its hold-out accuracy.
score4 = cross_val_score(GB, x1, y1)
holdout_accuracy = accuracy_score(y_test, predGB)
print(score4)
print(score4.mean())
print("Difference between Accuracy score and Cross validation score is :", holdout_accuracy - score4.mean())

In [None]:
# Cross-validate the support vector classifier on the resampled data and
# compare the mean fold score with its hold-out accuracy.
score5 = cross_val_score(svc, x1, y1)
holdout_accuracy = accuracy_score(y_test, predsvc)
print(score5)
print(score5.mean())
print("Difference between Accuracy score and Cross validation score is :", holdout_accuracy - score5.mean())

Overall, ExtraTreesClassifier is performing best in cross-validation.

# Hyper Parameter Tuning

In [None]:
# ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for ExtraTreesClassifier.
# `n_jobs` was removed from the grid: it only controls how many CPU cores
# are used, never the fitted model, so searching over three values of it
# tripled the number of fits without being able to change the best score.
parameters = {"criterion" : ["gini","entropy"],
             "random_state" : [10,50,1000],
             "max_depth" : [None,10,20],
             "n_estimators" : [50,100,200,300]}

In [None]:
# Exhaustive 5-fold grid search over `parameters` for an extra-trees model.
GCV = GridSearchCV(
    estimator=ExtraTreesClassifier(),
    param_grid=parameters,
    cv=5,
)

In [None]:
# Run the grid search on the training split.
GCV.fit(x_train,y_train)

In [None]:
# Parameter combination with the best mean cross-validation score.
GCV.best_params_

In [None]:
# Extra-trees model with the entropy criterion and 100 trees; report
# accuracy (in percent), confusion matrix and classification report on the
# test split.
ET_model = ExtraTreesClassifier(
    criterion="entropy",
    max_depth=None,
    n_jobs=-2,
    n_estimators=100,
    random_state=50,
)
ET_model.fit(x_train, y_train)
pred = ET_model.predict(x_test)
acc = accuracy_score(y_test, pred)
print(acc*100)
for metric_output in (confusion_matrix(y_test, pred), classification_report(y_test, pred)):
    print(metric_output)

In [None]:
# Extra-trees model with the gini criterion and 50 trees; this rebinds
# ET_model, so later cells use this variant. Report accuracy (in percent),
# confusion matrix and classification report on the test split.
ET_model = ExtraTreesClassifier(
    criterion="gini",
    max_depth=None,
    n_jobs=-2,
    n_estimators=50,
    random_state=50,
)
ET_model.fit(x_train, y_train)
pred = ET_model.predict(x_test)
acc = accuracy_score(y_test, pred)
print(acc*100)
for metric_output in (confusion_matrix(y_test, pred), classification_report(y_test, pred)):
    print(metric_output)

# AUC ROC CURVE

In [None]:
from sklearn.metrics import roc_curve

import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score

In [None]:
# Predicted probability of the second class column (index 1) for each test row.
y_pred_prob = ET_model.predict_proba(x_test)[:, 1]

In [None]:
# Display the predicted probabilities.
y_pred_prob

In [None]:
# False-positive rates, true-positive rates and the matching decision thresholds of the ROC curve.
fpr,tpr,thresholds=roc_curve(y_test,y_pred_prob)

In [None]:
# Display the false-positive rates.
fpr

In [None]:
# Display the true-positive rates (this cell previously repeated `fpr`,
# so `tpr` was never inspected).
tpr

In [None]:
# Display the decision thresholds.
thresholds

In [None]:
# ROC curve of the tuned extra-trees model against the chance diagonal.
plt.plot([0,1],[0,1],"k--")
plt.plot(fpr,tpr,label="ExtraTreeClassifier")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
# The plot is a ROC curve; it was mistakenly titled "confusion matrix".
plt.title("ROC curve")
# Without legend() the label passed to plot() is never shown.
plt.legend()
plt.show()

In [None]:
# Area under the ROC curve. It must be computed from the predicted
# probabilities; hard 0/1 labels from predict() collapse the curve to a
# single operating point and do not match the ROC curve plotted from
# predict_proba.
auc_score=roc_auc_score(y_test,ET_model.predict_proba(x_test)[:, 1])

In [None]:
# Show the AUC value.
print(auc_score)

# Saving the model

In [None]:
# Persist the fitted extra-trees model with pickle.
import pickle
filename="Glass_identification.pkl"
# The context manager closes (and flushes) the file handle; the previous
# bare open(...) left it open until garbage collection.
with open(filename,"wb") as model_file:
    pickle.dump(ET_model,model_file)

In [None]:
# Reload the saved model and report its test accuracy in percent.
# NOTE: pickle.load can execute arbitrary code - only load files that this
# notebook itself produced.
# The context manager closes the file handle, which the bare open(...) did not.
with open("Glass_identification.pkl","rb") as model_file:
    load_model=pickle.load(model_file)
result=load_model.score(x_test,y_test)
print(result*100)

In [None]:
# Two-row table comparing the reloaded model's predictions with the true
# test labels, column by column.
predicted_labels = load_model.predict(x_test)
conclusion = pd.DataFrame(
    [predicted_labels, y_test],
    index=["Predicted", "Original"],
)
conclusion