# Train a model on the data to classify whether a web element is a product item (`is_shop` label), and analyze the results.

In [1]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import  confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.ensemble import AdaBoostClassifier
import pandas as pd 
import numpy as np 

In [2]:
# Load the raw task data from the Excel workbook that sits next to the notebook.
DATA_PATH = 'data-for-task1.xlsx'
df = pd.read_excel(DATA_PATH)

In [3]:
# Columns judged not useful for prediction. They are removed in a single,
# non-mutating call: if any name is missing, the KeyError is raised before
# anything is dropped, so the frame is never left half-modified.
UNUSED_COLUMNS = [
    "builder_flags",
    "prediction_data",
    "content",
    "contentAfter",
    "contentBefore",
    "cssBackgroundImageLink",
    "cssHasborderBottomWidth",
    "cssHasborderRightWidth",
    "builder",
    "modified",
    "localCharCount",
    "id",
    "cssHasAnimation",
    "cssBackgroundImage",
    "allPseudoImageGlobal",
    "allPseudoBG",
    "allPseudoImageLocal",
]
df = df.drop(columns=UNUSED_COLUMNS)

In [4]:
# Missing entries are stored as the literal string 'missing_value'; convert them
# to NaN so they can be imputed later on.
df = df.replace(to_replace='missing_value', value=np.nan)

# Inspect the class balance of the target (slightly imbalanced).
# NOTE(review): no resampling step is applied in the cells visible here — confirm intent.
target_counts = df['is_shop'].value_counts()
target_counts

1    1114
0     886
Name: is_shop, dtype: int64

## Splitting the data

In [5]:
# Target vector (is_shop) and feature matrix (every remaining column)
y = df['is_shop']
X = df.drop(columns='is_shop')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)


## Imputing missing values with SimpleImputer

In [6]:
# The frames returned by train_test_split are derived from X, and pandas can flag
# column assignments into them with SettingWithCopyWarning. Work on explicit
# copies so the writes below are unambiguous.
xtrain = xtrain.copy()
xtest = xtest.copy()

# Separate numeric and non-numeric columns (dtypes are taken from the training split)
numeric_columns = xtrain.select_dtypes(include=[np.number]).columns
non_numeric_columns = xtrain.select_dtypes(exclude=[np.number]).columns

# Numeric columns: fill gaps with the column mean.
# The imputer is fitted on the training split only and then reused on the test split.
imputer_numeric = SimpleImputer(strategy='mean')
xtrain[numeric_columns] = imputer_numeric.fit_transform(xtrain[numeric_columns])
xtest[numeric_columns] = imputer_numeric.transform(xtest[numeric_columns])

# Non-numeric columns: fill gaps with the most frequent value, again fitted on train only
imputer_non_numeric = SimpleImputer(strategy='most_frequent')
xtrain[non_numeric_columns] = imputer_non_numeric.fit_transform(xtrain[non_numeric_columns])
xtest[non_numeric_columns] = imputer_non_numeric.transform(xtest[non_numeric_columns])


## One-hot encoding with pd.get_dummies()

In [7]:
# One-hot encode the non-numeric columns. Train and test rows are stacked first so
# that both splits end up with exactly the same dummy columns, then separated
# again by position.
n_train_rows = len(xtrain)
combined_data = pd.get_dummies(pd.concat([xtrain, xtest], axis=0))
xtrain = combined_data.iloc[:n_train_rows]
xtest = combined_data.iloc[n_train_rows:]


## Scaling features with StandardScaler()

In [8]:
# Standardise the features: the scaler is fitted on the training split only,
# then the same fitted transformation is applied to both splits.
scaler = StandardScaler().fit(xtrain)
xtrain, xtest = scaler.transform(xtrain), scaler.transform(xtest)



In [9]:
def print_metrics(tp, fp, tn, fn):
    """Compute binary-classification metrics from confusion-matrix counts.

    Despite its name, this function prints nothing; it only returns the values.

    Parameters
    ----------
    tp, fp, tn, fn : int
        Counts of true positives, false positives, true negatives and
        false negatives.

    Returns
    -------
    tuple
        ``(Accuracy, Precision, Recall, Specificity, F1_Score)``. A metric
        whose denominator is zero (e.g. precision when nothing was predicted
        positive) is reported as ``0.0`` instead of raising
        ``ZeroDivisionError`` or producing ``nan``.
    """
    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0 so callers always get numbers back.
        return numerator / denominator if denominator else 0.0

    Accuracy = _ratio(tp + tn, tp + tn + fn + fp)
    Precision = _ratio(tp, tp + fp)
    Recall = _ratio(tp, tp + fn)
    Specificity = _ratio(tn, tn + fp)
    # F1 = 2*tp / (2*tp + fp + fn), written in the equivalent halved form.
    F1_Score = _ratio(tp, tp + (fp + fn) / 2)
    return Accuracy, Precision, Recall, Specificity, F1_Score

## If the model incorrectly identifies a shop as a non-shop, it could result in missed business opportunities or losses, which is why I gave more importance to Recall.

### After trying many different algorithms, AdaBoostClassifier() gave the best results of all.

In [10]:
# Seed the classifier so that re-running the notebook reproduces the same model.
adaBoost = AdaBoostClassifier(random_state=1)

adaBoost.fit(xtrain, ytrain)
y_pred = adaBoost.predict(xtest)

# Pin the label order to [0, 1] so the matrix is always 2x2 and the unpacking
# below cannot fail when one class is absent from ytest / y_pred.
tn, fp, fn, tp = confusion_matrix(ytest, y_pred, labels=[0, 1]).ravel()

# Accuracy comes from the confusion matrix, so the separate (previously
# discarded) adaBoost.score(...) call is not needed.
Accuracy,  Precision , Recall , Specificity , F1_Score = print_metrics(tp, fp, tn, fn)
print(f"Accuracy: {Accuracy} \nPrecision : {Precision} \nRecall: {Recall} \nSpecificity: {Specificity} \nF1_Score : {F1_Score}")


Accuracy: 0.98 
Precision : 1.0 
Recall: 0.963302752293578 
Specificity: 1.0 
F1_Score : 0.9813084112149533
