In [64]:
from google.colab import drive
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score

In [41]:
# Mount Google Drive at /content/gdrive so the CSV files under MyDrive can be read below.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Step 1: Load the training data

In [42]:
# Read the training CSV from the mounted Drive into a DataFrame.
train_csv_path = '/content/gdrive/MyDrive/train_new.csv'
train_data = pd.read_csv(train_csv_path)

In [43]:
# Print a summary of the training frame: column names, non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094 entries, 0 to 1093
Data columns (total 34 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   price                         1094 non-null   float64
 1   retail_price                  1094 non-null   int64  
 2   currency_buyer                1094 non-null   object 
 3   units_sold                    1094 non-null   int64  
 4   uses_ad_boosts                1094 non-null   int64  
 5   rating                        1094 non-null   float64
 6   rating_count                  1094 non-null   int64  
 7   badges_count                  1094 non-null   int64  
 8   badge_local_product           1094 non-null   int64  
 9   badge_product_quality         1094 non-null   int64  
 10  badge_fast_shipping           1094 non-null   int64  
 11  tags                          1094 non-null   object 
 12  product_color                 1065 non-null   object 
 13  pro

# Step 2: Preprocessing

In [44]:
# Columns that are not used as model inputs. 'units_sold' is the prediction
# target, so it is removed here to keep it out of the feature matrix.
train_drop_cols = [
    'currency_buyer', 'tags', 'product_color', 'product_variation_size_id',
    'shipping_option_name', 'urgency_text', 'origin_country', 'merchant_title',
    'merchant_name', 'merchant_info_subtitle', 'merchant_id', 'merchant_has_profile_picture',
    'merchant_profile_picture', 'theme', 'crawl_month', 'id', 'units_sold',
]
train_data = train_data.drop(columns=train_drop_cols)



In [45]:
# One-hot encode any remaining categorical (object) columns; numeric columns
# pass through unchanged.
train_data = pd.get_dummies(data=train_data)

In [46]:
# Learn the per-column means from the training data, then use them to fill the
# missing entries. The fitted imputer is reused later on the test set.
imputer = SimpleImputer(strategy='mean').fit(train_data)
train_data_imputed = imputer.transform(train_data)

In [47]:
# Learn each feature's min/max from the training data and rescale with them.
# The fitted scaler is reused later so the test set gets the same scaling.
scaler = preprocessing.MinMaxScaler().fit(train_data_imputed)
train_data_scaled = scaler.transform(train_data_imputed)

In [48]:
# Features: the imputed and scaled training matrix.
X_train = train_data_scaled

# Target: 'units_sold' was dropped from train_data during preprocessing, so it is
# read again from the CSV. usecols restricts parsing to the single column needed
# instead of loading the whole file a second time.
y_train = pd.read_csv('/content/gdrive/MyDrive/train_new.csv', usecols=['units_sold'])['units_sold']

# Labels are paired with feature rows purely by position, so the row counts must
# agree; fail here with a clear message rather than inside model.fit().
assert len(y_train) == len(X_train), "X_train and y_train have different numbers of rows"

# Step 3: Load the testing data


In [49]:
# Read the held-out test CSV from the mounted Drive into a DataFrame.
test_csv_path = '/content/gdrive/MyDrive/test_new.csv'
test_data = pd.read_csv(test_csv_path)

# Step 4: Preprocessing for testing data


In [50]:
# Remove the same columns that were removed from the training data, including
# the target 'units_sold', so the test features mirror the training features.
test_drop_cols = [
    'currency_buyer', 'tags', 'product_color', 'product_variation_size_id',
    'shipping_option_name', 'urgency_text', 'origin_country', 'merchant_title',
    'merchant_name', 'merchant_info_subtitle', 'merchant_id', 'merchant_has_profile_picture',
    'merchant_profile_picture', 'theme', 'crawl_month', 'id', 'units_sold',
]
test_data = test_data.drop(columns=test_drop_cols)



In [51]:
# One-hot encode any remaining categorical (object) columns of the test data;
# numeric columns pass through unchanged.
test_data = pd.get_dummies(data=test_data)



In [52]:
# Align the test columns with the training columns in a single step:
#   - columns present in training but absent from the test data (for example a
#     one-hot category that never occurs in the test file) are added, filled with 0;
#   - columns that exist only in the test data are discarded;
#   - the result follows the training column order, which the fitted imputer
#     and scaler are applied against.
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

In [53]:
# Fill missing test values with the means learned from the training data
# (transform only -- the imputer is not refitted on the test set).
test_data_imputed = imputer.transform(X=test_data)

In [54]:
# Rescale the test features with the min/max learned from the training data
# (transform only -- the scaler is not refitted on the test set).
test_data_scaled = scaler.transform(X=test_data_imputed)

In [62]:
# Features: the imputed and scaled test matrix.
X_test = test_data_scaled

# Target: 'units_sold' was dropped from test_data during preprocessing, so it is
# read again from the CSV. usecols restricts parsing to the single column needed
# instead of loading the whole file a second time.
y_test = pd.read_csv('/content/gdrive/MyDrive/test_new.csv', usecols=['units_sold'])['units_sold']

# Labels are paired with feature rows purely by position, so the row counts must
# agree; fail here with a clear message rather than inside the metric functions.
assert len(y_test) == len(X_test), "X_test and y_test have different numbers of rows"

# Step 5: Train and evaluate models

## Decision Tree

### Model 1: Default configuration

In [65]:
# Baseline decision tree: default hyperparameters, fixed seed for repeatable results.
dt_model_1 = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_pred_1 = dt_model_1.predict(X_test)

# Score the test-set predictions (F1 is averaged over classes, weighted by support).
dt_acc_1 = accuracy_score(y_true=y_test, y_pred=dt_pred_1)
dt_f1_1 = f1_score(y_true=y_test, y_pred=dt_pred_1, average='weighted')


### Model 2: Tuned configuration


In [66]:
# Tuned decision tree: depth capped at 5 to limit overfitting; same seed as the baseline.
dt_model_2 = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=2,
    random_state=42,
).fit(X_train, y_train)
dt_pred_2 = dt_model_2.predict(X_test)

# Score the test-set predictions (F1 is averaged over classes, weighted by support).
dt_acc_2 = accuracy_score(y_true=y_test, y_pred=dt_pred_2)
dt_f1_2 = f1_score(y_true=y_test, y_pred=dt_pred_2, average='weighted')


## SVM

### Model 1: Default configuration


In [67]:

# Baseline SVM: default kernel and regularisation, fixed seed.
svm_model_1 = SVC(random_state=42).fit(X_train, y_train)
svm_pred_1 = svm_model_1.predict(X_test)

# Score the test-set predictions (F1 is averaged over classes, weighted by support).
svm_acc_1 = accuracy_score(y_true=y_test, y_pred=svm_pred_1)
svm_f1_1 = f1_score(y_true=y_test, y_pred=svm_pred_1, average='weighted')

### Model 2: Tuned configuration

In [68]:
# Tuned SVM: cubic polynomial kernel instead of the default kernel, fixed seed.
svm_model_2 = SVC(
    kernel='poly',
    degree=3,
    random_state=42,
).fit(X_train, y_train)
svm_pred_2 = svm_model_2.predict(X_test)

# Score the test-set predictions (F1 is averaged over classes, weighted by support).
svm_acc_2 = accuracy_score(y_true=y_test, y_pred=svm_pred_2)
svm_f1_2 = f1_score(y_true=y_test, y_pred=svm_pred_2, average='weighted')

## Naive Bayes

In [69]:
# Gaussian Naive Bayes with default settings (single configuration only).
nb_model = GaussianNB().fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)

# Score the test-set predictions (F1 is averaged over classes, weighted by support).
nb_acc = accuracy_score(y_true=y_test, y_pred=nb_pred)
nb_f1 = f1_score(y_true=y_test, y_pred=nb_pred, average='weighted')

In [70]:
# Report the weighted F1-score of every model, one print call per model family.
# Each line is built with an explicit str() conversion (!s), matching how
# print() renders a value passed as a separate argument.
print(
    "Decision Tree:",
    f"Model 1 F1-Score: {dt_f1_1!s}",
    f"Model 2 F1-Score: {dt_f1_2!s}",
    "",
    sep="\n",
)

print(
    "SVM:",
    f"Model 1 F1-Score: {svm_f1_1!s}",
    f"Model 2 F1-Score: {svm_f1_2!s}",
    "",
    sep="\n",
)

print(
    "Naive Bayes:",
    f"F1-Score: {nb_f1!s}",
    sep="\n",
)

Decision Tree:
Model 1 F1-Score: 0.6384595388705425
Model 2 F1-Score: 0.7028240817292348

SVM:
Model 1 F1-Score: 0.1838527884331019
Model 2 F1-Score: 0.21912105134329307

Naive Bayes:
F1-Score: 0.21963166199365086


In [71]:
# Report the test-set accuracy of every model, one print call per model family.
# Each line is built with an explicit str() conversion (!s), matching how
# print() renders a value passed as a separate argument.
print(
    "Decision Tree:",
    f"Model 1 Accuracy: {dt_acc_1!s}",
    f"Model 2 Accuracy: {dt_acc_2!s}",
    "",
    sep="\n",
)

print(
    "SVM:",
    f"Model 1 Accuracy: {svm_acc_1!s}",
    f"Model 2 Accuracy: {svm_acc_2!s}",
    "",
    sep="\n",
)

print(
    "Naive Bayes:",
    f"Accuracy: {nb_acc!s}",
    sep="\n",
)

Decision Tree:
Model 1 Accuracy: 0.6263048016701461
Model 2 Accuracy: 0.7160751565762005

SVM:
Model 1 Accuracy: 0.33820459290187893
Model 2 Accuracy: 0.35490605427974947

Naive Bayes:
Accuracy: 0.2797494780793319


# Questions

1.	Why is Data Mining a misnomer? What is another preferred name?



Data Mining is considered a misnomer because the name suggests that data itself is being extracted, whereas the process actually discovers patterns and knowledge from data (just as gold mining extracts gold, not rock). Another preferred name is Knowledge Discovery in Databases (KDD).

2.	What is the general knowledge discovery process? What is the difference between a data engineer and a data scientist/AI engineer?


The general knowledge discovery process involves data selection, preprocessing, transformation, data mining, interpretation, and evaluation. A data engineer focuses on the design and maintenance of data pipelines and infrastructure, while a data scientist/AI engineer is responsible for developing algorithms and models to extract insights from data.

3.	In data mining, what is the difference between prediction and categorization?


In data mining, prediction involves forecasting numerical values, while categorization involves assigning data into predefined classes or categories.

4.	Why is data science/machine learning a bad idea in the context of information security?


Data science/machine learning can be a bad idea in information security if not implemented carefully, as it can introduce vulnerabilities if models are trained on sensitive data or manipulated by adversaries.


5.	What is the CIA principle, and how can we use it to assess the security/privacy aspects of AI systems/pipelines?




The CIA principle stands for Confidentiality, Integrity, and Availability. We can use it to assess the security/privacy aspects of AI systems/pipelines by checking that data is kept confidential (protected from unauthorized access), that the system maintains its integrity (protected from tampering or unauthorized modification), and that it remains available to legitimate users when needed.