In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the wine fraud data from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="wine_fraud.csv")

In [33]:
# One-hot encode the categorical 'type' column. drop_first=True drops the
# first category level, so the two wine types collapse into a single
# indicator column.
df = pd.get_dummies(
    data=df,
    columns=['type'],
    drop_first=True,
)

In [34]:
# Encode the label first: 'Legit' -> 0 and 'Fraud' -> 1.
df['quality_encoded'] = df['quality'].map({'Legit': 0, 'Fraud': 1})

# Any label outside the mapping becomes NaN after .map(); fail here with a
# clear message instead of letting NaN reach the resampler or the estimators.
if df['quality_encoded'].isna().any():
    unexpected = sorted(
        df.loc[df['quality_encoded'].isna(), 'quality'].astype(str).unique()
    )
    raise ValueError(f"Unexpected values in 'quality': {unexpected}")

# Target vector used by every model below.
y = df['quality_encoded']

# Feature matrix: every column except the raw label and its encoded copy.
# 'quality_encoded' is dropped explicitly so the target can never end up in X,
# no matter how many times (or in which order) this cell is executed.
X = df.drop(columns=['quality', 'quality_encoded'])

In [35]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a test set; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=101,
)

In [36]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type_white
1395,8.6,0.685,0.1,1.6,0.092,3.0,12.0,0.99745,3.31,0.65,9.55,0
4393,6.8,0.35,0.53,10.1,0.053,37.0,151.0,0.9963,3.07,0.4,9.4,1
1575,7.5,0.52,0.4,2.2,0.06,12.0,20.0,0.99474,3.26,0.64,11.8,0
603,13.2,0.46,0.52,2.2,0.071,12.0,35.0,1.0006,3.1,0.56,9.0,0
1146,7.8,0.5,0.12,1.8,0.178,6.0,21.0,0.996,3.28,0.87,9.8,0


In [37]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, so no test-set information
# is used when the scaling parameters are learned.
scaler = StandardScaler().fit(X_train)

# Apply the fitted scaler to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [38]:
# pickle is imported in this cell because this is its first use in the
# notebook; without it the cell raises NameError on a fresh kernel
# (Restart & Run All), since the next `import pickle` only appears in a
# later cell.
import pickle

# Persist the fitted scaler to 'wine_scaler.pkl' in the working directory.
with open('wine_scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [39]:
from imblearn.over_sampling import SMOTE

# Balance the training data by oversampling with SMOTE. Only the (scaled)
# training split is resampled; the test split is left as-is.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_scaled,
    y_train,
)


In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pickle

# Fit a logistic regression on the SMOTE-resampled training data.
logreg = LogisticRegression(class_weight='balanced', max_iter=1000).fit(
    X_train_resampled, y_train_resampled
)

# Score the model on the scaled (not resampled) test split.
y_pred_logreg = logreg.predict(X_test_scaled)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
report_logreg = classification_report(y_test, y_pred_logreg)

print(accuracy_logreg, report_logreg, sep="\n")

# Persist the fitted model together with its test accuracy.
data_to_save = dict(model=logreg, accuracy=accuracy_logreg)
with open("logreg_wine_model.pkl", 'wb') as file:
    pickle.dump(data_to_save, file)

0.7323076923076923
              precision    recall  f1-score   support

           0       0.98      0.74      0.84       623
           1       0.09      0.63      0.16        27

    accuracy                           0.73       650
   macro avg       0.54      0.68      0.50       650
weighted avg       0.94      0.73      0.81       650



In [41]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-nearest-neighbours classifier on the SMOTE-resampled training data.
knn = KNeighborsClassifier(n_neighbors=3).fit(
    X_train_resampled, y_train_resampled
)

# Score the model on the scaled (not resampled) test split.
y_pred_knn = knn.predict(X_test_scaled)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
report_knn = classification_report(y_test, y_pred_knn)

print(accuracy_knn, report_knn, sep="\n")

# Persist the fitted model together with its test accuracy.
data_to_save = dict(model=knn, accuracy=accuracy_knn)
with open("knn_wine_model.pkl", 'wb') as file:
    pickle.dump(data_to_save, file)

0.8876923076923077
              precision    recall  f1-score   support

           0       0.97      0.91      0.94       623
           1       0.17      0.44      0.25        27

    accuracy                           0.89       650
   macro avg       0.57      0.68      0.59       650
weighted avg       0.94      0.89      0.91       650



In [42]:
from sklearn.svm import SVC
import pickle

# Initialize and train a linear-kernel SVM on the SMOTE-resampled training data.
# probability=True makes SVC fit an additional cross-validated probability
# model whose data shuffling is random; random_state pins that step so the
# pickled model's predict_proba output is reproducible between runs, in line
# with the seeded estimators elsewhere in this notebook.
svm = SVC(
    class_weight='balanced',
    kernel='linear',
    probability=True,
    random_state=42,
)
svm.fit(X_train_resampled, y_train_resampled)

# Predict on the scaled (not resampled) test set
y_pred_svm = svm.predict(X_test_scaled)

# Evaluate the model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
report_svm = classification_report(y_test, y_pred_svm)

print(accuracy_svm)
print(report_svm)

# Save the trained model and its test accuracy as a pickle file
data_to_save = {
    'model': svm,
    'accuracy': accuracy_svm
}

with open("svm_wine_model.pkl", 'wb') as file:
    pickle.dump(data_to_save, file)

0.76
              precision    recall  f1-score   support

           0       0.98      0.77      0.86       623
           1       0.10      0.63      0.18        27

    accuracy                           0.76       650
   macro avg       0.54      0.70      0.52       650
weighted avg       0.94      0.76      0.83       650



In [43]:
from sklearn.tree import DecisionTreeClassifier

# Fit a seeded decision tree on the SMOTE-resampled training data.
dtree = DecisionTreeClassifier(random_state=42).fit(
    X_train_resampled, y_train_resampled
)

# Score the model on the scaled (not resampled) test split.
y_pred_dtree = dtree.predict(X_test_scaled)
accuracy_dtree = accuracy_score(y_test, y_pred_dtree)
report_dtree = classification_report(y_test, y_pred_dtree)

print(accuracy_dtree, report_dtree, sep="\n")

# Persist the fitted model together with its test accuracy.
data_to_save = dict(model=dtree, accuracy=accuracy_dtree)
with open("dtree_wine_model.pkl", 'wb') as file:
    pickle.dump(data_to_save, file)


0.9076923076923077
              precision    recall  f1-score   support

           0       0.97      0.93      0.95       623
           1       0.19      0.37      0.25        27

    accuracy                           0.91       650
   macro avg       0.58      0.65      0.60       650
weighted avg       0.94      0.91      0.92       650



In [44]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded 100-tree random forest on the SMOTE-resampled training data.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_resampled, y_train_resampled
)

# Score the model on the scaled (not resampled) test split.
y_pred_rf = rf.predict(X_test_scaled)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)

print(accuracy_rf, report_rf, sep="\n")

# Persist the fitted model together with its test accuracy.
data_to_save = dict(model=rf, accuracy=accuracy_rf)
with open("rf_wine_model.pkl", 'wb') as file:
    pickle.dump(data_to_save, file)


0.9646153846153847
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       623
           1       0.61      0.41      0.49        27

    accuracy                           0.96       650
   macro avg       0.79      0.70      0.74       650
weighted avg       0.96      0.96      0.96       650



In [45]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a seeded gradient boosting classifier (100 estimators) on the
# SMOTE-resampled training data.
gbm = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(
    X_train_resampled, y_train_resampled
)

# Score the model on the scaled (not resampled) test split.
y_pred_gbm = gbm.predict(X_test_scaled)
accuracy_gbm = accuracy_score(y_test, y_pred_gbm)
report_gbm = classification_report(y_test, y_pred_gbm)

print(accuracy_gbm, report_gbm, sep="\n")

# Persist the fitted model together with its test accuracy.
data_to_save = dict(model=gbm, accuracy=accuracy_gbm)
with open("gbm_wine_model.pkl", 'wb') as file:
    pickle.dump(data_to_save, file)


0.8553846153846154
              precision    recall  f1-score   support

           0       0.98      0.86      0.92       623
           1       0.17      0.67      0.28        27

    accuracy                           0.86       650
   macro avg       0.58      0.77      0.60       650
weighted avg       0.95      0.86      0.89       650



In [46]:
from sklearn.ensemble import AdaBoostClassifier

# Fit a seeded AdaBoost classifier (100 estimators) on the SMOTE-resampled
# training data.
adaboost = AdaBoostClassifier(n_estimators=100, random_state=42).fit(
    X_train_resampled, y_train_resampled
)

# Score the model on the scaled (not resampled) test split.
y_pred_adaboost = adaboost.predict(X_test_scaled)
accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)
report_adaboost = classification_report(y_test, y_pred_adaboost)

print(accuracy_adaboost, report_adaboost, sep="\n")

# Persist the fitted model together with its test accuracy.
data_to_save = dict(model=adaboost, accuracy=accuracy_adaboost)
with open("adaboost_wine_model.pkl", 'wb') as file:
    pickle.dump(data_to_save, file)


0.8307692307692308
              precision    recall  f1-score   support

           0       0.98      0.84      0.90       623
           1       0.15      0.63      0.24        27

    accuracy                           0.83       650
   macro avg       0.56      0.73      0.57       650
weighted avg       0.95      0.83      0.88       650

