# Model Building

In [None]:
# In the CSV file, add a column named "Target".
# The CSV dataset contains 40-50 images of each person in the image ID column.
# For the images of the two people who were used for the deepfake (person 1 - original video,
# person 2 - image used for the deepfake), set the target score to 1; give all the remaining
# images a target score of 0.
# Hence we will create models for binary classification.

# This step should be done before loading the final CSV file.


# You will get a final CSV file with the following columns:
#  Image Name 	SSIM Score 	MSE Error 	MS-SSIM Score 	Euclidean Distance 	Euclidean Distance.1 	Target


In [None]:
## Importing required libraries

import pandas as pd
import numpy as np
import seaborn as sns
import cv2 as cv
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
## Load the CSV of extracted similarity scores into a DataFrame.
## NOTE(review): the string below reads like a placeholder rather than a real path - confirm/replace before running.
data1 = pd.read_csv(filepath_or_buffer="load the csv dataset with extracted similarity scores")

In [None]:
## Preview the first five rows of the dataset

data1.head()

In [None]:
# (number of rows, number of columns) of the loaded dataset
data1.shape

In [None]:
## Count the missing (NaN) values in each column

data1.isna().sum()

In [None]:
## Show the rows that repeat an earlier row exactly (empty frame means no duplicates)

data1.loc[data1.duplicated()]

In [None]:
# Remove the 'MS-SSIM Score' column. Rebinding the name (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run after the column is already gone.
data1 = data1.drop(columns='MS-SSIM Score', errors='ignore')

In [None]:
# Remove the 'Image Name' column. Rebinding the name (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run after the column is already gone.
data1 = data1.drop(columns='Image Name', errors='ignore')

In [None]:
## Getting basic information about the dataset (column names, non-null counts, dtypes)

data1.info()

In [None]:
## Checking whether the data is balanced or not: number of rows per Target class

data1.loc[:, 'Target'].value_counts()

In [None]:
# The dataset is imbalanced.
## Separate the label (dependent variable) from the features (independent variables)
y = data1["Target"]
X = data1.drop("Target", axis=1)

In [None]:
# Display the Target column that was separated from the features
y

In [None]:
## Since the data is highly imbalanced, SMOTE oversampling is used to balance the classes

from imblearn.over_sampling import SMOTE

# A fixed random_state makes the synthetic samples (and every metric computed downstream)
# reproducible across kernel restarts; 42 matches the seed used by the other cells.
# NOTE(review): the resampling happens before the train/test split performed later in this
# notebook, so synthetic points interpolated from rows that end up in the test set can land
# in the training set (and vice versa). Consider splitting first and resampling only the
# training portion.
smote = SMOTE(random_state=42)
X,y = smote.fit_resample(X,y)


In [None]:
# Display the target values returned by the SMOTE resampling step
y


In [None]:
# Number of rows per class after resampling
print(pd.Series(y).value_counts())

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between the columns of data1
correlation_matrix = data1.corr()

# One explicit figure/axes pair so the heatmap and its title target the same axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap')

# Render the figure
plt.show()


In [None]:
# Columns plotted below: SSIM Score, MSE Error, Euclidean Distance, Euclidean Distance.1


def _scatter_pair(x_col, y_col):
    """Scatter-plot column `x_col` of data1 against column `y_col`."""
    plt.scatter(data1[x_col], data1[y_col])
    plt.title(f'Scatter Plot: {x_col} vs {y_col}')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()


def _histogram(col):
    """Draw a 10-bin frequency histogram of column `col` of data1."""
    plt.hist(data1[col], bins=10, color='blue', edgecolor='black')
    plt.title(f'Histogram: {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()


# Relationship between the two image-similarity measures
_scatter_pair('SSIM Score', 'MSE Error')

# Distribution of the first distance feature
_histogram('Euclidean Distance')

# Relationship between the two distance features
_scatter_pair('Euclidean Distance', 'Euclidean Distance.1')

# Distributions of the remaining features
_histogram('SSIM Score')
_histogram('MSE Error')
_histogram('Euclidean Distance.1')




In [None]:
## Hold out 25% of the rows as a test set; the fixed seed keeps the split repeatable

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Fit a random-forest regressor on the training split in order to rank the features
# (a DecisionTreeRegressor would give the single-tree equivalent)
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Importance score of each feature, in X_train column order
feature_importances = model.feature_importances_

# Pair every feature name with its score, most important first
importances_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart of the ranked importances
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.barh(importances_df['Feature'], importances_df['Importance'])
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

In [None]:
## Three scaled copies of the features are prepared. Each scaler is fitted on the
## training split only and then applied to both the training and the test split.

from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

## Min-max scaling - chosen on the assumption that the features are not normally
## distributed and each column stays within a limited range (not verified here).
Min_Max_Scaler = MinMaxScaler().fit(X_train)
X_train_run_Min_Max_Scaler = Min_Max_Scaler.transform(X_train)
X_test_run_Min_Max_Scaler = Min_Max_Scaler.transform(X_test)

## Standard scaling (zero mean, unit variance per column)
Standard_Scaler = StandardScaler().fit(X_train)
X_train_run_Standard_Scaler = Standard_Scaler.transform(X_train)
X_test_run_Standard_Scaler = Standard_Scaler.transform(X_test)

## Robust scaling - chosen on the assumption that the data contains many outliers,
## which this scaler is less sensitive to (not verified here).
Robust_Scaler = RobustScaler().fit(X_train)
X_train_run_Robust_Scaler = Robust_Scaler.transform(X_train)
X_test_run_Robust_Scaler = Robust_Scaler.transform(X_test)

In [None]:
## Creating a common function to calculate the metrics after prediction
from sklearn.metrics import accuracy_score
def metrics(y_actual,y_predicted):
    """Return the accuracy of `y_predicted` measured against `y_actual`."""
    return accuracy_score(y_actual, y_predicted)

In [None]:
# Map the name of each scaling variant to its [train, test] feature pair
X_data = dict(
    X_without_scaling=[X_train, X_test],
    X_Min_Max_Scaler=[X_train_run_Min_Max_Scaler, X_test_run_Min_Max_Scaler],
    X_Standard_Scaler=[X_train_run_Standard_Scaler, X_test_run_Standard_Scaler],
    X_Robust_Scaler=[X_train_run_Robust_Scaler, X_test_run_Robust_Scaler],
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
def Decision_Tree_Classifier():
    """Train and evaluate a decision tree on every feature variant in `X_data`.

    For each entry of the notebook-level `X_data` dict (variant name ->
    [train features, test features]) a decision tree is fitted against
    `y_train` and used to predict the test features. Accuracy, a
    confusion-matrix plot, precision, recall, F1-score and AUC-ROC against
    `y_test` are then printed/shown.

    Returns:
        None. All results are printed or plotted.
    """
    for key,value in X_data.items():
        features_train, features_test = value
        # A fresh, seeded estimator per variant keeps the runs independent and repeatable.
        # The local name is deliberately different from the enclosing function's name.
        classifier = DecisionTreeClassifier(random_state=42)
        print('Decision Tree Model')
        print(f'Working with: {key}')
        print("--------------------------------------------------------------------------------------------------------------------------------------------------")
        classifier.fit(features_train,y_train)
        print(f'Done with fitting the data using: {key}')
        print(f'Prediction started with: {key}')
        y_predicted = classifier.predict(features_test)
        print(f'Prediction completed with: {key}')
        print(f"Calculating metrics for {key}")
        accuracy1 = metrics(y_test,y_predicted)
        print(f'accuracy: {accuracy1}')

        # Create a confusion matrix; passing the fitted class labels explicitly keeps the
        # matrix rows/columns aligned with the labels shown on the plot
        class_labels = classifier.classes_
        cm = confusion_matrix(y_test, y_predicted, labels=class_labels)

        # Plot the confusion matrix
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
        disp.plot(cmap='Reds_r', values_format='d')
        plt.title('Confusion Matrix - Decision Tree')
        plt.show()
        print("Precision:", precision_score(y_test, y_predicted))
        print("Recall:", recall_score(y_test, y_predicted))
        print("F1-score:", f1_score(y_test, y_predicted))
        # Score probabilities on the SAME feature variant the model was fitted on;
        # using the unscaled X_test here would feed a scaled model unscaled inputs.
        positive_proba = classifier.predict_proba(features_test)[:, 1]
        print("AUC-ROC:", roc_auc_score(y_test, positive_proba))

        print("==================================================================================================================================================")



In [None]:
# Run the decision-tree evaluation over every feature variant in X_data
Decision_Tree_Classifier()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc

import numpy as np

def Random_Forest_Classifier():
    """Train and evaluate a random forest on every feature variant in `X_data`.

    For each entry of the notebook-level `X_data` dict (variant name ->
    [train features, test features]) a random forest is fitted against
    `y_train` and used to predict the test features. Accuracy, a
    confusion-matrix plot, precision, recall, F1-score and AUC-ROC against
    `y_test` are then printed/shown.

    Returns:
        None. All results are printed or plotted.
    """
    for key,value in X_data.items():
        features_train, features_test = value
        # A fresh, seeded estimator per variant keeps the runs independent and repeatable.
        # The local name is deliberately different from the enclosing function's name.
        classifier = RandomForestClassifier(random_state=42)
        print('Random Forest Model')
        print(f'Working with: {key}')
        print("--------------------------------------------------------------------------------------------------------------------------------------------------")
        classifier.fit(features_train,y_train)
        print(f'Done with fitting the data using: {key}')
        print(f'Prediction started with: {key}')
        y_predicted = classifier.predict(features_test)
        print(f'Prediction completed with: {key}')
        print(f"Calculating metrics for {key}")
        accuracy2 = metrics(y_test,y_predicted)
        print(f'accuracy: {accuracy2}')


        # Create a confusion matrix; passing the fitted class labels explicitly keeps the
        # matrix rows/columns aligned with the labels shown on the plot
        class_labels = classifier.classes_
        cm = confusion_matrix(y_test, y_predicted, labels=class_labels)

        # Plot the confusion matrix
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
        disp.plot(cmap='Blues', values_format='d')
        plt.title('Confusion Matrix - Random Forest')
        plt.show()
        print("Precision:", precision_score(y_test, y_predicted))
        print("Recall:", recall_score(y_test, y_predicted))
        print("F1-score:", f1_score(y_test, y_predicted))
        # Score probabilities on the SAME feature variant the model was fitted on;
        # using the unscaled X_test here would feed a scaled model unscaled inputs.
        positive_proba = classifier.predict_proba(features_test)[:, 1]
        print("AUC-ROC:", roc_auc_score(y_test, positive_proba))

        print("==================================================================================================================================================")


In [None]:
# Run the random-forest evaluation over every feature variant in X_data
Random_Forest_Classifier()