In [1]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

In [2]:

# Define the path to your CSV file
current_directory= os.getcwd()
csv_file_path = os.path.join(current_directory, 'audio_features.csv')

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)


In [3]:
#creating variables
#X = df.drop(columns=['origin_sample','LABEL'])
#y = df['LABEL']

In [4]:
#label encoding target variable

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target variable
df['LABEL'] = label_encoder.fit_transform(df['LABEL'])

# Display the encoded target variable
print(len(df['LABEL']))

29965


In [5]:
df

Unnamed: 0,chroma_stft,rms,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,mfcc4,...,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,origin_sample,LABEL
0,0.323009,0.084976,2319.995862,2111.661159,4327.684437,0.120051,-259.21777,78.556450,-28.437014,2.343593,...,-9.411841,-11.343761,-10.596205,-17.319010,-13.092068,-5.539229,-7.363791,-1.501475,obama-to-margot,0
1,0.423175,0.003972,1698.546281,1767.665114,3107.143333,0.087891,-509.25980,125.234000,-40.211136,22.305662,...,-1.983171,-4.219697,-3.560827,-5.166744,-5.744504,-4.140846,-2.287184,-0.604480,obama-to-margot,0
2,0.432377,0.039095,1959.195873,1719.602240,2952.006392,0.129594,-454.82706,48.612410,-18.004953,-0.634527,...,-3.001686,-0.266156,2.898851,-3.423814,-5.140369,-0.408738,1.165877,1.589706,obama-to-margot,0
3,0.269118,0.117455,1548.276335,1621.984655,2581.292725,0.065829,-276.99222,94.852295,-12.124289,0.087424,...,-14.289590,-17.176582,-8.580988,-18.582733,-15.325979,-3.136350,-9.455655,-5.202446,obama-to-margot,0
4,0.196171,0.173183,1780.443851,1929.738480,3402.735485,0.075950,-204.80025,88.516510,-28.676355,20.349049,...,-10.107014,-22.268768,-8.958087,-20.369590,-13.021211,1.593888,-12.371036,-5.295657,obama-to-margot,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29960,0.280671,0.034478,2315.657034,2018.093173,4039.677845,0.138605,-327.29346,90.603910,-22.791170,23.153254,...,-12.792680,-2.988621,-7.036581,-14.832648,-4.897994,-0.409182,-14.722064,-5.194529,margot-original,1
29961,0.231114,0.027242,1917.601308,2051.525358,3667.006614,0.079889,-330.09192,91.263800,-12.174102,41.212467,...,-9.736492,-7.072990,-9.002355,-11.063793,-9.130094,5.505251,-13.212198,-5.920647,margot-original,1
29962,0.312383,0.024247,2268.438852,2056.983047,3829.973810,0.135276,-313.27980,104.575760,-3.543632,15.500204,...,-13.444408,-6.151810,-2.966465,-11.603537,-12.675177,2.122751,-12.581823,-6.497315,margot-original,1
29963,0.414316,0.023162,3179.381155,2485.592179,5684.276234,0.199596,-301.73703,69.163140,27.108448,18.234220,...,-9.148107,-2.640517,-7.639958,-14.637268,-10.325658,-0.970226,-13.365485,-7.056538,margot-original,1


In [6]:
df['LABEL'].value_counts()
#0- fake and 1-real

LABEL
0    26215
1     3750
Name: count, dtype: int64

In [7]:
#split the datainto train and test

# Split the data into training and testing sets (default split is 70% training, 30% testing)
df_train, df_test =  train_test_split(df, test_size=0.3, random_state=42, stratify=df['LABEL'])

In [8]:
df_train['LABEL'].value_counts()

LABEL
0    18350
1     2625
Name: count, dtype: int64

In [9]:
print(len(df_train))
print(len(df_test))

20975
8990


In [10]:

train_data = df_train
# Separate the majority (FAKE) and minority (REAL) classes
fake_class = train_data[train_data['LABEL'] == 0]
real_class = train_data[train_data['LABEL'] == 1]

# Undersample the majority class (FAKE) using random sampling
fake_class_undersampled = resample(fake_class, replace=False, n_samples=len(real_class), random_state=42)

# Combine the undersampled FAKE class with the original REAL class
undersampled_data = pd.concat([fake_class_undersampled, real_class])

# Separate features (X_undersampled) and target variable (y_undersampled)
X_undersampled = undersampled_data.drop('LABEL', axis=1)
y_undersampled = undersampled_data['LABEL']

# Display the count of each class after undersampling
print(y_undersampled.value_counts())


LABEL
0    2625
1    2625
Name: count, dtype: int64


y_undersampled.value_counts().plot(kind='bar', title='Count (target)');

In [11]:
X_undersampled = X_undersampled.drop(columns=['origin_sample'])
X_undersampled

Unnamed: 0,chroma_stft,rms,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,mfcc4,...,mfcc11,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20
1755,0.378776,0.009581,2439.394141,2261.605621,4637.224232,0.135898,-439.17920,90.368090,-9.830955,28.163680,...,-7.750748,-3.840950,1.890240,0.885252,-8.487972,2.289786,-7.791063,-1.626170,0.001303,-1.933922
13114,0.324630,0.085079,1515.986777,1628.864312,2611.145574,0.061213,-252.66322,127.877815,-41.027580,30.724010,...,-15.463946,-2.051590,2.705613,-2.253162,-8.021874,1.946832,-6.405579,-6.663895,-3.887709,-0.550035
18716,0.440891,0.026685,1729.927947,2200.410352,3252.981845,0.081099,-306.67550,132.440500,8.766188,-5.409053,...,-6.789303,1.338534,-5.413327,-3.642393,-8.270845,-5.779639,-8.232102,-1.112036,-5.611495,0.873516
18769,0.504256,0.027519,2493.476845,2363.071737,4900.271884,0.150957,-353.85060,88.782410,-22.743261,12.364288,...,-7.547710,2.792495,-3.185458,-0.496476,-5.606109,3.829643,-4.958244,-6.448803,-0.384502,3.312488
19098,0.244805,0.132763,3097.069916,2496.974711,5568.290572,0.160578,-201.07748,34.393276,17.091759,8.240293,...,-13.103765,-2.631081,-9.276155,-9.429006,-6.026335,-9.666129,-15.713352,-3.245495,-17.074547,-9.586217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27286,0.348612,0.083094,971.482327,977.426551,1822.247314,0.048451,-249.40819,196.119230,-46.346110,33.038635,...,-3.098921,10.212345,2.221896,1.650074,7.844559,-1.145859,-10.649522,-4.188498,4.455613,-3.820272
29603,0.484476,0.001540,4113.797828,3107.553945,8038.491544,0.270175,-460.18250,31.085356,4.354464,13.600013,...,-22.609324,-8.713697,-6.741081,-7.933905,-14.002439,-5.912973,-8.120257,-4.164864,-9.511443,-11.217751
27231,0.364256,0.088496,1121.817457,1077.728279,2071.592019,0.071145,-232.69554,192.689960,-42.874275,34.963886,...,3.738820,6.082582,-2.577122,4.670003,-7.230110,2.941084,2.398779,-7.682805,1.637422,-5.165213
26404,0.409651,0.044545,1626.137387,1587.010419,2992.136452,0.076483,-282.54288,148.937560,-13.158088,37.411660,...,-10.115785,-2.887369,1.673074,6.036693,0.210025,-2.319819,7.011754,7.468063,1.028172,2.407341


**model XGBoost with GRIDSEARCH**

In [12]:
X_test = df_test.drop(columns=['LABEL','origin_sample'])
y_test = df_test['LABEL']
#y_test

**#model evaluation using recall metric:**

In [13]:

# Create an XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Define the hyperparameter grid to search over
param_grid = {
    'n_estimators': range(200,400,50),
    'max_depth': range(4,7),
    'learning_rate': [0.2, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28],
    }

# Create the GridSearchCV object
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=5,  # Number of cross-validation folds
    scoring='recall',  # Use a suitable scoring metric for your problem
    verbose=1
)

# Fit the grid search to the data
grid_search.fit(X_undersampled,y_undersampled)

# Print the best hyperparameters found
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best Hyperparameters: {'learning_rate': 0.24, 'max_depth': 6, 'n_estimators': 200}
Test Accuracy: 0.9426028921023359


In [14]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Confusion Matrix
#conf_matrix = confusion_matrix(y_test, y_pred)

In [15]:
print("Test Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Test Accuracy: 0.9426028921023359
Precision: 0.7028647568287808
Recall: 0.9377777777777778
F1 Score: 0.8035034272658034


**MODEL XGBOOST WITH RANDOMSEARCH**


In [16]:


# Create an XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Define the hyperparameter distribution to search over
param_dist = {
    'n_estimators': randint(200, 400),  # Random integer values between 200 and 400
    'max_depth': randint(4, 7),          # Random integer values between 4 and 7
    'learning_rate': uniform(0.2, 0.3),  # Random float values between 0.2 and 0.3
}

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=10,  # Number of random combinations to try
    cv=5,       # Number of cross-validation folds
    scoring='recall',  # Use a suitable scoring metric for your problem
    verbose=1,
    random_state=42
)

# Fit the random search to the data
random_search.fit(X_undersampled, y_undersampled)

# Print the best hyperparameters found
print("Best Hyperparameters:", random_search.best_params_)

# Get the best model
best_model = random_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'learning_rate': 0.24286004537658223, 'max_depth': 6, 'n_estimators': 349}
Test Accuracy: 0.9454949944382648


In [17]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Test Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Test Accuracy: 0.9454949944382648
Precision: 0.7115256495669554
Recall: 0.9493333333333334
F1 Score: 0.8134044173648133


**#model evaluation using metric precision:**

In [18]:

#######################################################3GridSearch 

# Create an XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Define the hyperparameter grid to search over
param_grid = {
    'n_estimators': range(200,400,50),
    'max_depth': range(4,7),
    'learning_rate': [0.2, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28],
    }

# Create the GridSearchCV object
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=5,  # Number of cross-validation folds
    scoring='precision',  # Use a suitable scoring metric for your problem
    verbose=1
)

# Fit the grid search to the data
grid_search.fit(X_undersampled,y_undersampled)

# Print the best hyperparameters found
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Test Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best Hyperparameters: {'learning_rate': 0.25, 'max_depth': 6, 'n_estimators': 350}
Test Accuracy: 0.9429365962180201
Test Accuracy: 0.9429365962180201
Precision: 0.7034574468085106
Recall: 0.9404444444444444
F1 Score: 0.804868771395968


In [19]:
###################################random_search!!


# Create an XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Define the hyperparameter distribution to search over
param_dist = {
    'n_estimators': randint(200, 400),  # Random integer values between 200 and 400
    'max_depth': randint(4, 7),          # Random integer values between 4 and 7
    'learning_rate': uniform(0.2, 0.3),  # Random float values between 0.2 and 0.3
}

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=10,  # Number of random combinations to try
    cv=5,       # Number of cross-validation folds
    scoring='recall',  # Use a suitable scoring metric for your problem
    verbose=1,
    random_state=42
)

# Fit the random search to the data
random_search.fit(X_undersampled, y_undersampled)

# Print the best hyperparameters found
print("Best Hyperparameters:", random_search.best_params_)

# Get the best model
best_model = random_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Test Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'learning_rate': 0.24286004537658223, 'max_depth': 6, 'n_estimators': 349}
Test Accuracy: 0.9454949944382648
Test Accuracy: 0.9454949944382648
Precision: 0.7115256495669554
Recall: 0.9493333333333334
F1 Score: 0.8134044173648133


**MODEL TUNNING USING ACCURACY METRIC**


In [20]:
###learning rate 0.2, max_depth = 3, number estimators = 500

In [21]:
# Create an XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Define the hyperparameter grid to search over
param_grid = {
    'n_estimators': range(200,400,50),
    'max_depth': range(4,7),
    'learning_rate': [0.2, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28],
    }

# Create the GridSearchCV object
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=5,  # Number of cross-validation folds
    scoring='accuracy',  # Use a suitable scoring metric for your problem
    verbose=1
)

# Fit the grid search to the data
grid_search.fit(X_undersampled,y_undersampled)

# Print the best hyperparameters found
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Test Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best Hyperparameters: {'learning_rate': 0.25, 'max_depth': 6, 'n_estimators': 350}
Test Accuracy: 0.9429365962180201
Test Accuracy: 0.9429365962180201
Precision: 0.7034574468085106
Recall: 0.9404444444444444
F1 Score: 0.804868771395968


**Fitted parameters***

In [22]:

# Fixed hyperparameters
learning_rate =  0.24286004537658223
max_depth = 6
n_estimators = 349

# Create an XGBoost classifier with fixed hyperparameters
xgb_model_fixed = xgb.XGBClassifier(
    learning_rate=learning_rate,
    max_depth=max_depth,
    n_estimators=n_estimators
)

# Fit the model to the undersampled data
xgb_model.fit(X_undersampled, y_undersampled)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy with Fixed Hyperparameters:", accuracy)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Test Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

#'learning_rate': 0.24286004537658223, 'max_depth': 6, 'n_estimators': 349

Test Accuracy with Fixed Hyperparameters: 0.9416017797552837
Test Accuracy: 0.9416017797552837
Precision: 0.6984126984126984
Recall: 0.9386666666666666
F1 Score: 0.800910125142207


**DEMO AUDIOS**

In [23]:
#FAKE AUDIO
#CLEANING DATA
#trying the demonstration 
path = os.getcwd()
csv_file_path = os.path.join(current_directory, 'audio_features_demo.csv')
# Read the CSV file into a DataFrame
df_demo = pd.read_csv(csv_file_path)
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target variable
df_demo['LABEL'] = label_encoder.fit_transform(df_demo['LABEL'])

###########predicting linus to musk

# Display the encoded target variable
df_demo['LABEL'].value_counts()
X_demo = df_demo[df_demo['LABEL'] == 0].drop(columns=['LABEL', 'origin_sample'])
#y_demo = df_demo['LABEL']
#X_demo
##### XGBOOOST CODE

pd.DataFrame(xgb_model.predict(X=X_demo)).value_counts(normalize=True)


0    0.983333
1    0.016667
Name: proportion, dtype: float64

In [24]:
# Display the encoded target variable
#REAL AUDIO
df_demo['LABEL'].value_counts()
X_demo_2 = df_demo[df_demo['LABEL'] == 1].drop(columns=['LABEL', 'origin_sample'])
pd.DataFrame(xgb_model.predict(X=X_demo_2)).value_counts(normalize=True)

1    0.966667
0    0.033333
Name: proportion, dtype: float64

In [25]:
###new biden features creation and prediction ----------REAL AUDIO BIDEN

#trying the demonstration 
path = os.getcwd()
csv_file_path = os.path.join(current_directory, 'audio_features_biden.csv')
# Read the CSV file into a DataFrame
df_biden = pd.read_csv(csv_file_path)
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target variable
df_biden['LABEL'] = label_encoder.fit_transform(df_biden['LABEL'])

###########predicting linus to musk

# Display the encoded target variable
df_biden['LABEL'].value_counts()
X_biden = df_biden.drop(columns=['LABEL', 'origin_sample'])
#X_biden
#y_demo = df_demo['LABEL']
#X_demo
##### XGBOOOST CODE

pd.DataFrame(xgb_model.predict(X=X_biden)).value_counts(normalize=True)

1    0.913889
0    0.086111
Name: proportion, dtype: float64

In [26]:
###new biden features creation and prediction ----------FAKE AUDIO BIDEN

#trying the demonstration 
path = os.getcwd()
csv_file_path = os.path.join(current_directory, 'audio_features_biden_AI.csv')
# Read the CSV file into a DataFrame
df_biden_AI = pd.read_csv(csv_file_path)
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target variable
df_biden_AI['LABEL'] = label_encoder.fit_transform(df_biden_AI['LABEL'])

###########predicting linus to musk

# Display the encoded target variable
df_biden_AI['LABEL'].value_counts()
X_biden_AI = df_biden_AI.drop(columns=['LABEL', 'origin_sample'])
#X_biden
#y_demo = df_demo['LABEL']
#X_demo
##### XGBOOOST CODE

pd.DataFrame(xgb_model.predict(X=X_biden_AI)).value_counts(normalize=True)

0    0.8125
1    0.1875
Name: proportion, dtype: float64