In [14]:
import os
import time
from functools import partial
from multiprocessing import Pool

import joblib
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from scipy.stats import loguniform, randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, auc
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier


###  Estimators

#### XGBOOST

In [4]:
def get_train_test(complete_df, random_state=None):
    """Split ``complete_df`` into a class-balanced training set and a stratified test set.

    The frame is split 70/30, stratified on the ``'LABEL'`` column. Only the
    training part is then balanced, by randomly under-sampling every class down
    to the size of the rarest class; the test part keeps its original class
    distribution.

    Parameters
    ----------
    complete_df : pandas.DataFrame
        Feature table containing a ``'LABEL'`` column; every other column is
        used as a feature.
    random_state : int, numpy.random.RandomState or None, default None
        Seed forwarded to both the train/test split and the under-sampling.
        ``None`` keeps the previous non-deterministic behaviour.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature columns of the balanced training rows and of the test rows.
    y_train, y_test : numpy.ndarray
        Integer-encoded labels (``LabelEncoder`` fitted on the training labels).
    """
    label_col = 'LABEL'

    df_train, df_test = train_test_split(
        complete_df,
        test_size=0.3,
        stratify=complete_df[label_col],
        random_state=random_state,
    )

    # Under-sample each class to the minority size. Iterating over the classes
    # (rather than looking up "the" majority and "the" minority label) also
    # works when two classes are tied, where both lookups used to resolve to the
    # same label and the other class was silently dropped.
    minority_size = int(df_train[label_col].value_counts().min())
    balanced_parts = []
    for _, class_rows in df_train.groupby(label_col, sort=False):
        if len(class_rows) > minority_size:
            class_rows = class_rows.sample(minority_size, random_state=random_state)
        balanced_parts.append(class_rows)
    balanced_df = pd.concat(balanced_parts, axis=0)

    # Select features by name so the label can never leak into X, regardless of
    # where the 'LABEL' column sits in the frame.
    feature_cols = [col for col in balanced_df.columns if col != label_col]
    X_train = balanced_df[feature_cols]
    X_test = df_test[feature_cols]

    le = LabelEncoder()
    y_train = le.fit_transform(balanced_df[label_col])
    y_test = le.transform(df_test[label_col])
    return X_train, X_test, y_train, y_test



In [22]:
def test_model(svm_XGB_best2, data_dir='Data'):
    """Print the accuracy of a fitted classifier on each additional voice recording.

    Every CSV holds the feature rows of a single recording that is entirely
    real (expected class 1) or entirely fake (expected class 0), so the printed
    accuracy is the fraction of rows assigned to the expected class.

    Parameters
    ----------
    svm_XGB_best2 : fitted estimator
        Any object exposing ``predict(X)``.
    data_dir : str, default 'Data'
        Directory containing the evaluation CSV files.
    """
    # (printed label, CSV file name, expected class, number of trailing columns to drop)
    cases = [
        ('Real Taylor : ', 'extracted_segment_10_extracted_segment_Taylor Swift Talks Record-Breaking Midnights Album, Music Video Cameos and Easter Eggs.csv', 1, 0),
        ('Fake Biden : ', 'audio_features_biden_AI.csv', 0, 2),
        ('Real Biden : ', 'audio_features_biden.csv', 1, 2),
        ('Real Elise : ', 'Elise.csv', 1, 0),
        ('Real Youss_1 : ', 'Youssef.csv', 1, 0),
        ('Real Youss_2 : ', 'Youssef_2.csv', 1, 0),
        ('Fake Youss_1 : ', 'Youssef_Eric Cartman.csv', 0, 0),
        ('Fake Youss_2 : ', 'Youssef_2_Female.csv', 0, 0),
        ('Fake Youss_3 : ', 'Youssef_2_Male Reggaeton.csv', 0, 0),
        ('Fake Morgan : ', 'Morgan Freeman_fake.csv', 0, 0),
        ('Real Morgan : ', 'Morgan Freeman_real.csv', 1, 0),
        ('Fake Andrea : ', 'Andrea_to_Youss.csv', 0, 0),
        ('Real Andrea : ', 'Andrea.csv', 1, 0),
    ]

    # Read every file up front so a missing CSV fails before anything is printed.
    frames = [pd.read_csv(os.path.join(data_dir, file_name)) for _, file_name, _, _ in cases]

    for (label, _, expected_class, n_drop), frame in zip(cases, frames):
        # The Biden files carry two extra trailing columns that are not features;
        # each frame drops its own trailing columns rather than borrowing another
        # frame's column list.
        features = frame.iloc[:, :-n_drop] if n_drop else frame
        pred = svm_XGB_best2.predict(features)
        # Build the ground truth to the prediction length instead of slicing a
        # fixed 1000-element array, which broke for files longer than 1000 rows.
        y_true = np.full(len(pred), expected_class)
        print(label, accuracy_score(y_true, pred))

    

In [5]:
# Load the full feature table; get_train_test reads its 'LABEL' column as the target.
complete_df = pd.read_csv('Data/New_features.csv')

# Stratified 70/30 split; only the training part is under-sampled to balance the classes.
X_train, X_test, y_train, y_test = get_train_test(complete_df)


In [6]:
# Sanity check: number of samples in the held-out test split.
y_test.shape

(8981,)

In [9]:
# Baseline XGBoost classifier with hand-picked hyper-parameters
model_xgb = XGBClassifier(n_estimators=300, max_depth=10, learning_rate=0.1)

# Per-fold accuracy from 10-fold cross-validation on the training data
score_accuracy = cross_val_score(
    estimator=model_xgb,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
score_accuracy

array([0.9351145 , 0.9351145 , 0.94465649, 0.95419847, 0.94274809,
       0.96374046, 0.94274809, 0.9389313 , 0.93690249, 0.93690249])

In [None]:
# Randomized hyper-parameter search: 100 candidates, 5-fold CV, ranked by precision.
# `randint` and `loguniform` are the scipy.stats distributions; randint's upper
# bound is exclusive, so n_estimators is drawn from 1..399 and max_depth from 1..12.
grid = {'n_estimators': randint(1, 400),
        'max_depth': randint(1, 13),
        'learning_rate': loguniform(0.001, 0.5),
       }
bst = XGBClassifier()
# Instantiate the randomized search; a fixed random_state makes the sampled
# candidates (and therefore the selected estimator) reproducible across runs.
search = RandomizedSearchCV(
    bst,
    grid,
    scoring='precision',
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

search.fit(X_train, y_train)
search.best_estimator_

In [13]:
# Exhaustive grid search over a narrowed hyper-parameter region,
# ranked by precision with 5-fold cross-validation.
grid = {
    'n_estimators': [200, 209, 300, 350, 400],
    'max_depth': range(5, 8),  # depths 5, 6 and 7
    'learning_rate': [0.015363601183606038, 0.02, 0.2],
}
bst = XGBClassifier()
search = GridSearchCV(
    estimator=bst,
    param_grid=grid,
    scoring='precision',
    cv=5,
    n_jobs=-1,
)
search.fit(X_train, y_train)
search.best_estimator_


In [16]:
# Keep the best estimator found by the fitted search object.
# Despite the `svm_` prefix, the searched estimator is an XGBClassifier.
svm_XGB_best = search.best_estimator_

# Per-fold accuracy from 10-fold cross-validation on the training data
cross_val_score(
    estimator=svm_XGB_best,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

array([0.9389313 , 0.9370229 , 0.94465649, 0.96183206, 0.94274809,
       0.95610687, 0.94083969, 0.92175573, 0.94455067, 0.93881453])

In [17]:
# Per-fold precision from 10-fold cross-validation on the training data
cross_val_score(
    estimator=svm_XGB_best,
    X=X_train,
    y=y_train,
    scoring='precision',
    cv=10,
    n_jobs=-1,
)

array([0.93560606, 0.92250923, 0.9330855 , 0.95833333, 0.92335766,
       0.96138996, 0.92619926, 0.92664093, 0.93939394, 0.94230769])

In [18]:
# Refit the selected model on the whole training set (fit returns the estimator
# itself), then predict the held-out test split.
y_pred_best = svm_XGB_best.fit(X_train, y_train).predict(X_test)

y_pred_best

array([0, 1, 0, ..., 0, 0, 0])

In [19]:
# Accuracy on the held-out test split
accuracy_score(y_true=y_test, y_pred=y_pred_best)

0.9459971049994432

In [20]:
# Precision on the held-out test split. This split keeps the original class
# distribution (only the training split is under-sampled in get_train_test), so
# the value is not directly comparable to the cross-validated precision measured
# on the balanced training data.
precision_score(y_true=y_test, y_pred=y_pred_best)

0.7132352941176471

### Testing additional voices

In [24]:
# Evaluate the tuned classifier on the additional voice recordings; each printed
# value is the accuracy against the single expected class of that recording.
test_model(svm_XGB_best)

Real Taylor :  0.3
Fake Biden :  0.775
Real Biden :  0.8305555555555556
Real Elise :  0.08695652173913043
Real Youss_1 :  0.0
Real Youss_2 :  0.9833333333333333
Fake Youss_1 :  1.0
Fake Youss_2 :  1.0
Fake Youss_3 :  0.016666666666666666
Fake Morgan :  0.7142857142857143
Real Morgan :  0.5172413793103449
Fake Andrea :  1.0
Real Andrea :  0.0
