In [1]:
import os
import pandas as pd

# Location of the raw CSV. Defaults to the original author's local path, but
# can be overridden with the MINOR_PROJECT_CSV environment variable so the
# notebook runs on other machines without editing this cell.
DATA_CSV = os.environ.get(
    'MINOR_PROJECT_CSV',
    'C:/Users/shriv/Downloads/Minor_Project/to_be_cleaned.csv',
)

# Load the whole file into a DataFrame; later cells read the 'Label' column
# from it as the prediction target.
df = pd.read_csv(DATA_CSV)

In [2]:
from sklearn.model_selection import train_test_split

# Separate the target from the inputs: the 'Label' column holds the labels
# (real/fake), every other column of df is treated as a feature.
y = df['Label']  # Labels
X = df.drop(columns=['Label'])  # Features

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each split
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")


Training samples: 1002
Testing samples: 251


In [6]:
# Fill missing labels with the most frequent label. The result is assigned
# back to the column rather than calling fillna(inplace=True) on a Series
# pulled out of df: the in-place form on an extracted column is a chained
# mutation that does not reach df when pandas copy-on-write is active.
# NOTE(review): imputing the *target* invents labels for unlabeled rows;
# dropping those rows before splitting may be the better choice -- confirm.
mode_value = df['Label'].mode()[0]
df['Label'] = df['Label'].fillna(mode_value)

# Re-define the features and target from the updated frame
X = df.drop(columns=['Label'])
y = df['Label']

# X_train / y_train were split off before the fill above, so y_train can still
# contain NaN. Build the mask once and keep only the labeled training rows.
train_has_label = y_train.notna()
X_train_cleaned = X_train[train_has_label]
y_train_cleaned = y_train[train_has_label]

# No model is fitted here: rf_model is first created in the Random Forest cell
# further down, so calling rf_model.fit(...) at this point raised NameError on
# a fresh top-to-bottom run. That later cell fits on the NaN-filtered
# training split.

In [4]:
# Number of missing entries left in the label Series y
print(y.isna().sum())

0


In [4]:
# Remove rows whose label is NaN. The mask is computed once, before either
# object is reassigned, so features and labels stay aligned.
train_labeled = y_train.notna()
X_train = X_train[train_labeled]
y_train = y_train[train_labeled]

# Apply the same rule to the held-out split: a NaN left in y_test would make
# the metric calls in the evaluation cells raise. This is a no-op when every
# test row already has a label.
test_labeled = y_test.notna()
X_test = X_test[test_labeled]
y_test = y_test[test_labeled]


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline Random Forest with a fixed seed. fit() returns the estimator
# itself, so construction and training are chained into one statement.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out split
y_pred_rf = rf_model.predict(X_test)

# Share of test labels predicted correctly, printed as a percentage
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")


Random Forest Accuracy: 89.64%


In [7]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Candidate classifiers to compare. The stochastic estimators get a fixed seed
# (the same 42 used for the train/test split) so the comparison table is
# reproducible from run to run; SVC needs it because probability=True adds an
# internal randomized calibration step.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(probability=True, random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
}

# Per-model metrics, keyed by model name
results = {}

for name, model in models.items():
    # 5-fold cross-validated F1, computed on the training split only
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')

    # Refit on the whole training split and predict the held-out split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Second predict_proba column = score for the class listed second in
    # model.classes_; roc_auc_score needs scores rather than hard labels.
    y_score = model.predict_proba(X_test)[:, 1]

    # Held-out metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_score)

    # One row of the summary table, built in a single step
    results[name] = {
        'cross_val_mean_f1': np.mean(cv_scores),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'roc_auc': roc_auc,
    }

    print(f"Model: {name}")
    print(f"Cross-validated F1 Score: {results[name]['cross_val_mean_f1']:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC-ROC: {roc_auc:.4f}")
    print("-" * 50)

# Summarize the results as a table: one row per model, one column per metric
import pandas as pd

results_df = pd.DataFrame(results).T
print(results_df)



Model: Random Forest
Cross-validated F1 Score: 0.7886
Accuracy: 0.9203
Precision: 0.8000
Recall: 0.8000
F1 Score: 0.8000
AUC-ROC: 0.9473
--------------------------------------------------
Model: Logistic Regression
Cross-validated F1 Score: 0.8256
Accuracy: 0.9044
Precision: 0.7500
Recall: 0.7800
F1 Score: 0.7647
AUC-ROC: 0.9380
--------------------------------------------------
Model: SVM
Cross-validated F1 Score: 0.5573
Accuracy: 0.8606
Precision: 0.8000
Recall: 0.4000
F1 Score: 0.5333
AUC-ROC: 0.8906
--------------------------------------------------
Model: XGBoost
Cross-validated F1 Score: 0.7654
Accuracy: 0.9044
Precision: 0.7500
Recall: 0.7800
F1 Score: 0.7647
AUC-ROC: 0.9291
--------------------------------------------------
                     cross_val_mean_f1  accuracy  precision  recall  f1_score  \
Random Forest                 0.788592  0.920319       0.80    0.80  0.800000   
Logistic Regression           0.825555  0.904382       0.75    0.78  0.764706   
SVM            

In [8]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each Random Forest hyperparameter. max_features uses
# 'sqrt' / 'log2' / None (these replace the former 'auto' option).
rf_param_dist = dict(
    n_estimators=[100, 200, 300],
    max_features=['sqrt', 'log2', None],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Randomized search: 100 sampled combinations, each scored with 5-fold CV on
# all available cores; random_state fixes which combinations are drawn.
rf_random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=rf_param_dist,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Run the search on the training split
rf_random_search.fit(X_train, y_train)

# Winning configuration and its mean cross-validated score
print("Best Parameters for Random Forest (Randomized Search):", rf_random_search.best_params_)
print("Best Score for Random Forest (Randomized Search):", rf_random_search.best_score_)

# Predict the held-out split with the best estimator found by the search
rf_random_best_model = rf_random_search.best_estimator_
y_pred_rf_random = rf_random_best_model.predict(X_test)

# Held-out accuracy plus the per-class precision / recall / F1 report
print("Random Forest Accuracy (Randomized Search):", accuracy_score(y_test, y_pred_rf_random))
print(classification_report(y_test, y_pred_rf_random))


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters for Random Forest (Randomized Search): {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': True}
Best Score for Random Forest (Randomized Search): 0.908
Random Forest Accuracy (Randomized Search): 0.9203187250996016
              precision    recall  f1-score   support

         0.0       0.95      0.95      0.95       201
         1.0       0.80      0.80      0.80        50

    accuracy                           0.92       251
   macro avg       0.88      0.88      0.88       251
weighted avg       0.92      0.92      0.92       251



In [9]:
import joblib

# Persist the tuned Random Forest to disk in the current working directory.
# Left as the last expression of the cell so the written path(s) are displayed.
# The file is a pickle: only load it back from a trusted location.
joblib.dump(value=rf_random_best_model, filename='random_forest_model.pkl')

['random_forest_model.pkl']