# Weiyang Linear SVM

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from xgboost import XGBClassifier

from sklearn.ensemble import RandomForestClassifier

In [2]:
train = pd.read_csv("../Data/ML_train_sample_ada.csv")
test = pd.read_csv("../Data/ML_test_no_sample.csv")

In [3]:
train_num_char_benefits_min_value = train["num_char_benefits"].min()
train_num_char_benefits_max_value = train["num_char_benefits"].max()
train_num_words_benefits_min_value = train["num_words_benefits"].min()
train_num_words_benefits_max_value = train["num_words_benefits"].max()
train_avg_word_length_benefits_min_value = train["avg_word_length_benefits"].min()
train_avg_word_length_benefits_max_value = train["avg_word_length_benefits"].max()

train["minmax_num_char_benefits"] = (train["num_char_benefits"] - train_num_char_benefits_min_value) / (train_num_char_benefits_max_value - train_num_char_benefits_min_value)
train['minmax_num_words_benefits'] = (train["num_words_benefits"] - train_num_words_benefits_min_value) / (train_num_words_benefits_max_value - train_num_words_benefits_min_value)
train['minmax_avg_word_length_benefits'] = (train["avg_word_length_benefits"] - train_avg_word_length_benefits_min_value) / (train_avg_word_length_benefits_max_value - train_avg_word_length_benefits_min_value)

train = train.drop(columns = ["num_char_benefits", "num_words_benefits", "avg_word_length_benefits"])

test = test.drop(columns = ["Country", "City", "State", "sentence_length"])
test_num_char_benefits_min_value = test["num_char_benefits"].min()
test_num_char_benefits_max_value = test["num_char_benefits"].max()
test_num_words_benefits_min_value = test["num_words_benefits"].min()
test_num_words_benefits_max_value = test["num_words_benefits"].max()
test_avg_word_length_benefits_min_value = test["avg_word_length_benefits"].min()
test_avg_word_length_benefits_max_value = test["avg_word_length_benefits"].max()

test["minmax_num_char_benefits"] = (test["num_char_benefits"] - test_num_char_benefits_min_value) / (test_num_char_benefits_max_value - test_num_char_benefits_min_value)
test['minmax_num_words_benefits'] = (test["num_words_benefits"] - test_num_words_benefits_min_value) / (test_num_words_benefits_max_value - test_num_words_benefits_min_value)
test['minmax_avg_word_length_benefits'] = (test["avg_word_length_benefits"] - test_avg_word_length_benefits_min_value) / (test_avg_word_length_benefits_max_value - test_avg_word_length_benefits_min_value)

test = test.drop(columns = ["num_char_benefits", "num_words_benefits", "avg_word_length_benefits"])

In [4]:
X_train = train.drop(["fraudulent"], axis = 1)
y_train = train["fraudulent"]

X_test = test.drop(["fraudulent"], axis = 1)
y_test = test["fraudulent"]

## LinearSVM

In [5]:
param_dist = {
    'C': np.logspace(-3, 3, 100),  # Regularization parameter in a logarithmic range
    'tol': np.logspace(-6, -2, 100),  # Tolerance for stopping criteria
    'dual': [True, False],
    'fit_intercept': [True, False],  # Whether to calculate the intercept
    'max_iter': [2000, 5000, 10000],  # Maximum number of iterations
    'penalty': ['l1', 'l2'],  # Type of regularization penalty
}

In [6]:
svm = LinearSVC()
random_search_svm = RandomizedSearchCV(svm, param_distributions=param_dist, scoring = 'recall', n_iter=10, cv=5, n_jobs=-1)

In [7]:
random_search_svm.fit(X_train, y_train)

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\seewe\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\seewe\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\svm\_classes.py", line 257, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "C:\Users\seewe\AppData\Local\Packages\PythonSoftwareFoundation.

In [8]:
best_params_svm = random_search_svm.best_params_
best_svm = random_search_svm.best_estimator_

In [9]:
y_pred_svm = best_svm.predict(X_test)

In [10]:
def get_overall_evaluation_score(y_true, y_pred):
    #accuracy
    print('Accuracy score: ', accuracy_score(y_true, y_pred))
    #precision
    print('Precision score: ', precision_score(y_true, y_pred, pos_label=1))
    #recall
    print('Recall score: ', recall_score(y_true, y_pred, pos_label=1))
    #f1
    print('F1 score: ', f1_score(y_true, y_pred, pos_label=1))
    #log loss
    # print('Logarithmic Loss: ', log_loss(y_true, y_pred_prob))
    #confusion matix
    confusion = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix: ")
    print(confusion)

In [11]:
print(best_params_svm)
get_overall_evaluation_score(y_test, y_pred_svm)

{'tol': 4.430621457583877e-06, 'penalty': 'l2', 'max_iter': 2000, 'fit_intercept': False, 'dual': False, 'C': 657.9332246575682}
Accuracy score:  0.9554436987322893
Precision score:  0.5426356589147286
Recall score:  0.5363984674329502
F1 score:  0.5394990366088632
Confusion Matrix: 
[[4985  118]
 [ 121  140]]


## XGBoost

In [30]:
xgb_model = XGBClassifier()

In [31]:
# param_dist = {
#     'n_estimators': [ 200, 300],  # Number of boosting rounds
#     'max_depth': [ 11, 12],  # Maximum tree depth
#     'learning_rate': [ 0.5, 0.6],  # Step size shrinkage used in update
#     'subsample': [1.0],  # Fraction of samples used for training
#     'colsample_bytree': [1.0],  # Fraction of features used for training
# }

# # Initialize RandomizedSearchCV with the XGBoost classifier and hyperparameter grid
# random_search_xg = RandomizedSearchCV(
#     xgb_model, param_distributions=param_dist, n_iter=5, cv=5, n_jobs=-1, scoring='f1', random_state=42)

# # Perform the random search to find the best hyperparameters
# random_search_xg.fit(X_train, y_train)  # X and y are your feature matrix and target variable

# # Get the best hyperparameters and the best estimator
# best_params_xg = random_search_xg.best_params_
# best_xgb = random_search_xg.best_estimator_

# # Evaluate the model with the best hyperparameters
# y_pred_xgboost = best_xgb.predict(X_test)

In [32]:
# print(best_params_xg)
# get_overall_evaluation_score(y_test, y_pred_xgboost)

## Random Forest Classifier

In [12]:
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 800, num = 5)]
max_depth = [int(x) for x in np.linspace(10, 50, num = 5)]
max_depth.append(None)
min_samples_split = [2, 5]
min_samples_leaf = [1, 2]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [14]:
rfc = RandomForestClassifier(n_estimators=50, n_jobs = -1)

random_search_rfc = RandomizedSearchCV(
    rfc, param_distributions=random_grid, n_iter=20, cv=5, n_jobs=-1, scoring='recall', random_state=42)

random_search_rfc.fit(X_train, y_train)

In [15]:
best_params_rfc = random_search_rfc.best_params_
best_rf = random_search_rfc.best_estimator_


rfc_y_train = best_rf.predict(X_train)
rfc_y_test = best_rf.predict(X_test)

In [16]:
print(best_params_rfc)
get_overall_evaluation_score(y_train, rfc_y_train)
get_overall_evaluation_score(y_test, rfc_y_test)

{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30}
Accuracy score:  0.9995993589743589
Precision score:  0.9980462390100945
Recall score:  1.0
F1 score:  0.9990221642764017
Confusion Matrix: 
[[11905     6]
 [    0  3065]]
Accuracy score:  0.9865771812080537
Precision score:  0.8648648648648649
Recall score:  0.8582375478927203
F1 score:  0.8615384615384615
Confusion Matrix: 
[[5068   35]
 [  37  224]]
