## Model Training

In [None]:
import pandas as pd

# Feature matrix used for every model in this notebook.
X = pd.read_csv('svd_df.csv')

# Raw labels. Only the 'target' column of tokenized.csv is used, so restrict
# parsing to that column instead of loading the whole file.
y_ = pd.read_csv('tokenized.csv', usecols=['target'])['target']

In [None]:
# Convert classes: 0 remains 0, and 4 becomes 1.
# The raw labels were loaded into `y_`; the train/test split reads the
# binary target from `y`, so it has to be defined here.
label_map = {0: 0, 4: 1}
y = y_.map(label_map)

# Series.map() yields NaN for any value outside the mapping; fail loudly
# rather than silently training on missing targets.
if y.isna().any():
    raise ValueError(
        f"unexpected labels in 'target': {y_[y.isna()].unique().tolist()}"
    )
y = y.astype(int)

y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares model fitted on the training split.
reg = LinearRegression()
reg = reg.fit(X_train, y_train)  # fit() hands back the fitted estimator

In [None]:
y = reg.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rmse = sqrt(mean_squared_error(y_test,y))

print("RMSE:", rmse)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define the model.
# SAGA shuffles the data while optimising, so pin the seed to make the
# search reproducible (42 is the seed used elsewhere in this notebook).
model = LogisticRegression(solver='saga', max_iter=10000, random_state=42)

# Define a grid of parameters to search over:
# inverse regularisation strength, from very strong to very weak.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Setup the grid search: 5-fold cross-validation, selecting on accuracy.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=1)

# Fit the grid search to the training split only; the test split stays unseen.
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

In [None]:
# Hard class predictions for the test split from the refit best estimator.
# NOTE(review): this rebinds `y_`, which held the raw 'target' column when
# the data was loaded.
y_ = grid_search.best_estimator_.predict(X_test)

In [None]:
from sklearn.metrics import precision_score, recall_score

# Precision and recall of the hard test-set predictions stored in `y_`.
precision, recall = (
    precision_score(y_test, y_),
    recall_score(y_test, y_),
)

precision, recall

In [None]:
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc

# Compute ROC curve and AUC.
# roc_curve sweeps a decision threshold, so it needs continuous scores.
# Hard 0/1 labels from predict() would collapse the curve to a single
# operating point and misstate the AUC, so use the positive-class
# probability from the fitted search instead.
y_score = grid_search.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

# Create an interactive plot
fig = go.Figure()

# ROC curve of the classifier
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC Curve',
                         line=dict(color='darkorange'),
                         showlegend=True))

# Diagonal reference line: performance of a random classifier
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Chance',
                         line=dict(color='navy', dash='dash'),
                         showlegend=False))

# Report the AUC in the figure title
fig.update_layout(title=f'ROC Curve (AUC = {roc_auc:.2f})',
                  xaxis_title='False Positive Rate',
                  yaxis_title='True Positive Rate',
                  xaxis=dict(showgrid=False),
                  yaxis=dict(showgrid=False),
                  template="plotly_white")

# Show figure
fig.show()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.linear_model import LogisticRegression

In [None]:
# Define the logistic regression model using the 'saga' solver
# (seeded, because saga shuffles the data while optimising).
model = LogisticRegression(solver='saga', random_state=42, max_iter=10000, tol=1e-4)

# Define a grid of hyperparameter values to search over
param_grid = {
    'C': [ 0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2']  # saga supports both L1 and L2 regularization
}

# Define the AUC scoring function.
# Use scikit-learn's built-in 'roc_auc' scorer: the `needs_proba` argument of
# make_scorer is deprecated (scikit-learn 1.4) and rejected by later releases,
# while the named scorer works across versions and ranks models identically.
auc_scorer = 'roc_auc'

# Set up GridSearchCV
grid_search = GridSearchCV(model, param_grid, scoring=auc_scorer, cv=5, verbose=1)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# After fitting, you can check the best parameters and the best AUC score
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best AUC score: {grid_search.best_score_}")

# Optionally, evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_ = best_model.predict_proba(X_test)[:, 1]  # Get probability estimates of the positive class
test_auc = roc_auc_score(y_test, y_)

print(f"Test AUC score: {test_auc}")

In [None]:
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc

# ROC points and area under the curve, computed from the scores held in `y_`.
fpr, tpr, _ = roc_curve(y_test, y_)
roc_auc = auc(fpr, tpr)

# Classifier curve.
roc_trace = go.Scatter(
    x=fpr,
    y=tpr,
    mode='lines',
    name='ROC Curve',
    line=dict(color='darkorange'),
    showlegend=True,
)

# Dashed diagonal marking a random classifier; kept out of the legend.
chance_trace = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Chance',
    line=dict(color='navy', dash='dash'),
    showlegend=False,
)

# Build the interactive figure, adding the traces in the same order.
fig = go.Figure()
for trace in (roc_trace, chance_trace):
    fig.add_trace(trace)

# The AUC is reported in the title; grid lines are switched off on both axes.
fig.update_layout(
    title=f'ROC Curve (AUC = {roc_auc:.2f})',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    template="plotly_white",
)

# Render the figure.
fig.show()