# In [None]:
# Importing necessary libraries
import re
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pyodbc
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from imblearn.under_sampling import EditedNearestNeighbours
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, make_scorer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

# Downloading the NLTK resources used below: the 'punkt' tokenizer models
# for word_tokenize and the 'stopwords' corpus for stopwords.words('english').
# NOTE(review): recent NLTK releases appear to look up 'punkt_tab' instead of
# 'punkt' for word_tokenize -- confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

# Establishing SQL connection
# NOTE(review): 'user=' is presumably ignored when Trusted_Connection=yes
# (Windows authentication) -- confirm and drop one of the two if so.
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=SERVER;'
                      'Database=DB;'
                      'user=USER;'
                      'Trusted_Connection=yes;')

# Reading data from SQL server
# The connection is only needed for this one query, so it is closed as soon
# as the result has been fetched, even if the query raises.
try:
    sql_query = pd.read_sql_query('SELECT * FROM ECHO_DATABASE', conn)
finally:
    conn.close()

# Keep only the free-text findings and the severity label.
df = pd.DataFrame(sql_query, columns=['Findings_Value', 'SEVERITY'])
print(df)

# Data preprocessing: strip bracketed numeric references, non-word characters,
# and repeated whitespace from text
def preprocess_text(text):
    """Return *text* with noise replaced by single spaces.

    Three substitutions are applied in order, each replacing its match with
    one space:

    1. square-bracketed digit runs such as ``[12]`` (an empty ``[]`` also
       matches); digits outside brackets are left untouched,
    2. every single non-word character (anything not matched by ``\\w``),
    3. every run of whitespace.

    The result is not stripped, so a leading or trailing space can remain.
    """
    patterns = (
        r'\[[0-9]*\]',  # bracketed numbers, e.g. "[3]"
        r'\W',          # punctuation and other non-word characters
        r'\s+',         # collapse whitespace runs
    )
    cleaned = text
    for pattern in patterns:
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned

# Missing findings (NULL / NaN) are replaced by an empty string before the
# str() conversion; otherwise they would become the literal words 'None' or
# 'nan' and end up as tokens in the vocabulary.
df['Findings_Value'] = df['Findings_Value'].fillna('').apply(
    lambda value: preprocess_text(str(value))
)

# Tokenization and removal of stopwords
# Built once as a set so each membership test during filtering is O(1).
stop_words = set(stopwords.words('english'))

def remove_stopwords(sentence):
    """Tokenize *sentence* and drop the tokens found in ``stop_words``.

    The comparison is done on the lower-cased token because the stopword
    list is lower-case while the text has not been lower-cased at this
    point; a case-sensitive test would let words such as "The" through.

    Args:
        sentence: Text to tokenize with ``word_tokenize``.

    Returns:
        A list of the remaining tokens, in order and in their original case.
    """
    return [
        token
        for token in word_tokenize(sentence)
        if token.lower() not in stop_words
    ]

df['Findings_Value'] = df['Findings_Value'].apply(remove_stopwords)

# Preparing data for modeling
# remove_stopwords returns a list of tokens per row, but CountVectorizer
# expects every document to be a string (its default preprocessing calls
# .lower() on the document), so the tokens are joined back with spaces.
X = df['Findings_Value'].apply(' '.join)
y = df['SEVERITY']

# Text vectorization using Bag of Words and TF-IDF
# NOTE(review): vocabulary and IDF weights are still fitted on all rows,
# including those that end up in the test set. This is unsupervised and far
# milder than the resampling leak addressed below, but fitting them on the
# training rows only would be stricter.
count_vect = CountVectorizer(min_df=1, max_df=1.0)
X_counts = count_vect.fit_transform(X)
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

# Label encoding
y_encoded = LabelEncoder().fit_transform(y)

# Splitting data into training and test sets
# The split is made on the real samples BEFORE any resampling. Resampling
# first would put synthetic points (interpolated from rows that later land in
# the training set) into the test set and inflate every reported metric.
# stratify keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y_encoded, test_size=0.33, random_state=42, stratify=y_encoded
)

# Handling class imbalance with SMOTE and Edited Nearest Neighbours
# Only the training part is resampled; the test part keeps its original,
# imbalanced distribution so that it reflects real data.
X_train, y_train = SMOTEENN(
    enn=EditedNearestNeighbours(sampling_strategy='all'),
    random_state=42,
).fit_resample(X_train, y_train)
print('Training class distribution:', Counter(y_train))

# Resampled version of the complete data set, kept so that any downstream
# code referring to X_tfidf / y_new keeps working. Scores computed on it
# include synthetic samples and are therefore optimistic.
smote_enn = SMOTEENN(
    enn=EditedNearestNeighbours(sampling_strategy='all'),
    random_state=42,
)
X_tfidf, y_new = smote_enn.fit_resample(X_tfidf, y_encoded)
counterx = Counter(y_new)
print('Class distribution:', counterx)

# Function to perform training and evaluation of a model
def train_evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit *model* on the training data and report its test-set performance.

    Prints the classification report, the confusion matrix and the accuracy
    (rounded to two decimals) for the predictions made on ``X_test``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: True labels for ``X_test``.

    Returns:
        The accuracy computed by ``accuracy_score``, so callers can compare
        models programmatically instead of reading the printed output.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print('Model Results:')
    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
    accuracy = accuracy_score(y_test, predictions)
    print(f'Accuracy: {accuracy:.2f}')
    return accuracy

# Training and evaluating Random Forest, SVM, and XGBoost classifiers
# The random forest is seeded (as it is for the cross-validation run) so
# that the hold-out results are reproducible from one execution to the next.
for classifier in (
    RandomForestClassifier(random_state=42),
    LinearSVC(),
    XGBClassifier(),
):
    train_evaluate_model(classifier, X_train, y_train, X_test, y_test)

# Function to perform stratified k-fold cross-validation (5 folds by default)
def stratified_cross_validation(model, X, y, n_splits=5):
    """Score *model* with shuffled, stratified k-fold cross-validation.

    Prints the per-fold accuracies and their mean (two decimals).

    Args:
        model: Estimator (or pipeline) accepted by ``cross_val_score``.
        X: Feature matrix.
        y: Labels; also used to stratify the folds.
        n_splits: Number of folds. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        The array of per-fold accuracy scores from ``cross_val_score``.
    """
    # Fixed seed so the fold assignment is identical for every model compared.
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
    print(f'{n_splits}-Fold CV Scores: {scores}')
    print(f'Average Score: {np.mean(scores):.2f}')
    return scores

# Stratified 5-fold cross-validation for Random Forest, SVM, and XGBoost classifiers
# TF-IDF weighting and SMOTE-ENN resampling are placed inside an
# imbalanced-learn pipeline, and cross-validation runs on the raw term counts
# with the original labels. Both steps are thereby re-fitted on the training
# part of each fold only, and every validation fold consists of real,
# un-resampled rows. Cross-validating data that was resampled beforehand
# would let synthetic neighbours of validation rows into training and
# overstate the scores.
for classifier in (
    RandomForestClassifier(random_state=42),
    LinearSVC(),
    XGBClassifier(),
):
    cv_model = make_imb_pipeline(
        TfidfTransformer(),
        SMOTEENN(
            enn=EditedNearestNeighbours(sampling_strategy='all'),
            random_state=42,
        ),
        classifier,
    )
    stratified_cross_validation(cv_model, X_counts, y_encoded)