In [None]:
import joblib
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
dataset = pd.read_csv("/content/drive/MyDrive/NepaliDataSet/Cleaned_Nepali_dataset.csv")
dataset.head()

Unnamed: 0,Text,Target
0,गुठी विधेक ल्याएर ठमेल राज गुठि जग्गा छाया सेन...,0
1,दले देश सकेछन बेचे खान सुरू गरेछन दले लखेटनु पछ ।,1
2,नेपाल ससकृती ध्वस्त पार्ने योजना !,1
3,मठ मन्दिर गुम्बा जग्गा हरु भुमाफिया नजर परे हु...,1
4,नेपाल कल कर्खाना नदि नाला बेची सके मठ मन्दीर ब...,1


In [None]:
# Load the dataset
dataset = pd.read_csv("/content/drive/MyDrive/NepaliDataSet/Cleaned_Nepali_dataset.csv")  # Update with your dataset path

# Extract the text and labels
X_raw = dataset['Text']  # Replace 'text_column' with your text column name
y = dataset['Target']     # Replace 'label_column' with your label column name

print(f"Number of samples: {len(X_raw)}")
print(f"Classes: {set(y)}")


Number of samples: 2859
Classes: {0, 1, 2, 3, 4, 5}


In [None]:
# Load the saved TF-IDF vectorizer
tfidf_vectorizer = joblib.load("/content/drive/MyDrive/NepaliDataSet/tfidf_vectorizer.joblib")  # Update with your vectorizer path

print("TF-IDF vectorizer loaded successfully.")


TF-IDF vectorizer loaded successfully.


In [None]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Transform the raw text data to TF-IDF features
X = tfidf_vectorizer.transform(X_raw)

print(f"TF-IDF features shape: {X.shape}")


TF-IDF features shape: (2859, 8589)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")


Training set size: (2287, 8589)
Testing set size: (572, 8589)


In [None]:
# Initialize the SVM classifier
svm_classifier = SVC(kernel='linear', random_state=42)

# Train the classifier
svm_classifier.fit(X_train, y_train)

print("SVM training complete.")


SVM training complete.


In [None]:
# Make predictions
y_pred = svm_classifier.predict(X_test)


In [None]:
# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.63
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.68      0.75       180
           1       0.61      0.86      0.71       272
           2       0.10      0.04      0.06        50
           3       0.43      0.19      0.26        16
           4       0.00      0.00      0.00        22
           5       0.00      0.00      0.00        32

    accuracy                           0.63       572
   macro avg       0.33      0.30      0.30       572
weighted avg       0.57      0.63      0.59       572

Confusion Matrix:
[[123  52   2   1   2   0]
 [ 17 234  13   2   1   5]
 [  0  45   2   0   1   2]
 [  0  13   0   3   0   0]
 [  2  18   2   0   0   0]
 [  5  24   2   1   0   0]]


Using Oversampling technique SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
from sklearn.model_selection import train_test_split

# Split the resampled dataset
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")


Training set size: (6518, 8589)
Testing set size: (1630, 8589)


In [None]:
from sklearn.svm import SVC

# Initialize the SVM model
svm_classifier = SVC(kernel='linear', random_state=42)

# Train the model
svm_classifier.fit(X_train, y_train)

print("SVM model training complete.")


SVM model training complete.


In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Make predictions on the test set
y_pred = svm_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.90
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.85      0.88       271
           1       0.84      0.70      0.76       271
           2       0.87      0.89      0.88       272
           3       0.96      1.00      0.98       272
           4       0.91      1.00      0.95       272
           5       0.89      0.96      0.93       272

    accuracy                           0.90      1630
   macro avg       0.90      0.90      0.90      1630
weighted avg       0.90      0.90      0.90      1630

Confusion Matrix:
[[231  32   3   0   2   3]
 [ 22 189  29   7  10  14]
 [  0   2 241   0  14  15]
 [  0   0   0 272   0   0]
 [  0   0   0   1 271   0]
 [  0   1   5   4   0 262]]


In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Initialize GridSearchCV
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Display the best parameters and score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_}")

# Use the best estimator for predictions
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

# Evaluate the best model
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred_best))

Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Best Cross-Validation Score: 0.8952117341293568
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.88      0.87      0.88       271
           1       0.85      0.68      0.76       271
           2       0.89      0.90      0.90       272
           3       0.96      1.00      0.98       272
           4       0.92      1.00      0.96       272
           5       0.90      0.97      0.94       272

    accuracy                           0.90      1630
   macro avg       0.90      0.90      0.90      1630
weighted avg       0.90      0.90      0.90      1630



Model Training after hyperparameter tuning

In [None]:
# Train the optimized SVM model
optimized_svm = SVC(C=10, kernel='linear', gamma='scale', random_state=42)
optimized_svm.fit(X_train, y_train)

print("Optimized SVM model training complete.")


Optimized SVM model training complete.


In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Make predictions on the test set
y_pred = optimized_svm.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.90
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.87      0.88       271
           1       0.85      0.68      0.76       271
           2       0.89      0.90      0.90       272
           3       0.96      1.00      0.98       272
           4       0.92      1.00      0.96       272
           5       0.90      0.97      0.94       272

    accuracy                           0.90      1630
   macro avg       0.90      0.90      0.90      1630
weighted avg       0.90      0.90      0.90      1630

Confusion Matrix:
[[235  28   3   0   2   3]
 [ 30 185  25   6   9  16]
 [  1   3 245   0  13  10]
 [  0   0   0 272   0   0]
 [  0   0   0   0 272   0]
 [  0   1   2   4   0 265]]


In [None]:
import joblib

# Save the optimized SVM model
joblib.dump(optimized_svm, "optimized_svm_classifier.joblib")
print("Optimized SVM model saved successfully.")


Optimized SVM model saved successfully.


In [None]:
# Load the optimized SVM model
loaded_model = joblib.load("optimized_svm_classifier.joblib")

# Example: Make predictions on new data
new_data = X_test[:5]  # Replace with your new data
predictions = loaded_model.predict(new_data)

print(f"Predictions: {predictions}")


Predictions: [5 3 2 4 1]


In [None]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
cv_scores = cross_val_score(optimized_svm, X_resampled, y_resampled, cv=5, scoring='accuracy')

print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.2f}")


Cross-Validation Scores: [0.87607362 0.88466258 0.89631902 0.91282996 0.92633517]
Mean CV Accuracy: 0.90


Testing the model

In [None]:
# Load the saved TF-IDF vectorizer
tfidf_vectorizer = joblib.load("/content/drive/MyDrive/NepaliDataSet/tfidf_vectorizer.joblib")

# Example unseen raw data
unseen_raw_data = [
    "तिमीहरु सबै कामचोर हो, यो देश बिगार्ने तिमीहरुकै कारण हो।",  # Hate
    "महिलाहरुले के गर्न सक्छन्? तिनीहरु कमजोर हुन्छन्।",            # Hate
    "तँलाई यो काम केही पनि थाहा छैन, मुर्ख!",                        # Profanity
    "नेताहरु भ्रष्ट छन्, तिनीहरु सबै झुट बोल्ने छन्।",                # Hate
    "तिमी जस्तो मुर्खलाई यो देशमा के गर्न दिने?",                   # Hate
    "तेरो दिमाग छैन कि के हो?",                                     # Profanity
    "आजको मौसम कस्तो राम्रो छ!",                                   # Neutral
    "म बिहान स्कूल जान्छु र साँझ घर फर्किन्छु।",                    # Neutral
    "तँ सधैं ढिलो गर्छस्, के कामका लागि लायक छैनस्?",               # Profanity
    "तिमीले राम्रो काम गरेको छौ, तिमीमाथि गर्व छ।"
]

# Transform the unseen data
X_unseen = tfidf_vectorizer.transform(unseen_raw_data)


In [None]:
# Load the trained SVM model
optimized_svm = joblib.load("optimized_svm_classifier.joblib")

# Predict on unseen data
unseen_predictions = optimized_svm.predict(X_unseen)

# Display predictions
print(f"Predictions for unseen data: {unseen_predictions}")


Predictions for unseen data: [1 1 1 5 1 1 1 1 0 0]


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Example ground truth labels
# Example ground truth labels (update according to your data categories)
y_unseen = [1, 1, 2, 1, 1, 2, 0, 0, 2, 0]  # Replace with actual labels

# Evaluate the model
print("Classification Report for Unseen Data:")
print(classification_report(y_unseen, unseen_predictions))

# Accuracy
accuracy = accuracy_score(y_unseen, unseen_predictions)
print(f"Accuracy on unseen data: {accuracy:.2f}")


Classification Report for Unseen Data:
              precision    recall  f1-score   support

           0       0.50      0.33      0.40         3
           1       0.43      0.75      0.55         4
           2       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         0

    accuracy                           0.40        10
   macro avg       0.23      0.27      0.24        10
weighted avg       0.32      0.40      0.34        10

Accuracy on unseen data: 0.40


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
