In [1]:
import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Read the source spreadsheet into the working dataframe
DATASET_PATH = 'C:\\Users\\Amin\\Desktop\\Dataset.xlsx'
df = pd.read_excel(DATASET_PATH)

# Pre-compiled pattern matching every character that is neither an ASCII letter nor whitespace
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')

# NLTK helpers are created once, on first use, instead of once per row
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached (stop-word set, Porter stemmer) pair, creating it on first call."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['stemmer'] = nltk.PorterStemmer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['stemmer']


def clean_text(text):
    """Turn one document into a list of stemmed, stop-word-free tokens.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, tokenize with ``word_tokenize``, drop English stop words, then
    Porter-stem what is left.

    Parameters
    ----------
    text : str or None or float
        Raw document text. ``None`` and NaN (how pandas represents an empty
        cell) are treated as an empty document.

    Returns
    -------
    list of str
        The stemmed tokens; an empty list when nothing survives cleaning.

    Raises
    ------
    AttributeError
        For any other non-string value (it has no ``lower`` method).
    """
    # Empty spreadsheet cells arrive as None/NaN; NaN is the only float that differs from itself
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Characters are deleted, not replaced by a space, so "don't" becomes "dont"
    text = _NON_LETTER_PATTERN.sub('', text.lower())

    # Nothing but whitespace left: skip tokenization, the result is empty either way
    if not text.strip():
        return []

    stop_words, stemmer = _get_text_resources()

    # The text is already lowercase, so tokens can be compared to the stop words directly
    return [stemmer.stem(token) for token in word_tokenize(text) if token not in stop_words]

# Replace each entry of the 'Text' column with its list of cleaned tokens
df['Text'] = df['Text'].apply(clean_text)

# Write the cleaned dataframe to an Excel workbook (row index omitted)
CLEANED_DATASET_PATH = 'C:\\Users\\Amin\\Desktop\\Cleaned Dataset.xlsx'
df.to_excel(CLEANED_DATASET_PATH, index=False)

In [2]:
import pandas as pd
import numpy as np
import re
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report

# The dataset is not re-read here: this cell reuses the `df` built in cell 1,
# whose 'Text' column already holds a list of cleaned tokens per row.

# Score every document with VADER and keep the compound polarity as a feature.
# NOTE(review): 'Text' holds lowercased, punctuation-free, stemmed tokens after cell 1;
# VADER presumably scores raw text better - confirm whether the raw text should be used.
sia = SentimentIntensityAnalyzer()


def _vader_scores(tokens):
    """Join a token list into one space-separated string and return VADER's score dict for it."""
    return sia.polarity_scores(' '.join(tokens))


df['sentiment_scores'] = df['Text'].apply(_vader_scores)
df['compound'] = df['sentiment_scores'].apply(lambda scores: scores['compound'])

# Statistical features: number of tokens and mean token length per document
def _mean_token_length(tokens):
    """Return the average character length of the tokens, or 0 for an empty token list."""
    if not tokens:
        return 0
    return np.mean([len(token) for token in tokens])


df['text_length'] = df['Text'].apply(len)
df['average_word_length'] = df['Text'].apply(_mean_token_length)

# Combine emotion and statistical features into a single DataFrame
feature_df = df[['compound', 'text_length', 'average_word_length']]

# Get the target labels (read from the 'Class' column)
y = df['Class'].values

# Split BEFORE scaling so the held-out rows never influence the scaler's statistics.
# test_size and random_state are unchanged from the previous version of this cell.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    feature_df, y, test_size=0.2, random_state=42
)

# Learn mean/variance from the training rows only, then apply them to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full standardized feature matrix, kept because earlier versions of this cell defined `X`
X = scaler.transform(feature_df)

# Fit a linear-kernel support vector classifier on the training split
clf = svm.SVC(kernel='linear').fit(X_train, y_train)

# Predict labels for the held-out split
y_pred = clf.predict(X_test)

# Report overall accuracy as a percentage, followed by the per-class metrics
accuracy = accuracy_score(y_test, y_pred)
accuracy_line = "Test Accuracy: {:.2f}%".format(accuracy * 100)
print(accuracy_line)

per_class_report = classification_report(y_test, y_pred)
print(per_class_report)


Test Accuracy: 74.28%
              precision    recall  f1-score   support

 non-suicide       0.70      0.87      0.77     23287
     suicide       0.82      0.62      0.71     23128

    accuracy                           0.74     46415
   macro avg       0.76      0.74      0.74     46415
weighted avg       0.76      0.74      0.74     46415



In [4]:
import joblib

# The classifier was trained on standardized features, so the fitted scaler must be
# saved as well; without it the stored model cannot be applied to new data.
joblib.dump(scaler, 'standard_scaler.pkl')

# NOTE(review): the file name below starts with a space - looks accidental, but it is
# kept as-is so anything that already loads this file keeps working. Confirm and rename.
# Kept as the last expression so the cell still displays the path of the model file.
joblib.dump(clf, ' support_vector_machine_model.pkl')

[' support_vector_machine_model.pkl']