In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [11]:
# Read data from csv file
df = pd.read_csv('/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/Original_SuicideDetection.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [12]:
from collections import Counter
from nltk.corpus import stopwords
import ssl
import string
ssl._create_default_https_context = ssl._create_unverified_context

# Download stopwords
nltk.download('stopwords')

# Load stopwords from NLTK
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/arnav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Drop the first column 'Unnamed: 0'
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0,text,class
0,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,Am I weird I don't get affected by compliments...,non-suicide
2,Finally 2020 is almost over... So I can never ...,non-suicide
3,i need helpjust help me im crying so hard,suicide
4,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [14]:
# Perform data cleaning 

# Remove URLs
df['text'] = df['text'].apply(lambda x: re.sub(r'http\S+', '', x))

# Remove HTML tags
df['text'] = df['text'].apply(lambda x: re.sub(r'<.*?>', '', x))

# Remove Emojis
df['text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Remove non-ASCII characters
df['text'] = df['text'].apply(lambda x: re.sub(r'[^\x00-\x7f]', '', x))

# Remove numbers
df['text'] = df['text'].apply(lambda x: re.sub(r'\d+', '', x))

# Remove all items starting with @
df['text'] = df['text'].apply(lambda x: re.sub(r'@\w+', '', x))

# Remove all items starting with #
df['text'] = df['text'].apply(lambda x: re.sub(r'#\w+', '', x))

# Remove all punctuations
df['text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Remove all stopwords
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))

# Convert all text to lowercase
df['text'] = df['text'].apply(lambda x: x.lower())

# Ensuring only strings are present in the tweet_column
df = df[df['text'].apply(lambda x: isinstance(x, str))]


In [15]:
print(df.shape)

(232074, 2)


In [16]:
df.head()

Unnamed: 0,text,class
0,ex wife threatening suiciderecently i left wif...,suicide
1,am i weird i dont get affected compliments com...,non-suicide
2,finally almost so i never hear bad year ever i...,non-suicide
3,need helpjust help im crying hard,suicide
4,im losthello name adam ive struggling years im...,suicide


In [17]:
train_df = df.copy()
train_df.head()

Unnamed: 0,text,class
0,ex wife threatening suiciderecently i left wif...,suicide
1,am i weird i dont get affected compliments com...,non-suicide
2,finally almost so i never hear bad year ever i...,non-suicide
3,need helpjust help im crying hard,suicide
4,im losthello name adam ive struggling years im...,suicide


In [19]:
# Preprocessing the data
# Removing the stopwords
train_df['text'] = train_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

# Lemmatization
lemmatizer = WordNetLemmatizer()
train_df['text'] = train_df['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Stemming
stemmer = PorterStemmer()
train_df['text'] = train_df['text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# Tokenization
tokenizer = TweetTokenizer()
train_df['text'] = train_df['text'].apply(lambda x: tokenizer.tokenize(x))

In [20]:
# Convert the list of tokens into a string
train_df['text'] = train_df['text'].apply(lambda x: ' '.join(x))

In [21]:
train_df.head()

Unnamed: 0,text,class
0,ex wife threaten suiciderec left wife good che...,suicide
1,weird dont get affect compliment come someon k...,non-suicide
2,final almost never hear bad year ever swear fu...,non-suicide
3,need helpjust help im cri hard,suicide
4,im losthello name adam ive struggl year im afr...,suicide


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)
train_vectors = vectorizer.fit_transform(train_df['text'])

In [23]:
print(train_vectors.shape)

(232074, 1000)


In [28]:
# Get the labels
train_labels = train_df['class']

# Convert the labels to numpy array
train_labels = train_labels.to_numpy()

print(train_labels.shape)
print(train_vectors.shape)

(232074,)
(232074, 1000)


In [30]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression
lr_best = LogisticRegression(max_iter=1000, C=1, penalty='l2', solver= 'newton-cg')

# Fit the model
lr_best.fit(train_vectors, train_labels)

# Save the model
import pickle
filename = 'finalized_model.sav'
pickle.dump(lr_best, open(filename, 'wb'))

In [None]:
# Save the vectorizer
filename = 'vectorizer.sav'
pickle.dump(vectorizer, open(filename, 'wb'))