In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [4]:
df = pd.read_csv('Suicide_Detection_Cleaned.csv')

In [5]:
df = df.dropna()

In [6]:
df['class'][df['class'] == 'suicide'] = 1
df['class'][df['class'] == 'non-suicide'] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['class'][df['class'] == 'suicide'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['class'][df['class'] == 'non-suicide'] = 0


In [7]:
df['class'] = df['class'].astype(int)

# Random Forest with Count Vectorizer

In [8]:
vectorizer = CountVectorizer(min_df=3)
X = vectorizer.fit_transform(df['clean_text'].head(10000))
X = X.tocsc() 
y = df['class'].head(10000)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)

In [10]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [11]:
n_est = [int(n) for n in np.logspace(start=2.9, stop=4, num=5)]
depth = [int(x) for x in np.linspace(10, 100, num = 10)]
param_grid = {'n_estimators': n_est, 'max_depth' : depth}
rf1 = RandomForestClassifier(random_state = 1519)
rf_cv = GridSearchCV(rf1, param_grid, cv = 5)
rf_cv.fit(X_train, y_train)

In [12]:
print("Best Score:" + str(rf_cv.best_score_))
print("Best Parameters: " + str(rf_cv.best_params_))

Best Score:0.8682666666666667
Best Parameters: {'max_depth': 90, 'n_estimators': 794}


# Random Forest with TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizor_tfid = TfidfVectorizer(min_df=0)
X_tfid = vectorizor_tfid.fit_transform(df['text'])
X_tfid = X_tfid.tocsc()  # some versions of sklearn return COO format
y_tfid = df['class']

In [None]:
X_train_tfid, X_test_tfid, y_train_tfid, y_test_tfid = train_test_split(X_tfid, y_tfid, test_size=0.25, random_state=12)

In [None]:
rf_tfidf = RandomForestClassifier()
rf_tfidf.fit(X_train_tfid, y_train_tfid)

In [None]:
n_est = [int(n) for n in np.logspace(start=2.9, stop=4, num=5)]
depth = [int(x) for x in np.linspace(10, 100, num = 10)]
param_grid = {'n_estimators': n_est, 'max_depth' : depth}
rf1 = RandomForestClassifier(random_state = 1519)
rf_tfid_cv = GridSearchCV(rf1, param_grid, cv = 5, random_state = 123)
rf_tfid_cv.fit(X_train_tfid, y_train_tfid)

In [None]:
print("Best Score:" + str(rf_tfid.best_score_))
print("Best Parameters: " + str(rf_tfid_cv.best_params_))