In [1]:
import pandas as pd
import pickle
import re
import numpy as np
from tqdm import tqdm
import nltk


In [2]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /home/marcel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

In [4]:
def preprocess_tweet(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower())
    text = text+' '.join(emoticons).replace('-', '') 
    return text

In [6]:
suicidal_tweets=pd.read_csv("suicidal_data.csv")


In [7]:
tqdm.pandas()
suicidal_tweets['tweet'] = suicidal_tweets['tweet'].progress_apply(preprocess_tweet)

100%|██████████| 9119/9119 [00:00<00:00, 31106.90it/s]


In [8]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [9]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [11]:
[w for w in tokenizer_porter('a swimmer likes swimming and swims a lot') if w not in stop]

['swimmer', 'like', 'swim', 'swim', 'lot']

In [12]:
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\(|D|P)',text.lower())
    text = re.sub('[\W]+', ' ', text.lower())
    text += ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in tokenizer_porter(text) if w not in stop]
    return tokenized

# Using the Hashing Vectorizer

In [13]:
from sklearn.feature_extraction.text import HashingVectorizer
vect = HashingVectorizer(decode_error='ignore', n_features=2**21, 
                         preprocessor=None,tokenizer=tokenizer)

# Building the ml model

In [14]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='log', random_state=1)

In [15]:
X = suicidal_tweets["tweet"].to_list()
y = suicidal_tweets['label']

# Training the model

In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [17]:
X_train = vect.transform(X_train)
X_test = vect.transform(X_test)

In [18]:
classes = np.array([0, 1])
clf.partial_fit(X_train, y_train,classes=classes)

SGDClassifier(loss='log', random_state=1)

In [19]:
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.912


# Testing

In [20]:
label = {0:'negative', 1:'positive'}

In [21]:
phrases = ["suicide","suicidal", "kill myself", "my suicide note", "my suicide letter", "end my life", "never wake up", "can't go on"," cannot go on", "not worth living", "ready to jump", "swleep forever", "want to die", "be dead", "better off without me", "better off dead", "don't want to being here", "tired of living", "die alone", "go to sleep forever", "wanna die", "wanna suicide", "commit suicide", "slit my wrist","cut my wrist", "slash my wrist", "do not want to be here", "want it to be over", "want to be dead", "nothing to live for", "not worth living","ready to die", "thoughts of suicide", "thoughts of killing myself", "why should i live", "take my own life", "depressed"]

In [37]:
from config import create_api


In [59]:
streamed_tweets = []
for query in phrases:
    api = create_api()
    tweets = api.search(q=query + " --filter:retweets", lang="en", show_user=True, result_type="recent")
    for i in tweets:
        print(i.user.screen_name, ":" ,i.user.location)
        streamed_tweets.append({'tweet_id': i.id, 'tweet_text': i.text, 'user_name': i.user.screen_name, 'tweet_loc': i.user.location})
        

AmeerHa09250563 : 
Alison1Eve : 🇺🇸 🌍
Pyro_mariner : Leeds init
Rehmaan62546446 : 
nzvx__ : 
2playababyy : Hell
Rfranz19 : florida
spicyyy15 : 
Sani50547051 : 
sarahma50398302 : 
Ali98088843 : 
md96dawood : 🎈🎉سات #سمندر#پار🎉🎈
IAmJerdog : Kansas, USA
HaiderA39951009 : 
HanishaSai1 : 
Jakama_Snr : Kenya
snorpion_ : she/her | 23 | 🇨🇵
MarindaBender : 
nurulhsaf : Comrade Hafizul
NiKhOoOLe : she/her • bi || 📍🇰🇪 
moaningminnie58 : 
_tuthe : kiambu, Kenya
LegionnaireNo1 : 
Druhin13 : Kolkata, India
reenafndy : she/her 🏳️‍🌈🏳️‍🌈
tanupandey4 : 
kevin_kiugi : United States
listen2victims : Los Angeles, CA
chat2u2 : Australia
notyourfaves : Pengerang, Johor
faeriecardan : cr: six of crows
melissarxv : 
CaitlinHarvey22 : 
_katxe__ : 
evstokes96 : 
zchrollosbaby : 
mrsdeeznutsguy : deez nuts
damosmith__ : Livingston, Scotland
inkalacarter : eMalahleni, South Africa
letatdemoi : 
moonxsh : 64 zoo lane
JohnFcknStewart : 
OdBITO : Cabuyao, Calabarzon
not_reiii : eroda
Rajnees34468386 : UP, India
Poon971

KeyboardInterrupt: 

In [60]:
streamed_tweet_df=pd.DataFrame.from_dict(streamed_tweets)
streamed_tweet_df

Unnamed: 0,tweet_id,tweet_text,user_name,tweet_loc
0,1330477149816549380,RT @soldierspeaks: An #Indian soldier of 39 RR...,AmeerHa09250563,
1,1330477126043262976,RT @DIEBO37: New Chinese Weapon Can Launch Sui...,Alison1Eve,🇺🇸 🌍
2,1330477124193570818,RT @FredTJoseph: Kalief Browder died by suicid...,Pyro_mariner,Leeds init
3,1330477119701471234,RT @soldierspeaks: An #Indian soldier of 39 RR...,Rehmaan62546446,
4,1330477114131238914,RT @acidwheep: TW // suicide attempt \nits b...,nzvx__,
...,...,...,...,...
84,1330474598928953346,"RT @AliceBridget5: I am lost.. Far, so far fro...",RedEyesBiDrago1,
85,1330474475582861318,RT @jungkookie496: 💫 BTS 'BE' Album (Deluxe Ed...,Yyj6663,"Perak, Malaysia"
86,1330473898329182214,RT @nunubebe0618: 🐻 oh really? \n👤 yes! \n🐻 ah...,sohwnumx,"25+, not following twin. ND"
87,1330473884450222085,RT @DBurr_: How on earth do we end up with thi...,MaaattyB,Inverurie


In [61]:
analysis= pd.DataFrame(columns={"Prediction","Probability"})
analysis["Prediction"].astype(object)
analysis["Probability"].astype(float)

Series([], Name: Probability, dtype: float64)

In [62]:
for row in range(0,len(streamed_tweet_df)):
    example=[streamed_tweet_df.iloc[row,1]]
    X = vect.transform(example)
    print('Prediction: %s   Probability: %.2f%%'
    %(label[clf.predict(X)[0]],np.max(clf.predict_proba(X))*100))
    analysis=analysis.append({'Prediction': label[clf.predict(X)[0]],
                                             'Probability': np.max(clf.predict_proba(X))*100},
                                            ignore_index=True)

Prediction: negative   Probability: 60.07%
Prediction: negative   Probability: 85.15%
Prediction: negative   Probability: 56.28%
Prediction: negative   Probability: 60.07%
Prediction: positive   Probability: 83.34%
Prediction: positive   Probability: 53.47%
Prediction: negative   Probability: 56.28%
Prediction: negative   Probability: 56.28%
Prediction: negative   Probability: 60.07%
Prediction: positive   Probability: 62.76%
Prediction: negative   Probability: 60.07%
Prediction: negative   Probability: 60.07%
Prediction: negative   Probability: 56.28%
Prediction: negative   Probability: 60.07%
Prediction: negative   Probability: 92.65%
Prediction: positive   Probability: 89.85%
Prediction: negative   Probability: 63.14%
Prediction: negative   Probability: 94.75%
Prediction: negative   Probability: 59.24%
Prediction: positive   Probability: 89.85%
Prediction: negative   Probability: 53.60%
Prediction: positive   Probability: 89.85%
Prediction: positive   Probability: 89.85%
Prediction:

In [63]:
merge=streamed_tweet_df.merge(analysis, how="inner", left_index=True, right_index=True)
merge

Unnamed: 0,tweet_id,tweet_text,user_name,tweet_loc,Prediction,Probability
0,1330477149816549380,RT @soldierspeaks: An #Indian soldier of 39 RR...,AmeerHa09250563,,negative,60.070946
1,1330477126043262976,RT @DIEBO37: New Chinese Weapon Can Launch Sui...,Alison1Eve,🇺🇸 🌍,negative,85.146959
2,1330477124193570818,RT @FredTJoseph: Kalief Browder died by suicid...,Pyro_mariner,Leeds init,negative,56.283844
3,1330477119701471234,RT @soldierspeaks: An #Indian soldier of 39 RR...,Rehmaan62546446,,negative,60.070946
4,1330477114131238914,RT @acidwheep: TW // suicide attempt \nits b...,nzvx__,,positive,83.340849
...,...,...,...,...,...,...
84,1330474598928953346,"RT @AliceBridget5: I am lost.. Far, so far fro...",RedEyesBiDrago1,,negative,88.711561
85,1330474475582861318,RT @jungkookie496: 💫 BTS 'BE' Album (Deluxe Ed...,Yyj6663,"Perak, Malaysia",negative,92.288782
86,1330473898329182214,RT @nunubebe0618: 🐻 oh really? \n👤 yes! \n🐻 ah...,sohwnumx,"25+, not following twin. ND",negative,62.513943
87,1330473884450222085,RT @DBurr_: How on earth do we end up with thi...,MaaattyB,Inverurie,positive,52.275340


In [64]:
mask = (merge['Probability'].ge(80)) & (merge['Prediction']=='positive')
merge['mask'] = np.where(mask,'Flagged', 'no problem')
merge

Unnamed: 0,tweet_id,tweet_text,user_name,tweet_loc,Prediction,Probability,mask
0,1330477149816549380,RT @soldierspeaks: An #Indian soldier of 39 RR...,AmeerHa09250563,,negative,60.070946,no problem
1,1330477126043262976,RT @DIEBO37: New Chinese Weapon Can Launch Sui...,Alison1Eve,🇺🇸 🌍,negative,85.146959,no problem
2,1330477124193570818,RT @FredTJoseph: Kalief Browder died by suicid...,Pyro_mariner,Leeds init,negative,56.283844,no problem
3,1330477119701471234,RT @soldierspeaks: An #Indian soldier of 39 RR...,Rehmaan62546446,,negative,60.070946,no problem
4,1330477114131238914,RT @acidwheep: TW // suicide attempt \nits b...,nzvx__,,positive,83.340849,Flagged
...,...,...,...,...,...,...,...
84,1330474598928953346,"RT @AliceBridget5: I am lost.. Far, so far fro...",RedEyesBiDrago1,,negative,88.711561,no problem
85,1330474475582861318,RT @jungkookie496: 💫 BTS 'BE' Album (Deluxe Ed...,Yyj6663,"Perak, Malaysia",negative,92.288782,no problem
86,1330473898329182214,RT @nunubebe0618: 🐻 oh really? \n👤 yes! \n🐻 ah...,sohwnumx,"25+, not following twin. ND",negative,62.513943,no problem
87,1330473884450222085,RT @DBurr_: How on earth do we end up with thi...,MaaattyB,Inverurie,positive,52.275340,no problem


In [65]:
for row in range(len(merge)):
        if merge['mask'][row]=='Flagged':
            print(row, merge['user_name'][row], merge['tweet_text'][row], merge['tweet_loc'][row])

4 nzvx__ RT @acidwheep: TW // suicide attempt   
its been a year since my first suicide attempt and a month since my last attempt, the fact that i’m… 
15 Jakama_Snr RT @nahashon87: I've seen many people depressed and almost suicidal because their parents demand too much from them. Please, usikie uchungu… Kenya
19 NiKhOoOLe RT @nahashon87: I've seen many people depressed and almost suicidal because their parents demand too much from them. Please, usikie uchungu… she/her • bi || 📍🇰🇪 
21 _tuthe RT @nahashon87: I've seen many people depressed and almost suicidal because their parents demand too much from them. Please, usikie uchungu… kiambu, Kenya
22 LegionnaireNo1 RT @nahashon87: I've seen many people depressed and almost suicidal because their parents demand too much from them. Please, usikie uchungu… 
24 reenafndy RT @sapiqmalik: tw// Suicide &amp; suicidal thoughts

Please don't mock his choice that he took his own life here.

To those who doesn't unders… she/her 🏳️‍🌈🏳️‍🌈
26 kevin_kiug

In [74]:
def send():
    user_list = {}
    for row in range(len(merge)):
        if merge['mask'][row]=='Flagged':
            user_list[merge['user_name'][row]] = merge['tweet_text'][row]
    return(user_list)

In [75]:
send()

{'nzvx__': 'RT @acidwheep: TW // suicide attempt   \nits been a year since my first suicide attempt and a month since my last attempt, the fact that i’m…',
 'Jakama_Snr': "RT @nahashon87: I've seen many people depressed and almost suicidal because their parents demand too much from them. Please, usikie uchungu…",
 'NiKhOoOLe': "RT @nahashon87: I've seen many people depressed and almost suicidal because their parents demand too much from them. Please, usikie uchungu…",
 '_tuthe': "RT @nahashon87: I've seen many people depressed and almost suicidal because their parents demand too much from them. Please, usikie uchungu…",
 'LegionnaireNo1': "RT @nahashon87: I've seen many people depressed and almost suicidal because their parents demand too much from them. Please, usikie uchungu…",
 'reenafndy': "RT @sapiqmalik: tw// Suicide &amp; suicidal thoughts\n\nPlease don't mock his choice that he took his own life here.\n\nTo those who doesn't unders…",
 'kevin_kiugi': "RT @nahashon87: I've seen 