In [1]:
!pip install jieba
!pip install paddlepaddle-tiny==1.6.1
!pip install stopwordsiso
!pip install shap

'''General'''
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import stopwordsiso
from stopwordsiso import stopwords
import jieba
import jieba.posseg as pseg
import time
import collections

'''Features'''
import shap 
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

'''Classifiers'''
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

'''CNN'''
import gensim
import tensorflow as tf
from gensim.models import Word2Vec
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential, regularizers
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization, concatenate
from tensorflow.keras.models import Model

'''Metrics/Evaluation'''
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix, classification_report

'''Display'''
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
import warnings
warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:,.4f}'.format
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)



Using TensorFlow backend.


In [2]:
depressed = pd.read_json('depressed.json')
normal = pd.read_json('normal.json')
normal = normal.loc[normal['label']==0]

In [3]:
print(depressed.shape)
print(normal.shape)

(10325, 11)
(10848, 11)


In [4]:
depressed.head()

Unnamed: 0,all_tweet_count,birthday,gender,label,nickname,num_of_follower,num_of_following,original_tweet_count,profile,repost_tweet_count,tweets
0,10,1993-12-10,男,1,迷失路径,1,0,10,此人严重丧，不适绕行，谢谢,0,"[{'tweet_content': '有点累想休息了', 'posting_time': ..."
1,219,1997-06-07,女,1,九七97吖,173,76,79,无,70,"[{'tweet_content': '轉發微博', 'posting_time': '20..."
2,6188,无,女,1,姑先十八-,22,205,24,喜提呼伦贝尔大草原五个月。,5,[{'tweet_content': '抑郁症的普及它好就好在 以前人抑郁的时候想死 现在人...
3,62,无,女,1,大邱百香果,2,9,17,无,6,"[{'tweet_content': '', 'posting_time': '2017-0..."
4,6,2001-09-25,女,1,落墨暮烟,1,27,6,无,0,[{'tweet_content': '所有人都问我你没事吧你还好吧 我笑着一一回答我很好啊...


In [5]:
normal.head()

Unnamed: 0,all_tweet_count,birthday,gender,label,nickname,num_of_follower,num_of_following,original_tweet_count,profile,repost_tweet_count,tweets
1,2877,2000-08-02,女,0,心无挂碍wxq,581,968,98,传播正能量，广结善缘。每天懂一点佛学知识，感悟人生，净化身心，弘扬佛法，传播中国传统文化，便...,0,"[{'tweet_content': '无', 'posting_time': '2020-..."
2,1326,1998-04-10,女,0,透心凉A兔子,1363,241,97,无,1,[{'tweet_content': '单身狗不能接受的路过 还有谁不知道里的求婚场面今晚的...
4,1883,无,男,0,路路090909,141,1480,62,所谓活着的人，就是不断挑战的人，不断攀登命运峻峰的人。,38,"[{'tweet_content': '我不喜欢这世界我只喜欢你', 'posting_ti..."
6,398,无,女,0,Yuancuxin,6649,130,94,浪漫至死不渝?,6,[{'tweet_content': '大抵是因为身边人都很温柔 所以我经不起一点点凶 嘿嘿...
7,258,无,女,0,梦醒泪落88,1400,863,86,88年老阿姨！！私有财产两个儿子??,4,"[{'tweet_content': '快手千万现金红包等你来拿', 'posting_ti..."


In [6]:
depressed.isna().sum()

all_tweet_count         0
birthday                0
gender                  0
label                   0
nickname                0
num_of_follower         0
num_of_following        0
original_tweet_count    0
profile                 0
repost_tweet_count      0
tweets                  0
dtype: int64

In [7]:
normal.isna().sum()

all_tweet_count         0
birthday                0
gender                  0
label                   0
nickname                0
num_of_follower         0
num_of_following        0
original_tweet_count    0
profile                 0
repost_tweet_count      0
tweets                  0
dtype: int64

In [8]:
STOPWORDS = stopwords(["zh"])  # Chinese
STOPWORDS.update({"轉發微博", "轉發", "微博"})
len(STOPWORDS)


797

In [9]:
def isChinese(uchar):
    if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
        return uchar
    else:
        return ' '

def format_str(content):
    content_str = ''
    for i in content:
        content_str = content_str + isChinese(i)
    return content_str

In [10]:
# !pip install emoji
# from emoji import UNICODE_EMOJI

# def is_emoji(s):
#     return s in UNICODE_EMOJI

# def count_emoji(content):
#     content_str = ''
#     for i in content:
#         if i in UNICODE_EMOJI:
#             content_str = content_str + i
#     return content_str

# Data Pre-processing
- Data cleaning
- Feature engineering
- EDA

In [11]:
jieba.enable_paddle() 
# jieba.enable_parallel() will cause error if enabled here

def clean_data(df):
    df = df[['tweets', 'label']]
    # df = df[['tweets', 'label']].sample(1000, random_state=1)

    # extract all post content into flattern array
    df['tweets_flat'] = df['tweets'].apply(lambda tweets: ' '.join([tweet['tweet_content'] for tweet in tweets]))
    # remove alphanumerical and special chars, keep only Chinese
    df['tweets_chinese'] = df['tweets_flat'].apply(lambda tweets: format_str(tweets))
    # text segmentation using JIEBA, paddle mode
    df['tweets_cut'] = df['tweets_chinese'].apply(lambda t: ' '.join(list(jieba.cut(t, 
                                                                                use_paddle=True
    #                                                                           cut_all=True
                                                                                ))))
    # remove stopwords
    df['tweets_clean'] = df['tweets_cut'].apply(lambda t: ' '.join([word for word in t.split(' ') if word not in STOPWORDS ]))
    
    return df


pos_cols = ['n', 'f', 's', 't', 'nr', 'ns', 'nt', 'nw', 'nz', 'v', 'vd', 'vn', 'a', 'ad', 'an', 'd', 'm', 'q', 'r', 'p', 'c', 'u', 'xc', 'w', 'PER', 'LOC', 'ORG', 'TIME', 'O']

# generate pos tag features
def generate_pos(df):   
    df = df.join(pd.DataFrame(
        [[0]*29], 
        index=df.index, 
        columns=pos_cols
    ))
    for index, row in df.iterrows():
        words = pseg.cut(row['tweets_chinese'],use_paddle=True)
        for word, flag in words:
            row[flag] += 1
        df.at[index] = row
        
    return df


sentiment_dict = "BosonNLP_sentiment_score.txt"
with open(sentiment_dict) as f:
    boson_dict = f.read().splitlines() 

# generate pos tag features
def generate_sentiment(df):
    pos = []
    neg = []
    
    # sentiment threshold, the greater the value, the stronger the emotion
    threshold = 1

    # chars to be removed in sentiment dict
    pattern = re.compile("[A-Za-z0-9.·:_=／￥—@#%\-\+\\\]+")

    # filter dict by threhold and pattern
    for w in boson_dict:
        if w == '':
            pass
        else:
            word = w.split(' ')[0]
            value = w.split(' ')[1]
            if float(value) >= threshold and not pattern.match(word):
                pos.append(word)
            elif float(value) <= threshold and not pattern.match(word):
                neg.append(word)
#     print('positive length: %d, negative length: %d' %(len(pos),len(neg)))

    df = df.drop(['n_pos','n_neg'], axis=1, errors='ignore')
    df['n_pos'] = df['tweets_clean'].apply(lambda x: np.isin(list(filter(str.strip,x.split(" "))), pos).sum())
    df['n_neg'] = df['tweets_clean'].apply(lambda x: np.isin(list(filter(str.strip,x.split(" "))), neg).sum())
    
    return df

Paddle enabled successfully......


In [12]:
users = pd.concat([depressed, normal], axis=0)
print(users.shape)

(21173, 11)


In [14]:
# process in chunk
# if load all data into memory and run pre-process will cause kernel died

start = 0
offset = 1000

while start < 21173: 
    print('progress:', start)
    
    limit = start + offset
    df = clean_data(users.iloc[start:limit])
    df = generate_pos(df)
    df = generate_sentiment(df)
    df.to_csv("cleaned_"+str(start)+'-'+str(limit)+'.csv', index=False)
    
    start = limit


progress: 14200
progress: 15200
progress: 16200
progress: 17200
progress: 18200
progress: 19200
progress: 20200


In [17]:
# read back chunked results and concate together

import glob

df_train = pd.DataFrame()
for file in glob.glob("cleaned_*.csv"):
    df = pd.read_csv(file)
    if df_train.empty:
        df_train = df
    else:
        df_train = df_train.append(df)

df_train.head()

Unnamed: 0,tweets,label,tweets_flat,tweets_chinese,tweets_cut,tweets_clean,n,f,s,t,nr,ns,nt,nw,nz,v,vd,vn,a,ad,an,d,m,q,r,p,c,u,xc,w,PER,LOC,ORG,TIME,O,n_pos,n_neg
0,"[{'tweet_content': '请多多注意防护预防感染常戴口罩', 'posting...",0,请多多注意防护预防感染常戴口罩 走在乡间的小路上乡村风景美如画 空运的螃蟹到了快来领取啊 何...,请多多注意防护预防感染常戴口罩 走在乡间的小路上乡村风景美如画 空运的螃蟹到了快来领取啊 何...,请 多多 注意 防护 预防 感染 常 戴口罩 走 在 乡间 的 小路 上 乡村 风景 美...,多多 注意 防护 预防 感染 常 戴口罩 走 乡间 小路 乡村 风景 美如画 空运 螃蟹...,562,47,18,16,47,3,9,1,48,763,6,67,168,14,9,202,79,13,296,125,82,337,36,2,24,109,13,14,0,330,1095
1,"[{'tweet_content': '你敢不敢让我中个小恐龙', 'posting_tim...",0,你敢不敢让我中个小恐龙 周年庆邀你盛装出席 5月1日5月8日参与周年庆活动分享你的周年庆时装...,你敢不敢让我中个小恐龙 周年庆邀你盛装出席 月 日 月 日参与周年庆活动分享你的周年庆时装...,你 敢不敢 让 我 中 个 小 恐龙 周年庆 邀 你 盛装 出席 月 日 月 日 参...,敢不敢 恐龙 周年庆 邀 盛装 出席 参与 周年庆 活动 分享 周年庆 时装 穿 搭...,364,24,7,7,67,0,3,0,12,645,1,18,126,14,3,194,44,21,223,50,36,175,76,3,7,8,1,41,0,151,702
2,[{'tweet_content': '滴 613打卡成功 天气好的一天 看见小泽自拍 心情...,0,滴 613打卡成功 天气好的一天 看见小泽自拍 心情更加好 以后多发呗 早上好呀天泽 天气好...,滴 打卡成功 天气好的一天 看见小泽自拍 心情更加好 以后多发呗 早上好呀天泽 天气好...,滴 打卡 成功 天气 好 的 一天 看见 小泽 自拍 心情 更加 好 ...,滴 打卡 成功 天气 一天 看见 小泽 自拍 心情 更加 以后 多发 ...,81,13,0,0,16,0,0,0,1,116,1,1,38,2,0,22,2,1,11,3,3,31,28,0,13,0,0,28,0,54,123
3,[{'tweet_content': '有没有人跟我一样的卡了我2天52个精英令用都用不鸟这...,0,有没有人跟我一样的卡了我2天52个精英令用都用不鸟这波过去不卸载我是 我在打卡啦每日签到领红...,有没有人跟我一样的卡了我 天 个精英令用都用不鸟这波过去不卸载我是 我在打卡啦每日签到领红...,有没有 人 跟 我 一样 的 卡 了 我 天 个 精英令 用 都 用 不 鸟 这 波 ...,有没有 卡 天 精英令 鸟 波 过去 卸载 打卡 每日 签到 领 红包 签到 越多...,292,6,9,5,32,0,3,0,16,450,2,23,84,9,13,111,35,9,142,52,19,143,52,2,9,2,5,10,0,159,490
4,[{'tweet_content': '婚姻不分年龄希望决定结婚之前能够想清楚对方是你想要相...,0,婚姻不分年龄希望决定结婚之前能够想清楚对方是你想要相伴一生的人而且不管前路好坏面对诱惑时你还...,婚姻不分年龄希望决定结婚之前能够想清楚对方是你想要相伴一生的人而且不管前路好坏面对诱惑时你还...,婚姻 不分 年龄 希望 决定 结婚之前 能够 想清楚 对方 是 你 想要 相伴 一生 的 人...,婚姻 不分 年龄 希望 决定 结婚之前 能够 想清楚 想要 相伴 一生 前路 好坏 面对 诱...,341,23,4,18,23,0,1,0,7,632,3,37,124,17,11,232,44,7,227,67,54,187,22,0,18,27,6,13,0,230,662


In [30]:
df_train.shape

(21073, 38)

In [20]:
fe_cols = pos_cols+['n_pos', 'n_neg']

df_train['word_count'] = df_train.iloc[:, -31:-2].sum(axis=1)
for col in fe_cols:
    df_train[col] = df_train[col]/df_train['word_count']
df_train.head()

Unnamed: 0,tweets,label,tweets_flat,tweets_chinese,tweets_cut,tweets_clean,n,f,s,t,nr,ns,nt,nw,nz,v,vd,vn,a,ad,an,d,m,q,r,p,c,u,xc,w,PER,LOC,ORG,TIME,O,n_pos,n_neg,word_count
0,"[{'tweet_content': '请多多注意防护预防感染常戴口罩', 'posting...",0,请多多注意防护预防感染常戴口罩 走在乡间的小路上乡村风景美如画 空运的螃蟹到了快来领取啊 何...,请多多注意防护预防感染常戴口罩 走在乡间的小路上乡村风景美如画 空运的螃蟹到了快来领取啊 何...,请 多多 注意 防护 预防 感染 常 戴口罩 走 在 乡间 的 小路 上 乡村 风景 美...,多多 注意 防护 预防 感染 常 戴口罩 走 乡间 小路 乡村 风景 美如画 空运 螃蟹...,0.1953,0.0163,0.0063,0.0056,0.0163,0.001,0.0031,0.0003,0.0167,0.2651,0.0021,0.0233,0.0584,0.0049,0.0031,0.0702,0.0274,0.0045,0.1028,0.0434,0.0285,0.1171,0.0125,0.0007,0.0083,0.0379,0.0045,0.0049,0.0,0.1147,0.3805,2878
1,"[{'tweet_content': '你敢不敢让我中个小恐龙', 'posting_tim...",0,你敢不敢让我中个小恐龙 周年庆邀你盛装出席 5月1日5月8日参与周年庆活动分享你的周年庆时装...,你敢不敢让我中个小恐龙 周年庆邀你盛装出席 月 日 月 日参与周年庆活动分享你的周年庆时装...,你 敢不敢 让 我 中 个 小 恐龙 周年庆 邀 你 盛装 出席 月 日 月 日 参...,敢不敢 恐龙 周年庆 邀 盛装 出席 参与 周年庆 活动 分享 周年庆 时装 穿 搭...,0.186,0.0123,0.0036,0.0036,0.0342,0.0,0.0015,0.0,0.0061,0.3296,0.0005,0.0092,0.0644,0.0072,0.0015,0.0991,0.0225,0.0107,0.1139,0.0255,0.0184,0.0894,0.0388,0.0015,0.0036,0.0041,0.0005,0.021,0.0,0.0772,0.3587,1957
2,[{'tweet_content': '滴 613打卡成功 天气好的一天 看见小泽自拍 心情...,0,滴 613打卡成功 天气好的一天 看见小泽自拍 心情更加好 以后多发呗 早上好呀天泽 天气好...,滴 打卡成功 天气好的一天 看见小泽自拍 心情更加好 以后多发呗 早上好呀天泽 天气好...,滴 打卡 成功 天气 好 的 一天 看见 小泽 自拍 心情 更加 好 ...,滴 打卡 成功 天气 一天 看见 小泽 自拍 心情 更加 以后 多发 ...,0.2109,0.0339,0.0,0.0,0.0417,0.0,0.0,0.0,0.0026,0.3021,0.0026,0.0026,0.099,0.0052,0.0,0.0573,0.0052,0.0026,0.0286,0.0078,0.0078,0.0807,0.0729,0.0,0.0339,0.0,0.0,0.0729,0.0,0.1406,0.3203,384
3,[{'tweet_content': '有没有人跟我一样的卡了我2天52个精英令用都用不鸟这...,0,有没有人跟我一样的卡了我2天52个精英令用都用不鸟这波过去不卸载我是 我在打卡啦每日签到领红...,有没有人跟我一样的卡了我 天 个精英令用都用不鸟这波过去不卸载我是 我在打卡啦每日签到领红...,有没有 人 跟 我 一样 的 卡 了 我 天 个 精英令 用 都 用 不 鸟 这 波 ...,有没有 卡 天 精英令 鸟 波 过去 卸载 打卡 每日 签到 领 红包 签到 越多...,0.2083,0.0043,0.0064,0.0036,0.0228,0.0,0.0021,0.0,0.0114,0.321,0.0014,0.0164,0.0599,0.0064,0.0093,0.0792,0.025,0.0064,0.1013,0.0371,0.0136,0.102,0.0371,0.0014,0.0064,0.0014,0.0036,0.0071,0.0,0.1134,0.3495,1402
4,[{'tweet_content': '婚姻不分年龄希望决定结婚之前能够想清楚对方是你想要相...,0,婚姻不分年龄希望决定结婚之前能够想清楚对方是你想要相伴一生的人而且不管前路好坏面对诱惑时你还...,婚姻不分年龄希望决定结婚之前能够想清楚对方是你想要相伴一生的人而且不管前路好坏面对诱惑时你还...,婚姻 不分 年龄 希望 决定 结婚之前 能够 想清楚 对方 是 你 想要 相伴 一生 的 人...,婚姻 不分 年龄 希望 决定 结婚之前 能够 想清楚 想要 相伴 一生 前路 好坏 面对 诱...,0.1676,0.0113,0.002,0.0088,0.0113,0.0,0.0005,0.0,0.0034,0.3107,0.0015,0.0182,0.061,0.0084,0.0054,0.1141,0.0216,0.0034,0.1116,0.0329,0.0265,0.0919,0.0108,0.0,0.0088,0.0133,0.0029,0.0064,0.0,0.1131,0.3255,2034


In [22]:
df_train.to_csv('df_train.csv', index=False)

# BOW

In [37]:
count_vect = CountVectorizer(binary=False, max_df=0.6, min_df=5)
#                             token_pattern='[a-z]{2,15}', 
                            
X = count_vect.fit_transform(df_train['tweets_clean'].astype('str')) # text features
y = df_train['label'].values # target

X_fe = df_train[fe_cols].to_numpy()
X = hstack([X, X_fe])

print(X.shape)
print(y.shape)


(21073, 115217)
(21073,)


In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 1, test_size=0.3)

nb = MultinomialNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)

print(classification_report(y_test, nb_pred))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      3224
           1       0.91      0.93      0.92      3098

    accuracy                           0.92      6322
   macro avg       0.92      0.92      0.92      6322
weighted avg       0.92      0.92      0.92      6322



In [33]:
# feature importance for Navie Bayes
normal_prob_sorted = nb.feature_log_prob_[0, :].argsort()[::-1]
depressed_prob_sorted = nb.feature_log_prob_[1, :].argsort()[::-1]

print(np.take(count_vect.get_feature_names()+fe_cols, normal_prob_sorted[:10]))
print(np.take(count_vect.get_feature_names()+fe_cols, depressed_prob_sorted[:10]))

['视频' '一起' '中国' '红包' '努力' '人生' '快手' '工作' '孩子' '分享']
['痛苦' '抑郁症' '抑郁' '情绪' '孩子' '不知道' '一个人' '事情' '很多' '其实']


# TFIDF

In [129]:
tfidf_vect = TfidfVectorizer(ngram_range=(1, 1), min_df=10)

X = tfidf_vect.fit_transform(df_train['tweets_clean'].astype('str')) # text features
y = df_train['label'].values # target

X_fe = df_train[fe_cols].to_numpy()
X = hstack([X, X_fe])

print (X.shape)
print(y.shape)

(21073, 70853)
(21073,)


In [26]:

#Preliminary model evaluation using default parameters

#Creating a dict of the models
model_dict = {'Logistic Regression': LogisticRegression(random_state=3),
              'Stochastic Gradient Descent' : SGDClassifier(random_state=3, loss='log'),
              'Random Forest': RandomForestClassifier(random_state=3),
              'Decsision Tree': DecisionTreeClassifier(random_state=3),
              'AdaBoost': AdaBoostClassifier(random_state=3),
              'XGBoost': XGBClassifier(random_state=3),
              'K Nearest Neighbor': KNeighborsClassifier()}

#Train test split with stratified sampling for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = .2, 
                                                    shuffle = True, 
                                                    stratify = y, 
                                                    random_state = 3)

print (X_train.shape)
print(y_train.shape)

#Function to get the scores for each model in a df
def model_score_df(model_dict):   
    model_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []
    for k,v in model_dict.items():   
        model_name.append(k)
        v.fit(X_train, y_train)
        y_pred = v.predict(X_test)
        ac_score_list.append(accuracy_score(y_test, y_pred))
        p_score_list.append(precision_score(y_test, y_pred, average='macro'))
        r_score_list.append(recall_score(y_test, y_pred, average='macro'))
        f1_score_list.append(f1_score(y_test, y_pred, average='macro'))
        model_comparison_df = pd.DataFrame([model_name, ac_score_list, p_score_list, r_score_list, f1_score_list]).T
        model_comparison_df.columns = ['model_name', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score']
        model_comparison_df = model_comparison_df.sort_values(by='f1_score', ascending=False)
    return model_comparison_df

model_score_df(model_dict)

(16858, 70853)
(16858,)


Unnamed: 0,model_name,accuracy_score,precision_score,recall_score,f1_score
5,XGBoost,0.958,0.9587,0.9577,0.958
4,AdaBoost,0.9544,0.9549,0.9542,0.9544
0,Logistic Regression,0.9521,0.9539,0.9515,0.952
1,Stochastic Gradient Descent,0.9499,0.9518,0.9494,0.9498
2,Random Forest,0.9473,0.9486,0.9469,0.9472
3,Decsision Tree,0.9345,0.9345,0.9345,0.9345
6,K Nearest Neighbor,0.8925,0.8925,0.8924,0.8925


In [65]:
# X_train, X_test, y_train, y_test = train_test_split(X, 
#                                                     y, 
#                                                     test_size = .3, 
#                                                     shuffle = True, 
#                                                     stratify = y, 
#                                                     random_state = 3)

v = XGBClassifier(learning_rate=0.5, n_estimater=75, max_depth=3, objective='binary:logistic', random_state=3)
# v = XGBClassifier(objective='binary:logistic', random_state=3)


# v = AdaBoostClassifier(n_estimators=500, random_state=3)
v.fit(X_train, y_train)
y_pred = v.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred, average='macro'))
print(recall_score(y_test, y_pred, average='macro'))
print(f1_score(y_test, y_pred, average='macro'))

# 0.9647263524201202
# 0.9650854385082227
# 0.9645073384413542
# 0.964694126545599

0.9615627965833597
0.9619342656020473
0.9613362178752962
0.9615269405404674


# Hyperparameter Tuning
- XGBoost
- AdaBoost
- Logistic Regression	

In [48]:
#Hyperparameter tuning
#Gridsearch with 5-fold cross validation

#LR
solver =  ['saga', 'liblinear', 'lbfgs']
penalty = ['l2','l1']
C = [100, 10, 1.0, 0.1]
max_iter = [100, 500]
random_state = [3]

lr = LogisticRegression()

params = dict(solver=solver,
              penalty=penalty,
              C=C,
              max_iter=max_iter,
              random_state=random_state)

gridsearch = GridSearchCV(lr,
                          params,
                          cv = 5,
                          scoring = 'recall',
                          verbose = 1, 
                          n_jobs = -1)

lr_best_model = gridsearch.fit(X, y)
# LogisticRegression(solver='liblinear', C=0.1, penalty='l2', random_state=3)

In [None]:
#Hyperparameter tuning
#Gridsearch with 5-fold cross validation
#Warning this can take a long time!!!

#SGD
loss =  ['log']
penalty = ['l2','l1']
alpha = [1e-6, 1e-3, 1e-1, 1e0]
max_iter = [5, 1000, 10000]
tol = [None, 1e-3]
eta0 = [0.1, 0.001]

random_state = [3]

clf = SGDClassifier()

params = dict(loss=loss,
              penalty=penalty,
              alpha=alpha,
              max_iter=max_iter,
              tol=tol,
              random_state=random_state)

gridsearch = GridSearchCV(clf,
                          params,
                          cv = 5,
                          scoring = 'recall',
                          verbose = 1, 
                          n_jobs = -1)

sgd_best_model = gridsearch.fit(X, y)

In [27]:
#Hyperparameter tuning
#Gridsearch with 5-fold cross validation
#Warning this can take a long time!!!

#AdaBoost
n_estimators = [10, 50, 100, 500]
learning_rate = [0.0001, 0.001, 0.01, 0.1, 1.0]

random_state = [3]

clf = AdaBoostClassifier()

params = dict(n_estimators=n_estimators,
              learning_rate=learning_rate,
              random_state=random_state)

gridsearch = GridSearchCV(clf,
                          params,
                          cv = 5,
                          scoring = 'recall',
                          verbose = 1, 
                          n_jobs = -1)

ada_best_model = gridsearch.fit(X, y)
# AdaBoostClassifier(n_estimators=500, random_state=3)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
#Hyperparameter tuning
#Gridsearch with 5-fold cross validation

#Xgbost
n_estimators = [50,75,100]
max_depth = [3,6,9]
learning_rate = [0.01,0.1,0.5]
random_state = [3]

clf = XGBClassifier(objective='binary:logistic',eval_metric='error')

params = dict(n_estimators=n_estimators,
              max_depth = max_depth,
              learning_rate = learning_rate,
              random_state=random_state)

gridsearch = GridSearchCV(clf,
                          params,
                          cv = 5,
                          scoring = 'recall',
                          verbose = 1,
                          n_jobs = -1)

xgb_best_model = gridsearch.fit(X_train, y_train)

In [64]:
lr_best_model.best_estimator_

In [45]:
ada_best_model.best_estimator_

AdaBoostClassifier(n_estimators=500, random_state=3)

In [139]:
xgb_best_model.best_estimator_

# Test On New Dataset Test1

In [115]:
test_new_depressed = pd.read_csv('test_depressed_new.csv', index_col=0)
test_new_depressed['label'] = 1
test_new_normal = pd.read_csv('test_normal_new.csv', index_col=0)
test_new_normal['label'] = 0
# test_new = pd.concat([test_new_depressed, test_new_normal], axis=0)
# test_new.reset_index(drop=True)

In [116]:
test_new_depressed.head()

Unnamed: 0,tweets,label
0,姐姐妹妹 奇迹暖暖 真的没人觉得这个摆件像鞋垫吗？还是卷了的，超级不舒服的感觉 抑郁症抑...,1
1,总之不要放弃就对了吧？ 今天的花好好看 真的太敏感太脆弱了。再也受不了任何委屈了。,1
2,张国荣#十七周年继续宠爱张国荣# 茫然兩眼已漸紅 皆因想你 想他嗎？ #男朋友是怎么宠你...,1
3,手抖冒冷汗心跳快一下午了 真好 下雨又不用出门了 第五人格出个号子 网页链接成功出的话从...,1
4,我吃果子只是为了跟花有点联系——《顾城诗集》,1


In [117]:
test_new_normal.head()

Unnamed: 0,tweets,label
0,又染得比黑更黑 害 好想吃烤肉啊(指么子烤肉) 嗯嗯真好 // On April 2...,0
1,今天的菜有点优秀 阿迪耐克彪马不能买了 鬼塚虎没问题吧 这些牌子一作死 没鞋穿了 这次搬...,0
2,#集福牛开福运# 福运正当头，牛年我最牛！我在福运红包中开出了 @让红包飞 送出的【2.18...,0
3,#中国超2亿人单身#哈哈哈哈哈哈没有我我已经有金珉锡了哈哈哈哈😄 我在#森林驿站#帮助了大...,0
4,每日一善在微博多发大牌的名字 他会检测到你消费偏好比较高 然后提升你的信用分中奖率会提高我希...,0


In [118]:
# preprocess text on test data

test_new_depressed['tweets_chinese'] = test_new_depressed['tweets'].apply(lambda tweets: format_str(tweets))
  
test_new_depressed['tweets_cut'] = test_new_depressed['tweets_chinese'].apply(lambda t: ' '.join(list(jieba.cut(t, 
                                                                                 use_paddle=True
#                                                                                  cut_all=True
                                                                                ))))
test_new_depressed['tweets_clean'] = test_new_depressed['tweets_cut'].apply(lambda t: ' '.join([word for word in t.split(' ') if word not in STOPWORDS ]))

test_new_depressed = generate_pos(test_new_depressed)
test_new_depressed = generate_sentiment(test_new_depressed)


test_new_normal['tweets_chinese'] = test_new_normal['tweets'].apply(lambda tweets: format_str(tweets))
  
test_new_normal['tweets_cut'] = test_new_normal['tweets_chinese'].apply(lambda t: ' '.join(list(jieba.cut(t, 
                                                                                 use_paddle=True
#                                                                                  cut_all=True
                                                                                ))))
test_new_normal['tweets_clean'] = test_new_normal['tweets_cut'].apply(lambda t: ' '.join([word for word in t.split(' ') if word not in STOPWORDS ]))


test_new_normal = generate_pos(test_new_normal)
test_new_normal = generate_sentiment(test_new_normal)

test_new = pd.concat([test_new_depressed, test_new_normal], axis=0)

test_new['word_count'] = test_new.iloc[:, -31:-2].sum(axis=1)
for col in fe_cols:
    test_new[col] = test_new[col]/test_new['word_count']

test_new.head()


Unnamed: 0,tweets,label,tweets_chinese,tweets_cut,tweets_clean,n,f,s,t,nr,ns,nt,nw,nz,v,vd,vn,a,ad,an,d,m,q,r,p,c,u,xc,w,PER,LOC,ORG,TIME,O,n_pos,n_neg,word_count
0,姐姐妹妹 奇迹暖暖 真的没人觉得这个摆件像鞋垫吗？还是卷了的，超级不舒服的感觉 抑郁症抑...,1,姐姐妹妹 奇迹暖暖 真的没人觉得这个摆件像鞋垫吗 还是卷了的 超级不舒服的感觉 抑郁症抑...,姐姐 妹妹 奇迹 暖暖 真 的 没 人 觉得 这个 摆件 像 鞋垫 吗 还是 卷...,姐姐 妹妹 奇迹 暖暖 真 没 觉得 摆件 鞋垫 卷 超级 不舒服 感觉 ...,0.1571,0.013,0.0031,0.0061,0.0282,0.0,0.0,0.0,0.016,0.3204,0.0038,0.0061,0.0412,0.0084,0.0031,0.0694,0.0214,0.0031,0.132,0.0252,0.0229,0.0854,0.0137,0.0008,0.0031,0.0,0.0008,0.016,0.0,0.045,0.3547,1311
1,总之不要放弃就对了吧？ 今天的花好好看 真的太敏感太脆弱了。再也受不了任何委屈了。,1,总之不要放弃就对了吧 今天的花好好看 真的太敏感太脆弱了 再也受不了任何委屈了,总之 不要 放弃 就 对 了 吧 今天 的 花 好 好看 真 的 太 敏感 太 脆...,不要 放弃 今天 花 好看 真 太 敏感 太 脆弱 受不了 委屈,0.0769,0.0,0.0,0.0,0.0385,0.0,0.0,0.0,0.0,0.1154,0.0,0.0385,0.1154,0.0,0.0,0.2308,0.0,0.0,0.0385,0.0385,0.0385,0.1154,0.1154,0.0,0.0,0.0,0.0,0.0385,0.0,0.1154,0.3077,26
2,张国荣#十七周年继续宠爱张国荣# 茫然兩眼已漸紅 皆因想你 想他嗎？ #男朋友是怎么宠你...,1,张国荣 十七周年继续宠爱张国荣 茫然兩眼已漸紅 皆因想你 想他嗎 男朋友是怎么宠你...,张国荣 十七周年 继续 宠爱 张国荣 茫然 兩眼 已 漸 紅 皆因 想你 ...,张国荣 十七周年 继续 宠爱 张国荣 茫然 兩眼 漸 紅 皆因 想你 想 ...,0.0956,0.0,0.0,0.0,0.114,0.0,0.011,0.0,0.0257,0.2868,0.0257,0.0,0.0478,0.0,0.0,0.0404,0.0,0.0,0.1434,0.011,0.0037,0.0882,0.0294,0.0,0.0368,0.0037,0.0,0.0368,0.0,0.0882,0.2684,272
3,手抖冒冷汗心跳快一下午了 真好 下雨又不用出门了 第五人格出个号子 网页链接成功出的话从...,1,手抖冒冷汗心跳快一下午了 真好 下雨又不用出门了 第五人格出个号子 网页链接成功出的话从...,手抖 冒 冷汗 心跳 快 一 下午 了 真好 下雨 又 不用 出门 了 第五 ...,手抖 冷汗 心跳 快 下午 真好 下雨 不用 出门 第五 人格 出 号子 ...,0.165,0.0061,0.0017,0.005,0.0742,0.0,0.005,0.0011,0.0205,0.2935,0.0006,0.0094,0.0476,0.0078,0.0022,0.0714,0.0371,0.0083,0.0753,0.0194,0.01,0.0764,0.0305,0.0006,0.0144,0.0006,0.0011,0.0155,0.0,0.0952,0.3295,1806
4,我吃果子只是为了跟花有点联系——《顾城诗集》,1,我吃果子只是为了跟花有点联系 顾城诗集,我 吃 果子 只是 为了 跟 花 有点 联系 顾城 诗集,吃 果子 花 有点 联系 顾城 诗集,0.25,0.0,0.0,0.0,0.0833,0.0,0.0,0.0,0.0,0.1667,0.0,0.0,0.0,0.0,0.0,0.1667,0.0,0.0,0.0833,0.1667,0.0,0.0,0.0,0.0,0.0833,0.0,0.0,0.0,0.0,0.3333,0.25,12


In [119]:
test_new.shape

(400, 37)

In [130]:

X_new = tfidf_vect.transform(test_new['tweets_clean'].astype('str')) # text features
X_fe = test_new[fe_cols].to_numpy()
X_new = hstack([X_new, X_fe])
y_new = test_new['label'].values # target

#Function to get the scores for each model in a df
def model_score_df(model_dict):   
    model_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []
    for k,v in model_dict.items():   
        model_name.append(k)
        v.fit(X, y)
        y_pred = v.predict(X_new)
        ac_score_list.append(accuracy_score(y_new, y_pred))
        p_score_list.append(precision_score(y_new, y_pred, average='macro'))
        r_score_list.append(recall_score(y_new, y_pred, average='macro'))
        f1_score_list.append(f1_score(y_new, y_pred, average='macro'))
        model_comparison_df = pd.DataFrame([model_name, ac_score_list, p_score_list, r_score_list, f1_score_list]).T
        model_comparison_df.columns = ['model_name', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score']
        model_comparison_df = model_comparison_df.sort_values(by='f1_score', ascending=False)
    return model_comparison_df

model_score_df(model_dict)

Unnamed: 0,model_name,accuracy_score,precision_score,recall_score,f1_score
5,XGBoost,0.84,0.8422,0.84,0.8397
3,Decsision Tree,0.83,0.8301,0.83,0.83
4,AdaBoost,0.8175,0.8179,0.8175,0.8174
0,Logistic Regression,0.7975,0.8022,0.7975,0.7967
1,Stochastic Gradient Descent,0.7825,0.7863,0.7825,0.7818
2,Random Forest,0.7625,0.7793,0.7625,0.7589
6,K Nearest Neighbor,0.695,0.6989,0.695,0.6935


In [132]:
#Creating a dict of the best models
best_model_dict = {'Logistic Regression': LogisticRegression(solver='liblinear', C=0.1, penalty='l2', random_state=3),
              'AdaBoost': AdaBoostClassifier(n_estimators=500, random_state=3),
              'XGBoost': XGBClassifier(learning_rate=0.5, n_estimater=75, max_depth=3, objective='binary:logistic', random_state=3)}

model_score_df(best_model_dict)

Unnamed: 0,model_name,accuracy_score,precision_score,recall_score,f1_score
2,XGBoost,0.8425,0.8427,0.8425,0.8425
1,AdaBoost,0.83,0.8305,0.83,0.8299
0,Logistic Regression,0.7625,0.7729,0.7625,0.7602


# Test On New Dataset Test2

In [133]:
test_2_depressed = pd.read_csv('test_depressed_new_2.csv', index_col=0)
test_2_depressed['label'] = 1

In [134]:
test_2_depressed.head()

Unnamed: 0,tweets,label
0,李佳琦是不是在我购物车和收藏夹里装了摄像头 翻到一张要素过多的截图 又买到一条合适舒服的...,1
1,生活碎片（1） 我对Yeezy是真的一直爱不起来完全没有想上脚的欲望… 为什么每次别人找...,1
2,机票又便宜了我不管不管人事调动多频繁 会不会影响到我出去玩的计划6月我该去成都还去哈哈哈哈哈...,1
3,抑郁症 最近做梦总是在挨骂，被亲近的人骂被不亲近的人骂，被认识的人骂被不认识的人骂，我记不清...,1
4,#三星堆遗址考古重大发现# 啊哈哈哈哈哈 三星堆发现新的文物 直接把三叔干上了热门 你们是存...,1


In [135]:
test_2_depressed['tweets_chinese'] = test_2_depressed['tweets'].apply(lambda tweets: format_str(tweets))
  
test_2_depressed['tweets_cut'] = test_2_depressed['tweets_chinese'].apply(lambda t: ' '.join(list(jieba.cut(t, 
                                                                                 use_paddle=True
#                                                                                  cut_all=True
                                                                                ))))
test_2_depressed['tweets_clean'] = test_2_depressed['tweets_cut'].apply(lambda t: ' '.join([word for word in t.split(' ') if word not in STOPWORDS ]))

test_2_depressed = generate_pos(test_2_depressed)
test_2_depressed = generate_sentiment(test_2_depressed)

test_2_depressed['word_count'] = test_2_depressed.iloc[:, -31:-2].sum(axis=1)
for col in fe_cols:
    test_2_depressed[col] = test_2_depressed[col]/test_2_depressed['word_count']

test_2_depressed.head()


Unnamed: 0,tweets,label,tweets_chinese,tweets_cut,tweets_clean,n,f,s,t,nr,ns,nt,nw,nz,v,vd,vn,a,ad,an,d,m,q,r,p,c,u,xc,w,PER,LOC,ORG,TIME,O,n_pos,n_neg,word_count
0,李佳琦是不是在我购物车和收藏夹里装了摄像头 翻到一张要素过多的截图 又买到一条合适舒服的...,1,李佳琦是不是在我购物车和收藏夹里装了摄像头 翻到一张要素过多的截图 又买到一条合适舒服的...,李佳琦 是不是 在 我 购物车 和 收藏夹 里 装 了 摄像头 翻 到 一张 要素 过...,李佳琦 是不是 购物车 收藏夹 里 装 摄像头 翻 一张 要素 过多 截图 买到...,0.1491,0.0118,0.0059,0.0033,0.0802,0.0007,0.0112,0.0,0.0125,0.2838,0.002,0.0085,0.0493,0.0099,0.0059,0.0618,0.0164,0.0026,0.0966,0.0256,0.0204,0.094,0.0171,0.0013,0.0085,0.002,0.0007,0.0191,0.0,0.0894,0.3035,1522
1,生活碎片（1） 我对Yeezy是真的一直爱不起来完全没有想上脚的欲望… 为什么每次别人找...,1,生活碎片 我对 是真的一直爱不起来完全没有想上脚的欲望 为什么每次别人找...,生活 碎片 我 对 是 真 的 一直 爱不起来 完全 没有 想 上 ...,生活 碎片 真 一直 爱不起来 完全 没有 想 脚 欲望 每...,0.1705,0.0127,0.0048,0.002,0.0479,0.0003,0.0054,0.0017,0.0107,0.2849,0.0008,0.0096,0.0558,0.0099,0.0028,0.0873,0.0217,0.0073,0.0682,0.0279,0.0144,0.095,0.0155,0.0008,0.0031,0.0056,0.0014,0.0321,0.0,0.0882,0.3364,3549
2,机票又便宜了我不管不管人事调动多频繁 会不会影响到我出去玩的计划6月我该去成都还去哈哈哈哈哈...,1,机票又便宜了我不管不管人事调动多频繁 会不会影响到我出去玩的计划 月我该去成都还去哈哈哈哈哈...,机票 又 便宜 了 我 不管 不管 人事 调动 多 频繁 会不会 影响 到 我 出去玩 ...,机票 便宜 人事 调动 频繁 会不会 影响 出去玩 计划 成都 还去 人事变动 频...,0.1364,0.0128,0.0059,0.0037,0.0302,0.0,0.0017,0.0,0.0044,0.3073,0.0004,0.0068,0.0533,0.0059,0.0033,0.0966,0.0154,0.0048,0.1046,0.0233,0.0283,0.0931,0.0292,0.0017,0.0065,0.0039,0.0004,0.02,0.0,0.0555,0.298,5402
3,抑郁症 最近做梦总是在挨骂，被亲近的人骂被不亲近的人骂，被认识的人骂被不认识的人骂，我记不清...,1,抑郁症 最近做梦总是在挨骂 被亲近的人骂被不亲近的人骂 被认识的人骂被不认识的人骂 我记不清...,抑郁症 最近 做梦 总是 在 挨骂 被 亲近 的 人 骂 被 不 亲近 的 人 骂 ...,抑郁症 最近 做梦 总是 挨骂 亲近 骂 亲近 骂 认识 骂 认识 骂 记 ...,0.1691,0.0,0.0048,0.0048,0.029,0.0,0.0,0.0,0.029,0.3333,0.0,0.0145,0.0242,0.0048,0.0,0.087,0.0242,0.0,0.087,0.0242,0.0,0.0676,0.029,0.0048,0.0386,0.0,0.0,0.0242,0.0,0.1401,0.2705,207
4,#三星堆遗址考古重大发现# 啊哈哈哈哈哈 三星堆发现新的文物 直接把三叔干上了热门 你们是存...,1,三星堆遗址考古重大发现 啊哈哈哈哈哈 三星堆发现新的文物 直接把三叔干上了热门 你们是存...,三星堆遗址 考古 重大 发现 啊 哈哈 哈哈 哈 三星堆 发现 新 的 文物 直...,三星堆遗址 考古 重大 发现 三星堆 发现 新 文物 直接 三叔 干 热门 存...,0.1474,0.0147,0.0028,0.0022,0.0592,0.0,0.0081,0.0016,0.0197,0.2607,0.0006,0.0056,0.0523,0.005,0.0019,0.0551,0.0166,0.0053,0.0977,0.0194,0.011,0.1139,0.041,0.0009,0.0354,0.0022,0.0034,0.0163,0.0,0.0895,0.2776,3195


In [136]:
test_2_depressed.shape

(100, 37)

In [137]:
#Creating a dict of the best models
best_model_dict = {'Logistic Regression': LogisticRegression(solver='liblinear', C=0.1, penalty='l2', random_state=3),
              'AdaBoost': AdaBoostClassifier(n_estimators=500, random_state=3),
              'XGBoost': XGBClassifier(learning_rate=0.5, n_estimater=75, max_depth=3, objective='binary:logistic', random_state=3)}

X_new = tfidf_vect.transform(test_2_depressed['tweets_clean'].astype('str')) # text features
X_fe = test_2_depressed[fe_cols].to_numpy()
X_new = hstack([X_new, X_fe])
y_new = test_2_depressed['label'].values # target

#Function to get the scores for each model in a df
def model_score_df(model_dict):   
    model_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []
    for k,v in model_dict.items():   
        model_name.append(k)
        v.fit(X, y)
        y_pred = v.predict(X_new)
        ac_score_list.append(accuracy_score(y_new, y_pred))
        p_score_list.append(precision_score(y_new, y_pred, average='macro'))
        r_score_list.append(recall_score(y_new, y_pred, average='macro'))
        f1_score_list.append(f1_score(y_new, y_pred, average='macro'))
        model_comparison_df = pd.DataFrame([model_name, ac_score_list, p_score_list, r_score_list, f1_score_list]).T
        model_comparison_df.columns = ['model_name', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score']
        model_comparison_df = model_comparison_df.sort_values(by='f1_score', ascending=False)
    return model_comparison_df

model_score_df(best_model_dict)

Unnamed: 0,model_name,accuracy_score,precision_score,recall_score,f1_score
1,AdaBoost,0.62,0.5,0.31,0.3827
2,XGBoost,0.61,0.5,0.305,0.3789
0,Logistic Regression,0.46,0.5,0.23,0.3151
