### IMPORT NECESSARY LIBRARIES/FRAMEWORKS/DEPENDENCIES

In [3]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.42.0-py3-none-any.whl (191 kB)
[K     |████████████████████████████████| 191 kB 3.6 MB/s eta 0:00:01
Collecting seqeval
  Downloading seqeval-0.0.12.tar.gz (21 kB)
Building wheels for collected packages: seqeval


  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-0.0.12-py3-none-any.whl size=7423 sha256=6376c2d35ebf130a2f56a6c7f54abc1997bb3c3e311faaa864fb9dafa9ffd134
  Stored in directory: /root/.cache/pip/wheels/dc/cc/62/a3b81f92d35a80e39eb9b2a9d8b31abac54c02b21b2d466edc
Successfully built seqeval
Installing collected packages: seqeval, simpletransformers
Successfully installed seqeval-0.0.12 simpletransformers-0.42.0


In [11]:
import numpy as np
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import nltk
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
wn = nltk.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

from sklearn.metrics import *
from sklearn.model_selection import *

from tqdm import tqdm
import warnings
warnings.simplefilter('ignore')
import gc
from scipy.special import softmax

## Load the Dataset

In [12]:
train=pd.read_csv('../input/Train.csv')
test=pd.read_csv('../input/Test.csv')
sample=pd.read_csv('../input/SampleSubmission.csv')

In [13]:
train.columns

Index(['ID', 'text', 'label'], dtype='object')

Things to note

- ID is the unique identifier for each text.
- label is the column that contains the target variable (Depression , Drugs, Suicide, Alcohol)
- tweet is the column that contains the text to be analyzed.

In [14]:
train.head()

Unnamed: 0,ID,text,label
0,SUAVK39Z,I feel that it was better I dieAm happy,Depression
1,9JDAGUV3,Why do I get hallucinations?,Drugs
2,419WR1LQ,I am stresseed due to lack of financial suppor...,Depression
3,6UY7DX6Q,Why is life important?,Suicide
4,FYC0FTFB,How could I be helped to go through the depres...,Depression


## Before we began analysis; lets merge all dataset together

In [15]:
data=pd.concat([train,test],sort=False).reset_index(drop=True)

# DATA ANALYSIS

In [16]:
#Lets write a simple fxn to generate random color for our plots 
def random_colors(no_of_colors):
    colors = []
    for i in range(no_of_colors):
        colors.append("#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)]))
    return colors

In [17]:
#The size of the train and test data - ( Rows , Columns )
print (train.shape, test.shape, data.shape)

(616, 3) (309, 2) (925, 3)


#### Comment: Lets check for the uniqueness of the ID whether each ID appears once, twice or more; if it appears once that means such ID which depicts a person; how often such person intent occur when collecting the data.

In [18]:
print(train['ID'].nunique(), train.shape)

616 (616, 3)


#### Comment: Cool since we have 616 unique ID across all observations; definitely an ID appear once that is each text extracted was just for every single user (in one word a text data per person)

In [19]:
train['label'].unique()

array(['Depression', 'Drugs', 'Suicide', 'Alcohol'], dtype=object)

### Comments: there exist 4 labels to be predicted; whether a person's text is giving a relation to any of the above listed array, Lets account for variation of the label whether their is imbalance in the dataset or not

In [20]:
temp = train.groupby('label').count()['text'].reset_index().sort_values(by='text',ascending=False)
temp.style.background_gradient(cmap='Reds')

Unnamed: 0,label,text
1,Depression,352
0,Alcohol,140
3,Suicide,66
2,Drugs,58


### A visualization would be better to communicate the result

In [21]:
fig = go.Figure(go.Funnelarea(
    text =temp.label,
    values = temp.text,
    title = {"position": "top center", "text": "Funnel-Chart of Sentiment Distribution"}
    ))
fig.show()

#### Comments:High Variation of Depression to Drugs and Sucide occurs in the data (Imbalance data)

# lets review words with different label

# Depression

In [22]:
train[train['label'] == 'Depression']['text'].values[:5]

array(['I feel that it was better I dieAm happy',
       'I am stresseed due to lack of financial support in school',
       'How could I be helped to go through the depression?',
       'What are the effects of depression to ones health',
       'Why is everything so hard to deal with in this life'],
      dtype=object)

# Drugs

In [23]:
train[train['label'] == 'Drugs']['text'].values[:5]

array(['Why do I get hallucinations?',
       'Is heaven open for us who smoke bhang?',
       'Does mediataton help stop weed addiction?',
       'What are the effects of smoking bhang?',
       'How could i stop using bhang'], dtype=object)

# Suicide

In [24]:
train[train['label'] == 'Suicide']['text'].values[:5]

array(['Why is life important?', 'Causes of suicide by youths',
       'How to deal and overcome bad situation',
       'Will Prayer help me out of sucidal thoughts.',
       'Is there self worth in life?'], dtype=object)

# Alcohol

In [25]:
train[train['label'] == 'Alcohol']['text'].values[:5]

array(['How to manage taking alcohol?',
       'How do i stop this? What do I do when life becomes unbearable?',
       'How can I stop using alcohol?',
       'Effect of alcohol both in the society and my body',
       'How will I stop? What addiction means.'], dtype=object)

## Let's have a close look at some of the text data we have.

In [26]:
for no, text in enumerate(data["text"][3:8]):
    print(no+1,".",text)

1 . Why is life important?
2 . How could I be helped to go through the depression?
3 . What are the effects of depression to ones health
4 . Why is everything so hard to deal with in this life
5 .  I feel emotionally overwhelmed 


### Comments: mere lookingg at this we can see some stop words, symbols etc; instead of querying the whole data for some noise we all rather handle all noise in the dataset

### Cleaning the Corpus
Now Before We Dive into extracting information out of words in text and selected text,let's first clean the data
By doing the following

1. Covert all text to lowercase, 
2. remove text in square brackets,
3. remove links and remove punctuation
4. remove words containing numbers.

In [27]:
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100
data['punct%'] = data['text'].apply(lambda x: count_punct(x))

In [28]:
#Clean text from noise
def clean_text(text):
    #Filter to allow only alphabets
    text = re.sub(r'[^a-zA-Z\']', ' ', text)
    
    #Remove Unicode characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    
    #Convert to lowercase to maintain consistency
    text = text.lower()
       
    return text

In [29]:
data['text'] = data['text'].apply(lambda x:clean_text(x))

In [30]:
def stop_words(text):
    text = "".join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text

data['text_nostop_words'] = data['text'].apply(stop_words)

In [31]:
data.head()

Unnamed: 0,ID,text,label,punct%,text_nostop_words
0,SUAVK39Z,i feel that it was better i dieam happy,Depression,0.0,"[feel, better, dieam, happy]"
1,9JDAGUV3,why do i get hallucinations,Drugs,4.2,"[get, hallucinations, ]"
2,419WR1LQ,i am stresseed due to lack of financial suppor...,Depression,0.0,"[stresseed, due, lack, financial, support, sch..."
3,6UY7DX6Q,why is life important,Suicide,5.3,"[life, important, ]"
4,FYC0FTFB,how could i be helped to go through the depres...,Depression,2.4,"[could, helped, go, depression, ]"


In [32]:
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

data['text_lemmatized'] = data['text_nostop_words'].apply(lambda x: lemmatizing(x))

In [33]:
#Rejoin words - rejoin the meaningful stemmed words into a single string
def rejoin_words(row):
    my_list=row['text_lemmatized']
    joined_words =(" ".join(my_list))
    return joined_words
data['processed'] =data.apply(rejoin_words,axis=1)

In [34]:
data.head()

Unnamed: 0,ID,text,label,punct%,text_nostop_words,text_lemmatized,processed
0,SUAVK39Z,i feel that it was better i dieam happy,Depression,0.0,"[feel, better, dieam, happy]","[feel, better, dieam, happy]",feel better dieam happy
1,9JDAGUV3,why do i get hallucinations,Drugs,4.2,"[get, hallucinations, ]","[get, hallucination, ]",get hallucination
2,419WR1LQ,i am stresseed due to lack of financial suppor...,Depression,0.0,"[stresseed, due, lack, financial, support, sch...","[stresseed, due, lack, financial, support, sch...",stresseed due lack financial support school
3,6UY7DX6Q,why is life important,Suicide,5.3,"[life, important, ]","[life, important, ]",life important
4,FYC0FTFB,how could i be helped to go through the depres...,Depression,2.4,"[could, helped, go, depression, ]","[could, helped, go, depression, ]",could helped go depression


In [35]:
data['no_of_word_list'] = data['processed'].apply(lambda x:str(x).split())
top = Counter([item for sublist in data['no_of_word_list'] for item in sublist])
temp = pd.DataFrame(top.most_common(10))
temp.columns = ['Common_words','count']
temp.style.background_gradient(cmap='Greens')

Unnamed: 0,Common_words,count
0,feel,211
1,alcohol,129
2,life,78
3,depression,75
4,stop,75
5,like,61
6,feeling,46
7,taking,43
8,depressed,39
9,effect,38


In [36]:
fig = px.treemap(temp, path=['Common_words'], values='count',title='Tree of Most Common Words')
fig.show()

In [37]:
def count_text(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['text_length'] = data['processed'].apply(lambda x: len(x) - x.count(" "))

In [38]:
#Exhaustive list of stopwords in the english language. We want to focus less on these so at some point will have to filter
STOP_WORDS = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'also', 'am', 'an', 'and',
              'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below',
              'between', 'both', 'but', 'by', 'can', "can't", 'cannot', 'com', 'could', "couldn't", 'did',
              "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'else', 'ever',
              'few', 'for', 'from', 'further', 'get', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having',
              'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how',
              "how's", 'however', 'http', 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it',
              "it's", 'its', 'itself', 'just', 'k', "let's", 'like', 'me', 'more', 'most', "mustn't", 'my', 'myself',
              'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'otherwise', 'ought', 'our', 'ours',
              'ourselves', 'out', 'over', 'own', 'r', 'same', 'shall', "shan't", 'she', "she'd", "she'll", "she's",
              'should', "shouldn't", 'since', 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs',
              'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", "they'll", "they're",
              "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't",
              'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
              "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would', "wouldn't",
              'www', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']

In [39]:
#Check whether a negation term is present in the text
def any_neg(words):
    for word in words:
        if word in ['n', 'no', 'non', 'not'] or re.search(r"\wn't", word):
            return 1
    else:
        return 0

#Check whether one of the 100 rare words is present in the text
def any_rare(words, rare_100):
    for word in words:
        if word in rare_100:
            return 1
    else:
        return 0

#Check whether prompt words are present
def is_question(words):
    for word in words:
        if word in ['when', 'what', 'how', 'why', 'who']:
            return 1
    else:
        return 0

In [40]:
#Generate word frequency
def gen_freq(text):
    #Will store the list of words
    word_list = []

    #Loop over all the tweets and extract words into word_list
    for tw_words in text.split():
        word_list.extend(tw_words)

    #Create word frequencies using word_list
    word_freq = pd.Series(word_list).value_counts()
    
    #Drop the stopwords during the frequency calculation
    word_freq = word_freq.drop(STOP_WORDS, errors='ignore')
    
    return word_freq

In [41]:
word_freq = gen_freq(data.processed.str)
#100 most rare words in the dataset
rare_100 = word_freq[-100:]
#Negation present or not
data['any_neg'] = data.text.str.split().apply(lambda x: any_neg(x))
#Prompt present or not
data['is_question'] = data.text.str.split().apply(lambda x: is_question(x))
#Any of the most 100 rare words present or not
data['any_rare'] = data.processed.str.split().apply(lambda x: any_rare(x, rare_100))

In [42]:
data['is_question'].value_counts()

0    487
1    438
Name: is_question, dtype: int64

In [43]:
train=data[data.label.notnull()].reset_index(drop=True)
test=data[data.label.isna()].reset_index(drop=True)

In [44]:
train.head()

Unnamed: 0,ID,text,label,punct%,text_nostop_words,text_lemmatized,processed,no_of_word_list,text_length,any_neg,is_question,any_rare
0,SUAVK39Z,i feel that it was better i dieam happy,Depression,0.0,"[feel, better, dieam, happy]","[feel, better, dieam, happy]",feel better dieam happy,"[feel, better, dieam, happy]",20,0,0,0
1,9JDAGUV3,why do i get hallucinations,Drugs,4.2,"[get, hallucinations, ]","[get, hallucination, ]",get hallucination,"[get, hallucination]",16,0,1,0
2,419WR1LQ,i am stresseed due to lack of financial suppor...,Depression,0.0,"[stresseed, due, lack, financial, support, sch...","[stresseed, due, lack, financial, support, sch...",stresseed due lack financial support school,"[stresseed, due, lack, financial, support, sch...",38,0,0,0
3,6UY7DX6Q,why is life important,Suicide,5.3,"[life, important, ]","[life, important, ]",life important,"[life, important]",13,0,1,0
4,FYC0FTFB,how could i be helped to go through the depres...,Depression,2.4,"[could, helped, go, depression, ]","[could, helped, go, depression, ]",could helped go depression,"[could, helped, go, depression]",23,0,1,0


In [45]:
Depression = train[train['label']=='Depression']
Alcohol = train[train['label']=='Alcohol']
Suicide = train[train['label']=='Suicide']
Drugs = train[train['label']=='Drugs']

In [46]:
#MosT common Depression words
top = Counter([item for sublist in Depression['no_of_word_list'] for item in sublist])
temp_depression = pd.DataFrame(top.most_common(10))
temp_depression.columns = ['Common_words','count']
temp_depression.style.background_gradient(cmap='Greens')

Unnamed: 0,Common_words,count
0,feel,138
1,depression,51
2,like,37
3,life,28
4,feeling,27
5,depressed,27
6,sad,21
7,low,21
8,better,16
9,get,16


In [47]:
#MosT common Alcohol words
top = Counter([item for sublist in Alcohol['no_of_word_list'] for item in sublist])
temp_Alcohol = pd.DataFrame(top.most_common(10))
temp_Alcohol.columns = ['Common_words','count']
temp_Alcohol.style.background_gradient(cmap='Greens')

Unnamed: 0,Common_words,count
0,alcohol,83
1,stop,41
2,taking,17
3,alcoholism,12
4,avoid,12
5,drinking,11
6,effect,10
7,life,8
8,using,7
9,addiction,7


In [48]:
#MosT common Suicide words
top = Counter([item for sublist in Suicide['no_of_word_list'] for item in sublist])
temp_Suicide = pd.DataFrame(top.most_common(10))
temp_Suicide.columns = ['Common_words','count']
temp_Suicide.style.background_gradient(cmap='Greens')

Unnamed: 0,Common_words,count
0,suicide,18
1,life,12
2,thought,6
3,overcome,5
4,way,5
5,cause,4
6,help,4
7,suicidal,4
8,sought,4
9,assistance,4


In [49]:
#MosT common Drugs words
top = Counter([item for sublist in Drugs['no_of_word_list'] for item in sublist])
temp_Drugs = pd.DataFrame(top.most_common(10))
temp_Drugs.columns = ['Common_words','count']
temp_Drugs.style.background_gradient(cmap='Greens')

Unnamed: 0,Common_words,count
0,bhang,22
1,weed,14
2,stop,12
3,drug,8
4,effect,7
5,smoking,6
6,one,6
7,taking,5
8,smoke,4
9,avoid,4


We can see words like stop, life are common in all three segments; we rather focus on unique wordswith respect to each target...  **By Looking at the Unique Words of each sentiment,we now have much more clarity about the data,these unique words are very strong determiners of Sentiment of tweets**

In [50]:
raw_text = [word for word_list in train['no_of_word_list'] for word in word_list]

In [51]:
def words_unique(label,numwords,raw_words):

    allother = []
    for item in train[train.label != label]['no_of_word_list']:
        for word in item:
            allother .append(word)
    allother  = list(set(allother ))
    
    specificnonly = [x for x in raw_text if x not in allother]
    
    mycounter = Counter()
    
    for item in train[train.label == label]['no_of_word_list']:
        for word in item:
            mycounter[word] += 1
    keep = list(specificnonly)
    
    for word in list(mycounter):
        if word not in keep:
            del mycounter[word]
    
    Unique_words = pd.DataFrame(mycounter.most_common(numwords), columns = ['words','count'])
    
    return Unique_words

In [52]:
Unique_Depression_words= words_unique('Depression', 10, raw_text)
print("The top 20 unique words in Positive Tweets are:")
Unique_Depression_words.style.background_gradient(cmap='Greens')

The top 20 unique words in Positive Tweets are:


Unnamed: 0,words,count
0,depression,51
1,depressed,27
2,sad,21
3,low,21
4,lonely,16
5,alone,15
6,world,10
7,hopeless,8
8,family,8
9,im,7


In [53]:
Unique_Alcohol_words= words_unique('Alcohol', 10, raw_text)
print("The top 20 unique words in Positive Tweets are:")
Unique_Alcohol_words.style.background_gradient(cmap='Greens')

The top 20 unique words in Positive Tweets are:


Unnamed: 0,words,count
0,alcohol,83
1,alcoholism,12
2,drinking,11
3,body,4
4,drink,4
5,addicted,3
6,intake,3
7,habit,3
8,mean,2
9,behaviour,2


In [54]:
Unique_Drugs_words= words_unique('Drugs', 10, raw_text)
print("The top 20 unique words in Positive Tweets are:")
Unique_Drugs_words.style.background_gradient(cmap='Greens')

The top 20 unique words in Positive Tweets are:


Unnamed: 0,words,count
0,bhang,22
1,weed,14
2,smoking,6
3,smoke,4
4,hallucination,2
5,heaven,1
6,open,1
7,u,1
8,mediataton,1
9,advantage,1


In [55]:
Unique_Suicide_words= words_unique('Suicide', 10, raw_text)
print("The top 20 unique words in Positive Tweets are:")
Unique_Suicide_words.style.background_gradient(cmap='Greens')

The top 20 unique words in Positive Tweets are:


Unnamed: 0,words,count
0,sought,4
1,commit,2
2,prayer,1
3,sucidal,1
4,incident,1
5,measure,1
6,withdrawal,1
7,wronged,1
8,accept,1
9,reconcilliation,1


In [56]:
train =train[['ID', 'processed', 'label']]
test =test[['ID','processed']]

In [57]:
train.head()

Unnamed: 0,ID,processed,label
0,SUAVK39Z,feel better dieam happy,Depression
1,9JDAGUV3,get hallucination,Drugs
2,419WR1LQ,stresseed due lack financial support school,Depression
3,6UY7DX6Q,life important,Suicide
4,FYC0FTFB,could helped go depression,Depression


In [58]:
test.head()

Unnamed: 0,ID,processed
0,02V56KMO,overcome bad feeling emotion
1,03BMGTOK,feel like giving life
2,03LZVFM6,depressed feel like got strength continue
3,0EPULUM5,feel low especially since one talk
4,0GM4C5GD,successful drug addict


In [63]:
vect = CountVectorizer()
X_train_dtm=vect.fit_transform(train['processed'])
m=pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names())
m

Unnamed: 0,abandoned,able,absent,abuse,academic,accept,add,addict,addicted,addiction,...,world,worldm,worried,worst,worth,would,wronged,yet,young,youth
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
612,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
613,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
614,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
X_test_dtm=vect.transform(test['processed'])

In [61]:
y_train=train['label'].factorize()

In [None]:
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_validate
import lightgbm as lgb
from lightgbm import LGBMClassifier
model1=xgb.XGBClassifier()
cv_results = cross_validate(model1, X_train_dtm, y_train[0], cv=16,scoring=('accuracy', 'neg_log_loss'),return_train_score=True)

In [1]:
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_validate
import lightgbm as lgb
from lightgbm import LGBMClassifier
model1=xgb.XGBClassifier()
cv_results = cross_validate(model1, X_train_dtm, y_train[0], cv=16,scoring=('accuracy', 'neg_log_loss'),return_train_score=True)

NameError: name 'X_train_dtm' is not defined

In [55]:
tf_vect=TfidfVectorizer()
X_train_tdtm=tf_vect.fit_transform(train['processed'])
X_test_tdtm=tf_vect.transform(test['processed'])

In [56]:
model1=xgb.XGBClassifier(learning_rate=0.01, depth=31)
cv_results_tf = cross_validate(model1, X_train_tdtm, y_train[0], cv=15,scoring=('accuracy', 'neg_log_loss'),return_train_score=True)

Parameters: { depth } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { depth } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { depth } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { depth } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost c

In [58]:
import scipy.sparse
X_train = scipy.sparse.hstack([X_train_dtm, X_train_tdtm])
model_comb=xgb.XGBClassifier()
cv_results_comb = cross_validate(model_comb, X_train, y_train[0], cv=13,scoring=('accuracy', 'neg_log_loss'),return_train_score=True)

In [59]:
X_test = scipy.sparse.hstack([X_test_dtm, X_test_tdtm])

In [60]:
model_comb.fit(X_train,y_train[0])
preds=model_comb.predict_proba(X_test)

In [61]:
sample['Depression']=preds[:,0]
sample['Alcohol']=preds[:,3]
sample['Suicide']=preds[:,2]
sample['Drugs']=preds[:,1]

In [62]:
sample.to_csv('hstackf.csv',index=False)

In [3]:
import pandas as pd
x=pd.read_csv('hstackf.csv')