 ## ***Read and Understanding Dataset***

In [4]:
import pandas as pd

In [5]:
df_Train = pd.read_csv('Train.csv')
df_Train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 616 entries, 0 to 615
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      616 non-null    object
 1   text    616 non-null    object
 2   label   616 non-null    object
dtypes: object(3)
memory usage: 14.6+ KB


In [6]:
df_Train.label.value_counts()

Depression    352
Alcohol       140
Suicide        66
Drugs          58
Name: label, dtype: int64

## ***********************Solving the problem of Imbalanced data***********************

In [7]:
Depression_count, Alcohol_count, Suicide_count, Drugs_count = df_Train['label'].value_counts()

Depression = df_Train[df_Train['label'] == 'Depression']
Alcohol = df_Train[df_Train['label'] == 'Alcohol']
Suicide = df_Train[df_Train['label'] == 'Suicide']
Drugs = df_Train[df_Train['label'] == 'Drugs']

### We try to over-sampling: Suicide, Drugs and Under-Sampling Depression

In [8]:
Depression_under = Depression.sample(Alcohol_count)
Suicide_under = Suicide.sample(Alcohol_count, replace=True)
Drugs_under = Drugs.sample(Alcohol_count, replace=True)

test1 = pd.concat([Depression_under, Alcohol], ignore_index=True, axis=0)
test2 = pd.concat([test1, Suicide_under], ignore_index=True, axis=0)
df_Train = pd.concat([test2, Drugs_under], ignore_index=True, axis=0)

df_Train.label.value_counts()

Depression    140
Alcohol       140
Suicide       140
Drugs         140
Name: label, dtype: int64

## ***********************Data Preparation ***********************

In [9]:
x = df_Train['text'].values.astype('U')
y = df_Train['label']

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(x)

In [12]:
print('data  shape :', x.shape)
print('label  shape :', len(y))

data  shape : (560, 599)
label  shape : 560


In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=1)

X_train.shape

(420, 599)

In [16]:
X_test.shape

(140, 599)

## ***********************Training Data***********************

In [17]:
from sklearn.svm import SVC 

In [18]:
svclassifier = SVC()
svclassifier.fit(X_train, y_train)

SVC()

In [19]:
svclassifier.classes_

array(['Alcohol', 'Depression', 'Drugs', 'Suicide'], dtype=object)

In [20]:
predict = svclassifier.predict(X_test)

## ***********************evaluation of the model***********************

In [21]:
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test, predict))  
print(classification_report(y_test, predict)) 

[[26  3  2  2]
 [ 0 34  1  0]
 [ 0  1 31  0]
 [ 0  5  0 35]]
              precision    recall  f1-score   support

     Alcohol       1.00      0.79      0.88        33
  Depression       0.79      0.97      0.87        35
       Drugs       0.91      0.97      0.94        32
     Suicide       0.95      0.88      0.91        40

    accuracy                           0.90       140
   macro avg       0.91      0.90      0.90       140
weighted avg       0.91      0.90      0.90       140



## ***********************Testing***********************

In [23]:
data = pd.DataFrame({'ID': df_Train.ID[420:560],'Prediction': predict})

l = ['Alcohol', 'Depression', 'Drugs', 'Suicide']

In [24]:
data['Prediction'] = data.Prediction.astype('category')

Y = data.Prediction.cat.codes

data['Prediction'] = Y

In [None]:
from keras.utils import to_categorical