In [1]:
#importing python modules/libraries
import pandas as panda
import numpy as nump
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [65]:
dataset=panda.read_csv("who_suicide.csv")
dataset.head(10)

Unnamed: 0,country,year,sex,age,suicides_no,population
0,Albania,1985,female,15-24 years,,277900.0
1,Albania,1985,female,25-34 years,,246800.0
2,Albania,1985,female,35-54 years,,267500.0
3,Albania,1985,female,5-14 years,,298300.0
4,Albania,1985,female,55-74 years,,138700.0
5,Albania,1985,female,75+ years,,34200.0
6,Albania,1985,male,15-24 years,,301400.0
7,Albania,1985,male,25-34 years,,264200.0
8,Albania,1985,male,35-54 years,,296700.0
9,Albania,1985,male,5-14 years,,325800.0


In [66]:
#finding total number of null values in attributes of dataset
print(dataset.isnull().sum())

country           0
year              0
sex               0
age               0
suicides_no    2256
population     5460
dtype: int64


In [67]:
print(dataset.count())


country        43776
year           43776
sex            43776
age            43776
suicides_no    41520
population     38316
dtype: int64


In [68]:
#removing null value rows from dataset and again checking number of null rows in dataset
dataset.dropna(inplace=True)
print(dataset.isnull().sum())

country        0
year           0
sex            0
age            0
suicides_no    0
population     0
dtype: int64


In [69]:
print(dataset.head(10))
print(dataset.count())

    country  year     sex          age  suicides_no  population
24  Albania  1987  female  15-24 years         14.0    289700.0
25  Albania  1987  female  25-34 years          4.0    257200.0
26  Albania  1987  female  35-54 years          6.0    278800.0
27  Albania  1987  female   5-14 years          0.0    311000.0
28  Albania  1987  female  55-74 years          0.0    144600.0
29  Albania  1987  female    75+ years          1.0     35600.0
30  Albania  1987    male  15-24 years         21.0    312900.0
31  Albania  1987    male  25-34 years          9.0    274300.0
32  Albania  1987    male  35-54 years         16.0    308000.0
33  Albania  1987    male   5-14 years          0.0    338200.0
country        36060
year           36060
sex            36060
age            36060
suicides_no    36060
population     36060
dtype: int64


In [70]:
#printing information of dataset
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36060 entries, 24 to 43763
Data columns (total 6 columns):
country        36060 non-null object
year           36060 non-null int64
sex            36060 non-null object
age            36060 non-null object
suicides_no    36060 non-null float64
population     36060 non-null float64
dtypes: float64(2), int64(1), object(3)
memory usage: 1.9+ MB


In [71]:
#checking Age ranges
print(dataset.age.unique())

['15-24 years' '25-34 years' '35-54 years' '5-14 years' '55-74 years'
 '75+ years']


In [72]:
dataset.describe()

Unnamed: 0,year,suicides_no,population
count,36060.0,36060.0,36060.0
mean,1998.935441,221.809956,1699996.0
std,10.163883,855.449442,3697811.0
min,1979.0,0.0,259.0
25%,1991.0,2.0,80566.0
50%,2000.0,21.0,375765.0
75%,2008.0,116.0,1344900.0
max,2016.0,22338.0,43805210.0


In [76]:
#Preprocessing, Converting AGE Groups into integer values and Gender ~ Female=0,Male=1 for model testing and predictions
mydata=dataset
mydata=mydata.drop(['country'],axis=1)#removed country column from 'mydata'
#replacing age groups with integer values
mydata['age']=mydata['age'].replace('5-14 years',0)
mydata['age']=mydata['age'].replace('15-24 years',1)
mydata['age']=mydata['age'].replace('25-34 years',2)
mydata['age']=mydata['age'].replace('35-54 years',3)
mydata['age']=mydata['age'].replace('55-74 years',4)
mydata['age']=mydata['age'].replace('75+ years',5)
mydata['sex']=mydata['sex'].replace('female',0)
mydata['sex']=mydata['sex'].replace('male',1)

#adding two more columns for model training and testing

mydata['suicides/100k_population']=(mydata.suicides_no/mydata.population)/100000
mydata['fatality_rate']=nump.where(mydata['suicides/100k_population']>mydata['suicides/100k_population'].mean(),0,1)

In [86]:
#Dividing dataset into training and testing sets
X = nump.array(mydata.drop(['sex', 'suicides_no'], 1))
y=nump.array(mydata.age)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

from sklearn import utils
from sklearn import preprocessing
from sklearn import metrics
#Encoing labels for continous and multi class output
label_enc=preprocessing.LabelEncoder()
trs=label_enc.fit_transform(y_train)


print("Shape of x_train: ",X_train.shape)
print("Shape of y_train: ",y_train.shape)
print("Shape of x_test: ",X_test.shape)
print("Shape of y_test: ",y_test.shape)

Shape of x_train:  (27045, 5)
Shape of y_train:  (27045,)
Shape of x_test:  (9015, 5)
Shape of y_test:  (9015,)


In [98]:
#checking RandomForestClassifier Model and accuracy
from sklearn.ensemble import RandomForestClassifier
mod=RandomForestClassifier(n_estimators=100,random_state=42)
mod.fit(X_train,trs)
mod.predict(X_test)
print("Random Forest accuracy is: {:.2f}%".format(accuracy_score(y_test,mod.predict(X_test))*100))

Random Forest accuracy is: 100.00%


In [100]:
#Checking DecisionTreeClassifier Model and its accuracy
from sklearn.tree import DecisionTreeClassifier
model=DecisionTreeClassifier()
model.fit(X_train,trs)
model.predict(X_test)

print("Decision Tree accuracy is : {:.2f}%".format(accuracy_score(y_test,model.predict(X_test))*100))

Decision Tree accuracy is : 100.00%


In [102]:
#Import knearest neighbors Classifier model
from sklearn.neighbors import KNeighborsClassifier


model1=KNeighborsClassifier(n_neighbors=2)
model1.fit(X_train,trs)
model1.predict(X_test)

print("KNN Accuracy : {:.2f}%".format(accuracy_score(y_test,model1.predict(X_test))*100))

KNN Accuracy : 22.85%


In [90]:
from sklearn.metrics import classification_report

#classification report KNNClassifier

print(classification_report(model1.predict(X_test),y_test))

              precision    recall  f1-score   support

           0       0.36      0.20      0.26      2709
           1       0.29      0.20      0.24      2156
           2       0.24      0.21      0.22      1697
           3       0.19      0.24      0.21      1236
           4       0.14      0.26      0.18       791
           5       0.15      0.50      0.23       426

    accuracy                           0.23      9015
   macro avg       0.23      0.27      0.22      9015
weighted avg       0.27      0.23      0.23      9015



In [91]:
from sklearn.metrics import classification_report

#classification report DecisionTreeClassifier

print(classification_report(mod.predict(X_test),y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1513
           1       1.00      1.00      1.00      1489
           2       1.00      1.00      1.00      1505
           3       1.00      1.00      1.00      1567
           4       1.00      1.00      1.00      1485
           5       1.00      1.00      1.00      1456

    accuracy                           1.00      9015
   macro avg       1.00      1.00      1.00      9015
weighted avg       1.00      1.00      1.00      9015



In [92]:
#classification report RandomForest

print(classification_report(model.predict(X_test),y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1513
           1       1.00      1.00      1.00      1489
           2       1.00      1.00      1.00      1505
           3       1.00      1.00      1.00      1567
           4       1.00      1.00      1.00      1485
           5       1.00      1.00      1.00      1456

    accuracy                           1.00      9015
   macro avg       1.00      1.00      1.00      9015
weighted avg       1.00      1.00      1.00      9015

