# Data pre-processing

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# Column names in file order: respondent ID, demographic fields, personality
# scores, then one column per substance.
column_names = [
    'ID', 'Age', 'Gender', 'Education', 'Country', 'Ethnicity',
    'Nscore', 'Escore', 'Oscore', 'Ascore', 'Cscore', 'Impulsive', 'SS',
    'Alcohol', 'Amphet', 'Amyl', 'Benzos', 'Caff', 'Cannabis', 'Choc',
    'Coke', 'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'Legalh', 'LSD',
    'Meth', 'Mushrooms', 'Nicotine', 'Semer', 'VSA',
]

# The raw file is read without a header row, so names are attached afterwards.
df = pd.read_csv('../drug_consumption.data', header=None)
df.columns = column_names
df = df.set_index('ID')
df.head()

Unnamed: 0_level_0,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,Cscore,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
2,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,-0.14277,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
3,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,-1.0145,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
4,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,0.58489,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
5,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,1.30612,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0


# Information captured by the features that we decided to keep

## General features:
- Age: age groups
- F: 1 if the respondent is a woman, 0 otherwise
- M: 1 if the respondent is a man, 0 otherwise
- Education: level of education

## Personality features:
- Nscore: neuroticism (bad mood, bad feeling)
- Escore: extraversion (outgoing, talkative and energetic behavior)
- Oscore: openness to experience (fantasy, aesthetic sensitivity, attentiveness to inner feelings, preference for variety,  and intellectual curiosity)
- Ascore: agreeableness (kind, sympathetic, cooperative, warm, and considerate)
- Cscore: conscientiousness (do a task well, and take obligations to others seriously)
- Impulsive: impulsiveness (little or no forethought, reflection, or consideration of the consequences)
- SS: sensation seeking (search for experiences and feelings, that are "varied, novel, complex and intense")

## Drug consumption features:
-  Heroin: heroin consumption

## Encoding of the features
To make the values more meaningful to the algorithm

In [3]:
# for age
# The raw column holds one float code per age group. The keys below are those
# codes rounded to 3 decimals, so the column is rounded the same way before
# the exact-match lookup. Groups are numbered 1..6 in ascending code order.
age_codes = {-0.952: 1, -0.079: 2, 0.498: 3, 1.094: 4, 1.822: 5, 2.592: 6}
age_encoded = df['Age'].round(3).replace(age_codes)

# Fail loudly on any value that is not a known code; otherwise the integer
# cast below would silently truncate it.
unexpected_ages = age_encoded[~age_encoded.isin(list(age_codes.values()))]
if not unexpected_ages.empty:
    raise ValueError(
        f"Unexpected values in 'Age': {sorted(unexpected_ages.unique())}"
    )

# Builtin int instead of np.int: the alias was deprecated in NumPy 1.20 and
# removed in NumPy 1.24.
df['Age'] = age_encoded.astype(int)

In [4]:
# for gender
# Rounded raw codes: 0.482 is relabelled "F" and -0.482 is relabelled "M".
df['Gender'] = df['Gender'].round(3).replace({0.482: "F", -0.482: "M"})

# One indicator column per label. dtype is pinned to uint8 (the historical
# default) because pandas >= 2.0 would otherwise return True/False columns
# instead of 0/1.
Gend = pd.get_dummies(df["Gender"], dtype=np.uint8)

In [5]:
# for education
# The raw column holds one float code per education level. The keys below are
# those codes rounded to 3 decimals, so the column is rounded the same way
# before the exact-match lookup. Levels are numbered 1..9 in ascending code
# order.
education_codes = {
    -2.436: 1, -1.738: 2, -1.437: 3, -1.228: 4, -0.611: 5,
    -0.059: 6, 0.455: 7, 1.164: 8, 1.984: 9,
}
education_encoded = df['Education'].round(3).replace(education_codes)

# Fail loudly on any value that is not a known code; otherwise the integer
# cast below would silently truncate it.
unexpected_levels = education_encoded[
    ~education_encoded.isin(list(education_codes.values()))
]
if not unexpected_levels.empty:
    raise ValueError(
        f"Unexpected values in 'Education': {sorted(unexpected_levels.unique())}"
    )

# Builtin int instead of np.int: the alias was deprecated in NumPy 1.20 and
# removed in NumPy 1.24.
df['Education'] = education_encoded.astype(int)

In [6]:
# personality features
# For each score column, the sorted distinct float values found in the data
# are paired, in order, with the ascending integer scores listed here.
true_scores = {
    'Nscore': list(range(12, 61)),
    'Escore': [16,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,58,59],
    'Oscore': [24,26,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60],
    'Ascore': [12,16,18,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60],
    'Cscore': [17,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,59],
}

for score_column, true_values in true_scores.items():
    distinct_values = np.sort(df[score_column].unique())

    # zip() stops at the shorter sequence, so a count mismatch would silently
    # shift or drop part of the mapping; refuse to continue instead.
    if len(distinct_values) != len(true_values):
        raise ValueError(
            f"'{score_column}' has {len(distinct_values)} distinct values "
            f"but {len(true_values)} true scores were provided"
        )

    decoding = dict(zip(distinct_values, true_values))
    # Builtin int instead of np.int: the alias was deprecated in NumPy 1.20
    # and removed in NumPy 1.24.
    df[score_column] = df[score_column].replace(decoding).astype(int)

In [7]:
# Preview the first five rows after the encoding steps above.
df.head(n=5)

Unnamed: 0_level_0,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,Cscore,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,F,6,0.96082,0.126,39,36,42,37,42,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
2,2,M,9,0.96082,-0.31685,29,52,55,48,41,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
3,3,M,6,0.96082,-0.31685,31,45,40,32,34,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
4,1,F,8,0.96082,-0.31685,34,34,46,47,46,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
5,3,F,9,0.96082,-0.31685,43,28,43,41,50,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0


In [8]:
# change category to number: 0 if no drug consumption (CL0), 1 otherwise
dict2 = {
    "CL0":0,"CL1":1,"CL2":1, "CL3":1, "CL4":1, "CL5":1, "CL6":1
}
drugs = [
    'Alcohol', 'Amphet', 'Amyl', 'Benzos', 'Caff','Cannabis', 'Choc', 'Coke', 'Crack', 'Ecstasy',
    'Heroin', 'Ketamine', 'Legalh', 'LSD', 'Meth','Mushrooms', 'Nicotine', 'Semer', 'VSA']

for drug in drugs:
    # The explicit cast guarantees an integer column. Without it the result
    # dtype depends on replace() downcasting the string column on its own,
    # which pandas 2.2 deprecates. The cast also raises if any label was left
    # unmapped.
    df[drug] = df[drug].replace(dict2).astype(int)

# Remove columns that are not relevant to our case

In [9]:
# Columns removed from the working frame, in the original deletion order.
unused_columns = [
    'Country',    # not well distributed
    'Ethnicity',  # same
    "Ecstasy",
    'Ketamine',
    'Legalh',
    'LSD',
    'Meth',
    'Semer',
    'VSA',
    'Coke',
    'Crack',
    'Cannabis',
    'Amyl',
    'Benzos',
    'Amphet',
]
for unused_column in unused_columns:
    del df[unused_column]

In [10]:
# Second batch of removals: five more substance columns, plus the 'Gender'
# label column, whose indicator columns are held separately in `Gend`.
more_unused_columns = ['Alcohol', 'Caff', 'Choc', 'Nicotine', 'Mushrooms', 'Gender']
for unused_column in more_unused_columns:
    del df[unused_column]

In [11]:
# Append the gender indicator columns (F, M) side by side, aligned on the index.
df = pd.concat([df, Gend], axis='columns')


In [12]:
# Preview the first five rows of the final feature table.
df.head(n=5)

Unnamed: 0_level_0,Age,Education,Nscore,Escore,Oscore,Ascore,Cscore,Impulsive,SS,Heroin,F,M
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,3,6,39,36,42,37,42,-0.21712,-1.18084,0,1,0
2,2,9,29,52,55,48,41,-0.71126,-0.21575,0,0,1
3,3,6,31,45,40,32,34,-1.37983,0.40148,0,0,1
4,1,8,34,34,46,47,46,-1.37983,-1.18084,0,1,0
5,3,9,43,28,43,41,50,-0.21712,-0.21575,0,1,0


In [13]:
# Persist the processed frame to the current working directory.
pickle_path = 'df.pickle'
df.to_pickle(pickle_path)