In [2]:
import numpy as np
import pandas as pd
import sklearn.preprocessing as preprocessing
from sklearn.model_selection import train_test_split

### Load in data

In [5]:
# load in raw data
data = pd.read_csv("../data/raw_data/drug_consumption.data", header=None, delimiter=",")

In [6]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,1,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
1,2,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
2,3,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
3,4,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
4,5,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0


In [7]:
# drop index column
data.drop(0, axis=1, inplace=True)

In [8]:
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,22,23,24,25,26,27,28,29,30,31
0,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
1,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,-0.14277,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
2,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,-1.0145,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
3,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,0.58489,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
4,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,1.30612,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0


In [9]:
data.rename(columns = {1: "age", 2: "gender", 3: "education", 4: "country", 5: "ethnicity", 
                      6: "neuroticism", 7:"extraversion", 8:"openness", 9:"agreeableness",
                      10: "conscienciousness", 11: "impulsiveness", 12: "sensation",
                      13: "alcohol", 14: "amphetamines", 15: "amyl", 16: "benzo", 
                      17: "caffeine", 18:"cannabis", 19:"chocolate", 20:"cocaine", 
                      21:"crack", 22:"ecstasy", 23:"heroin", 24:"ketamine", 25: "legalh",
                      26:"lsd", 27:"meth", 28:"mushrooms", 29:"nicotene", 30:"semer", 31:"vsa"}, inplace=True)

### Condense outcome columns
Label will be 1 if respondent has used a commonly-illegal drug in the past month, and 0 otherwise. 

In [10]:
# note: we should ignore 12 and higher (indexing starting at 0)
drug_use = pd.Series(0 for x in range(len(data)))
drug_dates = ["CL4", "CL5", "CL6"]
for index,row in data.iterrows():
    if row['ecstasy'] in drug_dates:
        drug_use[index] = 1
    elif row['heroin'] in drug_dates:
        drug_use[index] = 1
    elif row['crack'] in drug_dates:
        drug_use[index] = 1
    elif row['meth'] in drug_dates:
        drug_use[index] = 1
    elif row['lsd'] in drug_dates:
        drug_use[index] = 1
    elif row['cocaine'] in drug_dates:
        drug_use[index] = 1
    elif row['mushrooms'] in drug_dates:
        drug_use[index] = 1
    elif row['amphetamines'] in drug_dates:
        drug_use[index] = 1
    elif row["amyl"] in drug_dates:
        drug_use[index] = 1
    elif row["benzo"] in drug_dates:
        drug_use[index] = 1
    elif row["ketamine"] in drug_dates:
        drug_use[index] = 1

In [11]:
data['drug_use'] = drug_use

In [12]:
data.head()

Unnamed: 0,age,gender,education,country,ethnicity,neuroticism,extraversion,openness,agreeableness,conscienciousness,...,heroin,ketamine,legalh,lsd,meth,mushrooms,nicotene,semer,vsa,drug_use
0,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,...,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0,0
1,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,-0.14277,...,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0,1
2,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,-1.0145,...,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0,0
3,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,0.58489,...,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0,0
4,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,1.30612,...,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0,0


In [13]:
# drop unnecessary columns
df_x = data[['age','gender','education','country','ethnicity','neuroticism','extraversion',
                  'openness', 'agreeableness', 'conscienciousness', 'impulsiveness', 'sensation']]

### Create train/test split

In [15]:
x_train, x_test, y_train, y_test = train_test_split(df_x, drug_use, test_size=0.33, random_state=42)

In [16]:
x_train["drug_use"] = y_train
x_test["drug_use"] = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train["drug_use"] = y_train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test["drug_use"] = y_test


In [10]:
# save files
x_train.to_csv("../data/drug_consumption33.data",",",index=False, header=False)
x_test.to_csv("../data/drug_consumption33.test",",",index=False,header=False)

In [12]:
len(x_train)

1262

In [17]:
len(x_test)

623