# Data Cleaning

In [1]:
# imports and settings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Download the dataset registered under id 373 in the UCI ML repository.
drug_consumption = fetch_ucirepo(id=373)

# Display the feature table (one row per participant).
drug_consumption.data.features

Unnamed: 0,age,gender,education,country,ethnicity,nscore,escore,oscore,ascore,cscore,impuslive,ss
0,0.49788,0.48246,-0.05921,0.96082,0.12600,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,-0.21712,-1.18084
1,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,-0.14277,-0.71126,-0.21575
2,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.62090,-1.01450,-1.37983,0.40148
3,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,0.58489,-1.37983,-1.18084
4,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.63340,-0.45174,-0.30172,1.30612,-0.21712,-0.21575
...,...,...,...,...,...,...,...,...,...,...,...,...
1880,-0.95197,0.48246,-0.61113,-0.57009,-0.31685,-1.19430,1.74091,1.88511,0.76096,-1.13788,0.88113,1.92173
1881,-0.95197,-0.48246,-0.61113,-0.57009,-0.31685,-0.24649,1.74091,0.58331,0.76096,-1.51840,0.88113,0.76540
1882,-0.07854,0.48246,0.45468,-0.57009,-0.31685,1.13281,-1.37639,-1.27553,-1.77200,-1.38502,0.52975,-0.52593
1883,-0.95197,0.48246,-0.61113,-0.57009,-0.31685,0.91093,-1.92173,0.29338,-1.62090,-2.57309,1.29221,1.22470


We imported the dataset following the directions on the UCI database website, which load it as pandas data frames. This created a data frame of features, shown above, which holds the demographic and personality data about each participant. It also created a data frame of targets, shown below, which holds the drug-use data for each participant.

In [3]:
# Display the target table (one column per substance).
drug_consumption.data.targets

Unnamed: 0,alcohol,amphet,amyl,benzos,caff,cannabis,choc,coke,crack,ecstasy,heroin,ketamine,legalh,lsd,meth,mushrooms,nicotine,semer,vsa
0,CL5,CL2,CL0,CL2,CL6,CL0,CL5,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
1,CL5,CL2,CL2,CL0,CL6,CL4,CL6,CL3,CL0,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
2,CL6,CL0,CL0,CL0,CL6,CL3,CL4,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
3,CL4,CL0,CL0,CL3,CL5,CL2,CL4,CL2,CL0,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
4,CL4,CL1,CL1,CL0,CL6,CL3,CL6,CL0,CL0,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880,CL5,CL0,CL0,CL0,CL4,CL5,CL4,CL0,CL0,CL0,CL0,CL0,CL3,CL3,CL0,CL0,CL0,CL0,CL5
1881,CL5,CL0,CL0,CL0,CL5,CL3,CL4,CL0,CL0,CL2,CL0,CL0,CL3,CL5,CL4,CL4,CL5,CL0,CL0
1882,CL4,CL6,CL5,CL5,CL6,CL6,CL6,CL4,CL0,CL4,CL0,CL2,CL0,CL2,CL0,CL2,CL6,CL0,CL0
1883,CL5,CL0,CL0,CL0,CL6,CL6,CL5,CL0,CL0,CL3,CL0,CL0,CL3,CL3,CL0,CL3,CL4,CL0,CL0


The table below lists every variable in the dataset along with its role and type, and shows that none of the variables has missing values.

In [4]:
# Variable information (name, role, type, ...).
# Left as the cell's last expression so Jupyter renders the DataFrame as a
# single rich table instead of plain text that wraps the columns across lines.
drug_consumption.variables

         name     role         type      demographic description units  \
0          id       ID      Integer             None        None  None   
1         age  Feature   Continuous              Age        None  None   
2      gender  Feature   Continuous           Gender        None  None   
3   education  Feature   Continuous  Education Level        None  None   
4     country  Feature   Continuous      Nationality        None  None   
5   ethnicity  Feature   Continuous        Ethnicity        None  None   
6      nscore  Feature   Continuous             None        None  None   
7      escore  Feature   Continuous             None        None  None   
8      oscore  Feature   Continuous             None        None  None   
9      ascore  Feature   Continuous             None        None  None   
10     cscore  Feature   Continuous             None        None  None   
11  impuslive  Feature   Continuous             None        None  None   
12         ss  Feature   Continuous   

We also decided to rename the personality columns to the full name of 
the personality trait each one measures, to improve the readability 
of our graphs. We also corrected the misspelled column name `impuslive` to `impulsive`.

In [6]:
# Abbreviated column name -> descriptive name used in the rest of the notebook.
# 'impuslive' is the (misspelled) name the column arrives with.
readable_names = {
    'nscore': 'neuroticism',
    'escore': 'extraversion',
    'oscore': 'openness',
    'ascore': 'agreeableness',
    'cscore': 'conscientiousness',
    'impuslive': 'impulsive',
    'ss': 'sensation-seeking',
}

renamed_features = drug_consumption.data.features.rename(columns=readable_names)
drug_consumption.data.features = renamed_features

# Show the new column labels, then the renamed frame.
print(renamed_features.columns)
print(renamed_features)

Index(['age', 'gender', 'education', 'country', 'ethnicity', 'neuroticism',
       'extraversion', 'openness', 'agreeableness', 'conscientiousness',
       'impulsive', 'sensation-seeking'],
      dtype='object')
          age   gender  education  country  ethnicity  neuroticism  \
0     0.49788  0.48246   -0.05921  0.96082    0.12600      0.31287   
1    -0.07854 -0.48246    1.98437  0.96082   -0.31685     -0.67825   
2     0.49788 -0.48246   -0.05921  0.96082   -0.31685     -0.46725   
3    -0.95197  0.48246    1.16365  0.96082   -0.31685     -0.14882   
4     0.49788  0.48246    1.98437  0.96082   -0.31685      0.73545   
...       ...      ...        ...      ...        ...          ...   
1880 -0.95197  0.48246   -0.61113 -0.57009   -0.31685     -1.19430   
1881 -0.95197 -0.48246   -0.61113 -0.57009   -0.31685     -0.24649   
1882 -0.07854  0.48246    0.45468 -0.57009   -0.31685      1.13281   
1883 -0.95197  0.48246   -0.61113 -0.57009   -0.31685      0.91093   
1884 -0.95197 -0.

We decided to delete the rows of participants who reported using semer, because it is a fake drug that the authors made up to identify people who were not answering accurately.

In [7]:
print("Length before cleaning: ")
print(len(drug_consumption.data.targets))

# Keep only participants whose answer for the fictitious drug 'semer' is 'CL0';
# any other answer marks the response as unreliable.
targets_without_semer = drug_consumption.data.targets[
    drug_consumption.data.targets['semer'] == 'CL0'
]
drug_consumption.data.targets = targets_without_semer

# Drop the same participants from the features. Filtering only the targets
# would leave the two frames with different row counts, so they could no
# longer be paired row-for-row (e.g. as X and y for a model).
features_before = drug_consumption.data.features
drug_consumption.data.features = features_before[
    features_before.index.isin(targets_without_semer.index)
]
assert drug_consumption.data.features.index.equals(
    drug_consumption.data.targets.index
), "features and targets are no longer aligned"

print("Length after cleaning: ")
print(len(drug_consumption.data.targets))

Length before cleaning: 
1885
Length after cleaning: 
1877


In [None]:
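Next, we decided to convert each drug column to a binary variable: the class labels (CL0 to CL6) are first turned into the numbers 0 to 6, and then a response of CL3 or higher becomes 1 and anything lower becomes 0.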

In [11]:
def print_unique_values(targets):
    """Print the distinct values found in each column of ``targets``.

    Parameters
    ----------
    targets : pandas.DataFrame
        Frame whose columns are inspected one by one.
    """
    for column in targets.columns:
        unique_values = targets[column].unique()
        print(f"Unique values for {column}: {unique_values}")


# Print unique target values before conversion
print("Unique target values before conversion:")
print_unique_values(drug_consumption.data.targets)

# Convert the class labels 'CL0'..'CL6' into the integers 0..6 by dropping the
# 'CL' prefix. This expects the raw string labels: running the cell a second
# time fails on the `.str` accessor instead of silently re-thresholding the
# already converted values.
converted_targets = drug_consumption.data.targets.apply(
    lambda column: column.str[2:].astype(int)
)

drug_consumption.data.targets = converted_targets

# Print unique target values after numerical conversion
print("\nUnique target values after numerical conversion:")
print_unique_values(drug_consumption.data.targets)

print(drug_consumption.data.targets)

# Convert numerical values into binary variables:
# 1 when the class number is at or above the threshold, 0 otherwise.
threshold = 3  # Define the threshold for binary conversion
binary_targets = (converted_targets >= threshold).astype(int)

drug_consumption.data.targets = binary_targets

# Print unique target values after binary conversion
print("\nUnique target values after binary conversion:")
print_unique_values(drug_consumption.data.targets)

print(drug_consumption.data.targets)

Unique target values before conversion:
Unique values for alcohol: ['CL5' 'CL6' 'CL4' 'CL2' 'CL1' 'CL0' 'CL3']
Unique values for amphet: ['CL2' 'CL0' 'CL1' 'CL3' 'CL5' 'CL4' 'CL6']
Unique values for amyl: ['CL0' 'CL2' 'CL1' 'CL3' 'CL5' 'CL4' 'CL6']
Unique values for benzos: ['CL2' 'CL0' 'CL3' 'CL1' 'CL4' 'CL5' 'CL6']
Unique values for caff: ['CL6' 'CL5' 'CL4' 'CL3' 'CL0' 'CL1' 'CL2']
Unique values for cannabis: ['CL0' 'CL4' 'CL3' 'CL2' 'CL1' 'CL6' 'CL5']
Unique values for choc: ['CL5' 'CL6' 'CL4' 'CL0' 'CL3' 'CL2' 'CL1']
Unique values for coke: ['CL0' 'CL3' 'CL2' 'CL1' 'CL6' 'CL5' 'CL4']
Unique values for crack: ['CL0' 'CL1' 'CL2' 'CL3' 'CL5' 'CL4' 'CL6']
Unique values for ecstasy: ['CL0' 'CL4' 'CL1' 'CL3' 'CL2' 'CL6' 'CL5']
Unique values for heroin: ['CL0' 'CL1' 'CL2' 'CL3' 'CL5' 'CL6' 'CL4']
Unique values for ketamine: ['CL0' 'CL2' 'CL3' 'CL1' 'CL5' 'CL4' 'CL6']
Unique values for legalh: ['CL0' 'CL1' 'CL2' 'CL3' 'CL5' 'CL4' 'CL6']
Unique values for lsd: ['CL0' 'CL2' 'CL1' 'CL3' 'CL4'