# Data Cleaning

In [1]:
# imports and settings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

#### Step 1:
After importing all necessary libraries and packages, we imported the data frame following the directions from the uci databse website, which imports the dataset as a pandas dataframe.

In [2]:
# fetch dataset 
drug_consumption = fetch_ucirepo(id=373) 

#### Step 2: 
It created a data frame of features, which show the categorical data about the particpant. It also created a data frame of targets, which shows the data about the drug use for each participant. These data frames are displayed below. 

In [3]:
# data features
drug_consumption.data.features

Unnamed: 0,age,gender,education,country,ethnicity,nscore,escore,oscore,ascore,cscore,impuslive,ss
0,0.49788,0.48246,-0.05921,0.96082,0.12600,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,-0.21712,-1.18084
1,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,-0.14277,-0.71126,-0.21575
2,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.62090,-1.01450,-1.37983,0.40148
3,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,0.58489,-1.37983,-1.18084
4,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.63340,-0.45174,-0.30172,1.30612,-0.21712,-0.21575
...,...,...,...,...,...,...,...,...,...,...,...,...
1880,-0.95197,0.48246,-0.61113,-0.57009,-0.31685,-1.19430,1.74091,1.88511,0.76096,-1.13788,0.88113,1.92173
1881,-0.95197,-0.48246,-0.61113,-0.57009,-0.31685,-0.24649,1.74091,0.58331,0.76096,-1.51840,0.88113,0.76540
1882,-0.07854,0.48246,0.45468,-0.57009,-0.31685,1.13281,-1.37639,-1.27553,-1.77200,-1.38502,0.52975,-0.52593
1883,-0.95197,0.48246,-0.61113,-0.57009,-0.31685,0.91093,-1.92173,0.29338,-1.62090,-2.57309,1.29221,1.22470


In [4]:
#data targets
drug_consumption.data.targets 

Unnamed: 0,alcohol,amphet,amyl,benzos,caff,cannabis,choc,coke,crack,ecstasy,heroin,ketamine,legalh,lsd,meth,mushrooms,nicotine,semer,vsa
0,CL5,CL2,CL0,CL2,CL6,CL0,CL5,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
1,CL5,CL2,CL2,CL0,CL6,CL4,CL6,CL3,CL0,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
2,CL6,CL0,CL0,CL0,CL6,CL3,CL4,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
3,CL4,CL0,CL0,CL3,CL5,CL2,CL4,CL2,CL0,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
4,CL4,CL1,CL1,CL0,CL6,CL3,CL6,CL0,CL0,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880,CL5,CL0,CL0,CL0,CL4,CL5,CL4,CL0,CL0,CL0,CL0,CL0,CL3,CL3,CL0,CL0,CL0,CL0,CL5
1881,CL5,CL0,CL0,CL0,CL5,CL3,CL4,CL0,CL0,CL2,CL0,CL0,CL3,CL5,CL4,CL4,CL5,CL0,CL0
1882,CL4,CL6,CL5,CL5,CL6,CL6,CL6,CL4,CL0,CL4,CL0,CL2,CL0,CL2,CL0,CL2,CL6,CL0,CL0
1883,CL5,CL0,CL0,CL0,CL6,CL6,CL5,CL0,CL0,CL3,CL0,CL0,CL3,CL3,CL0,CL3,CL4,CL0,CL0


#### Step 3:
To see the variable information from the data frames, we loaded a data frame showing the variables for the data frames, which displays the type of each variable, along with the fact that there are no missing values for any of the categories.

In [5]:
# variable information 
print(drug_consumption.variables) 

         name     role         type      demographic description units  \
0          id       ID      Integer             None        None  None   
1         age  Feature   Continuous              Age        None  None   
2      gender  Feature   Continuous           Gender        None  None   
3   education  Feature   Continuous  Education Level        None  None   
4     country  Feature   Continuous      Nationality        None  None   
5   ethnicity  Feature   Continuous        Ethnicity        None  None   
6      nscore  Feature   Continuous             None        None  None   
7      escore  Feature   Continuous             None        None  None   
8      oscore  Feature   Continuous             None        None  None   
9      ascore  Feature   Continuous             None        None  None   
10     cscore  Feature   Continuous             None        None  None   
11  impuslive  Feature   Continuous             None        None  None   
12         ss  Feature   Continuous   

#### Step 4:
Since the personality column names were abbreviated, we decided to rename them to the full name of the personality aspect they were focusing on. This helps improve readability of our graphs. We also corrected a spelling error in impulsive.

In [6]:
drug_consumption.data.features = drug_consumption.data.features.rename(
    columns={'nscore': 'neuroticism', 'escore': 'extraversion',
           'oscore': 'openness', 'ascore': 'agreeableness',
            'cscore': 'conscientiousness', 'impuslive': 'impulsive',
             'ss': 'sensation-seeking'})
print(drug_consumption.data.features.columns)
print(drug_consumption.data.features)

Index(['age', 'gender', 'education', 'country', 'ethnicity', 'neuroticism',
       'extraversion', 'openness', 'agreeableness', 'conscientiousness',
       'impulsive', 'sensation-seeking'],
      dtype='object')
          age   gender  education  country  ethnicity  neuroticism  \
0     0.49788  0.48246   -0.05921  0.96082    0.12600      0.31287   
1    -0.07854 -0.48246    1.98437  0.96082   -0.31685     -0.67825   
2     0.49788 -0.48246   -0.05921  0.96082   -0.31685     -0.46725   
3    -0.95197  0.48246    1.16365  0.96082   -0.31685     -0.14882   
4     0.49788  0.48246    1.98437  0.96082   -0.31685      0.73545   
...       ...      ...        ...      ...        ...          ...   
1880 -0.95197  0.48246   -0.61113 -0.57009   -0.31685     -1.19430   
1881 -0.95197 -0.48246   -0.61113 -0.57009   -0.31685     -0.24649   
1882 -0.07854  0.48246    0.45468 -0.57009   -0.31685      1.13281   
1883 -0.95197  0.48246   -0.61113 -0.57009   -0.31685      0.91093   
1884 -0.95197 -0.

#### Step 5: 
Semer is a class of ficticious drug. We decided to delete the rows of the people that responded yes to semer, because it is a fake drug that the authors made up to check for people not being accurate in their responses. We believe this would not be useful in our analysis. We printed the amount of rows (length of the data frame) before and after removing these rows to see how this change affected the size.

In [7]:
print("Length before cleaning: ")
print(len(drug_consumption.data.targets))
drug_consumption.data.targets = drug_consumption.data.targets[
    drug_consumption.data.targets['semer'] == 'CL0'
]

print("Length after cleaning: ")
print(len(drug_consumption.data.targets))

Length before cleaning: 
1885
Length after cleaning: 
1877


#### Step 6: 
Next, we decided to convert the columns to a binary variable based on likelihood to do drugs. To do so, we chose a threshold to define the binary values. We decided that if the the individual did drugs in the last year, last month, last week, and last day, we would classify them as doing the given drug. This corresponds to classes 3-6 for each of the independent label variables given from the dataset. If the individual never used the drug, used it over a decade ago, or used in the last decade, we classified them as not doing the given drug. This corresponds to classes 0, 1, and 2 for each of the independent label variables given from the dataset. Using these parameters, we were able to turn the values into binary variables.

In [12]:
print("Unique target values before conversion:")
for column in drug_consumption.data.targets.columns:
    unique_values = drug_consumption.data.targets[column].unique()
    print(f"Unique values for {column}: {unique_values}")

# Convert responses into numerical values (1-5)
converted_targets = drug_consumption.data.targets.copy()
for column in drug_consumption.data.targets.columns:
    converted_targets[column] = converted_targets[column].apply(lambda x: int(x[2]))

drug_consumption.data.targets = converted_targets
#write numerical targets to csv to use to show some exploratory data analysis in phase 4
numerical_targets = pd.concat([drug_consumption.data.features, 
                           drug_consumption.data.targets], axis=1)
numerical_targets.to_csv('numerical_targets.csv', index=False)

# Print unique target values after numerical conversion
print("\nUnique target values after numerical conversion:")
for column in drug_consumption.data.targets.columns:
    unique_values = drug_consumption.data.targets[column].unique()
    print(f"Unique values for {column}: {unique_values}")

print(drug_consumption.data.targets)

# Convert numerical values into binary variables
binary_targets = converted_targets.copy()
threshold = 3  # Define the threshold for binary conversion
for column in binary_targets.columns:
    binary_targets[column] = binary_targets[column].apply(lambda x: 1 if x >= threshold else 0)

drug_consumption.data.targets = binary_targets

# Print unique target values after binary conversion
print("\nUnique target values after binary conversion:")
for column in drug_consumption.data.targets.columns:
    unique_values = drug_consumption.data.targets[column].unique()
    print(f"Unique values for {column}: {unique_values}")

print(drug_consumption.data.targets)

Unique target values before conversion:
Unique values for alcohol: [1 0]
Unique values for amphet: [0 1]
Unique values for amyl: [0 1]
Unique values for benzos: [0 1]
Unique values for caff: [1 0]
Unique values for cannabis: [0 1]
Unique values for choc: [1 0]
Unique values for coke: [0 1]
Unique values for crack: [0 1]
Unique values for ecstasy: [0 1]
Unique values for heroin: [0 1]
Unique values for ketamine: [0 1]
Unique values for legalh: [0 1]
Unique values for lsd: [0 1]
Unique values for meth: [0 1]
Unique values for mushrooms: [0 1]
Unique values for nicotine: [0 1]
Unique values for semer: [0]
Unique values for vsa: [0 1]
Unique values for stimulant: [0 1]
Unique values for extreme: [0 1]


TypeError: 'int' object is not subscriptable

#### Step 7:
Next, we decided to add columns for usage of stimulants and extreme drugs. These are the two categories of drugs we decided to focus on for our hypotheses. 

After our initial data exploration, we chose to focus on stimulants because this category seemed to have the most effect on certain personality traits, such as sensation-seeking. To chose which drugs are stimulants, we used this resource: *https://www.cpp.edu/health/health-topics/stimulants.shtml*. The list of stimulant drugs is: nicotine, cocaine, methamphetamine, amphetimine, and amyl nitrate. Although data for caffeine, a stimulant, is also given, we decided to exclude it from our grouping because of its widespread, socially normalized use and mild effects, which could skew results. 

To categorize our extreme drugs The list of extreme drugs:coke, crack, ecstasy, heroin, and ketamine. We included these drugs after researching and finding a list of the most dangerous drugs on *https://delamere.com/blog/top-10-most-dangerous-drugs*. 

In [9]:
# List of stimulant drug columns
stimulant_drugs = ['nicotine', 'coke', 'meth', 'amphet', 'amyl']

# Add a "stimulant" column: 1 if any of the stimulant drug columns have a 1, else 0
drug_consumption.data.targets['stimulant'] = drug_consumption.data.targets[stimulant_drugs].any(axis=1).astype(int)

# Print to verify the new column
print("\nUnique values in 'stimulant' column:")
print(drug_consumption.data.targets['stimulant'].unique())

# List of extreme drug columns
extreme_drugs = ['coke', 'crack', 'ecstasy', 'heroin', 'ketamine']

# Add a "extreme" column: 1 if any of the extreme drug columns have a 1, else 0
drug_consumption.data.targets['extreme'] = drug_consumption.data.targets[extreme_drugs].any(axis=1).astype(int)

# Print to verify the new column
print("\nUnique values in 'extreme' column:")
print(drug_consumption.data.targets['extreme'].unique())

print(drug_consumption.data.targets)


Unique values in 'stimulant' column:
[0 1]

Unique values in 'extreme' column:
[0 1]
      alcohol  amphet  amyl  benzos  caff  cannabis  choc  coke  crack  \
0           1       0     0       0     1         0     1     0      0   
1           1       0     0       0     1         1     1     1      0   
2           1       0     0       0     1         1     1     0      0   
3           1       0     0       1     1         0     1     0      0   
4           1       0     0       0     1         1     1     0      0   
...       ...     ...   ...     ...   ...       ...   ...   ...    ...   
1880        1       0     0       0     1         1     1     0      0   
1881        1       0     0       0     1         1     1     0      0   
1882        1       1     1       1     1         1     1     1      0   
1883        1       0     0       0     1         1     1     0      0   
1884        1       1     0       1     1         1     1     1      0   

      ecstasy  ...  ketam

#### Step 8:
We saved the cleaned data into a csv that we could use for the rest of our Phase 4.

In [10]:
drug_consumption.data.targets.to_csv('cleaned_targets.csv', index=False)
print("Cleaned data has been saved as 'cleaned_targets.csv'.")
drug_consumption.data.features.to_csv('cleaned_features.csv', index=False)
print("Cleaned data has been saved as 'cleaned_features.csv'.")

Cleaned data has been saved as 'cleaned_targets.csv'.
Cleaned data has been saved as 'cleaned_features.csv'.
