# Processing data for EDA

In [483]:
import numpy as np
import pandas as pd

# Import visualisation libraries
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px


# Pretty display for notebooks
%matplotlib inline

# Show every column when a DataFrame is displayed (no column limit).
pd.options.display.max_columns = None

In [484]:
# Load the raw drug-consumption spreadsheet into the working frame.
df = pd.read_excel(io="../../data/raw/drug_consumption.xls")

### Convert drug values into integers

In [485]:
# The consumption columns start at position 13 and hold labels prefixed with
# 'CL'; strip that prefix and store the remainder as integers.
drugs = df.columns[13:]
for drug in drugs:
    # astype(str) lets this cell be re-run after the columns are already
    # integers (the .str accessor raises on numeric columns); regex=False
    # makes the literal-prefix match explicit across pandas versions.
    df[drug] = df[drug].astype(str).str.replace('CL', '', regex=False).astype('int')

### Change column values for better understanding

In [486]:
def gender(gend):
    """Translate the coded gender value into the string flag "0" or "1".

    Parameters
    ----------
    gend : float
        Coded value taken from the ``Gender`` column.

    Returns
    -------
    str or None
        ``"0"`` when ``gend`` equals 0.48246, ``None`` when ``gend`` is
        missing (``None``/NaN), and ``"1"`` for every other value.
    """
    # A missing value must stay missing instead of being counted as "1" by
    # the catch-all branch below.
    if pd.isna(gend):
        return None
    return "0" if gend == 0.48246 else "1"

# Replace each coded Gender value with its string flag, element by element.
df['Gender'] = df['Gender'].map(gender)

In [487]:
def age(a):
    """Translate a coded age value into its age-band label.

    Parameters
    ----------
    a : float
        Coded value taken from the ``Age`` column.

    Returns
    -------
    str or None
        The band label for the five listed codes, ``None`` when ``a`` is
        missing (``None``/NaN), and ``'65+'`` for every other value.
    """
    # A missing value is not a 65+ respondent: keep it missing instead of
    # letting it fall into the catch-all at the bottom.
    if pd.isna(a):
        return None
    age_bands = (
        (-0.95197, '18-24'),
        (-0.07854, '25-34'),
        (0.49788, '35-44'),
        (1.09449, '45-54'),
        (1.82213, '55-64'),
    )
    for code, label in age_bands:
        if a == code:
            return label
    # Catch-all: any remaining code is treated as the oldest band.
    return '65+'
    
# Replace each coded Age value with its age-band label, element by element.
df['Age'] = df['Age'].map(age)

In [488]:
def education(e):
    """Translate a coded education value into a readable level name.

    Parameters
    ----------
    e : float
        Coded value taken from the ``Education`` column.

    Returns
    -------
    str or None
        The level name for the eight listed codes, ``None`` when ``e`` is
        missing (``None``/NaN), and ``'doctorate_degree'`` for every other
        value.
    """
    # A missing value must stay missing instead of being reported as a
    # doctorate by the catch-all at the bottom.
    if pd.isna(e):
        return None
    education_levels = (
        (-2.43591, 'before_16'),
        (-1.73790, 'at_16'),
        (-1.43719, 'at_17'),
        (-1.22751, 'at_18'),
        (-0.61113, 'some_college'),
        (-0.05921, 'diploma'),
        (0.45468, 'university_degree'),
        (1.16365, 'masters_degree'),
    )
    for code, label in education_levels:
        if e == code:
            return label
    # Catch-all: any remaining code is treated as the highest level.
    return 'doctorate_degree'
    
# Replace each coded Education value with its level name, element by element.
df['Education'] = df['Education'].map(education)

In [489]:
def country(c):
    """Translate a coded country value into a country name.

    Parameters
    ----------
    c : float
        Coded value taken from the ``Country`` column.

    Returns
    -------
    str or None
        The matching country name, or ``None`` when ``c`` equals none of the
        listed codes.
    """
    country_names = (
        (-0.09765, 'australia'),
        (0.24923, 'canada'),
        (-0.46841, 'new_zealand'),
        (-0.28519, 'other'),
        (0.21128, 'republic_of_ireland'),
        (0.96082, 'uk'),
        (-0.57009, 'usa'),
    )
    for code, name in country_names:
        if c == code:
            return name
    # No code matched: signal "unknown" with None.
    return None

    
# Replace each coded Country value with its country name, element by element.
df['Country'] = df['Country'].map(country)

In [490]:
def ethnicity_transform(e):
    """Translate a coded ethnicity value into a readable label.

    Parameters
    ----------
    e : float
        Coded value taken from the ``Ethnicity`` column.

    Returns
    -------
    str or None
        The matching label, or ``None`` when ``e`` equals none of the listed
        codes.
    """
    ethnicity_labels = (
        (-0.50212, "asian"),
        (-1.10702, "black"),
        (1.90725, "black-asian"),
        (0.12600, "white-asian"),
        (-0.22166, "white-black"),
        (0.11440, "other"),
        (-0.31685, "white"),
    )
    for code, label in ethnicity_labels:
        if e == code:
            return label
    # No code matched: signal "unknown" with None.
    return None
    
# Replace each coded Ethnicity value with its label, element by element.
df['Ethnicity'] = df['Ethnicity'].map(ethnicity_transform)

### Identify over-claimers
Semer is a fictitious drug included to identify over-claimers. Only the rows with `Semer == 0` are kept.

In [491]:
# Keep only the rows whose Semer value is 0; the copy keeps later column
# assignments from targeting a slice of the unfiltered frame.
df = df.loc[df['Semer'] == 0].copy()

### Create bins for Drugs

In [492]:
# Consumption columns start at position 13; build the matching
# "<column>_bin" name for each of them.
drugs = df.columns[13:]
drugs_bin = list(map(lambda name: name + '_bin', drugs))
def users(u):
    """Collapse a consumption level into a binary flag.

    Parameters
    ----------
    u : int
        Consumption level taken from one of the drug columns.

    Returns
    -------
    int
        ``0`` when ``u`` equals 0 or 1, otherwise ``1``.
    """
    non_user_levels = (0, 1)
    return 0 if u in non_user_levels else 1
    

In [493]:
# Derive every "<column>_bin" column from its source column via users().
for source_col, bin_col in zip(drugs, drugs_bin):
    df[bin_col] = df[source_col].apply(users)

### Delete Columns
* The rows with over-claimers (only 8) have been deleted above, so the `Semer` columns are no longer needed
* The row `ID` is not necessary

In [494]:
# Remove the ID column from the working frame.
df = df.drop(columns='ID')

In [495]:
# Remove the Semer column and its binary companion in a single call.
df = df.drop(columns=['Semer', 'Semer_bin'])

### Save csv for EDA

In [500]:
# Select the first 48 columns as the EDA dataset and write it to disk.
# to_csv() returns None when given a file path, so the frame is assigned
# first; otherwise data_eda would always be None.
data_eda = df.iloc[:, :48]
data_eda.to_csv('../../data/processed/data_eda.csv', index=False)